linux2.4.0版本内核代码fork.c浅显分析

来源：cnblogs　　作者：wxy1567　　时间：2021/5/31 9:02:18　　对本文有异议

结合fork.c文件分析进程创建的过程

本文为作业任务，只做浅显的分析，为大家提供一个分析的思路，很多细节都没有展示。如果想要更详细的分析请去搜索相关函数代码，博客园内有许多有用的信息供大家学习。

/* Global task-accounting state for fork.c. */
int nr_threads;     /* incremented by do_fork() each time a new task is linked in */
int nr_running;     /* not touched in this file; presumably the runnable-task count - TODO confirm */
int max_threads;    /* limit checked against nr_threads in do_fork(); computed by fork_init() */
unsigned long total_forks;    /* Handle normal Linux uptimes. */
int last_pid;       /* last pid handed out by get_pid(); updated under lastpid_lock */
struct task_struct *pidhash[PIDHASH_SZ];    /* task table of PIDHASH_SZ slots; presumably filled by hash_pid() - TODO confirm */

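文件开头定义了当前线程（任务）总数nr_threads、处于可运行状态的进程数nr_running、允许的最大线程数max_threads、系统启动以来创建的进程总数total_forks、最近一次分配的pid号last_pid，以及按pid散列、用于快速查找task_struct的哈希表pidhash。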

/*
 * add_wait_queue - put @wait on wait queue @q as a non-exclusive entry.
 * @q:    head of the wait queue
 * @wait: entry to link in
 *
 * The entry's flags are cleared and it is linked with __add_wait_queue()
 * while the queue's lock is write-held through the irqsave/irqrestore
 * lock variants.
 */
void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
    unsigned long irq_state;

    wq_write_lock_irqsave(&q->lock, irq_state);

    wait->flags = 0;                /* no WQ_FLAG_EXCLUSIVE */
    __add_wait_queue(q, wait);

    wq_write_unlock_irqrestore(&q->lock, irq_state);
}
/*
 * add_wait_queue_exclusive - put @wait on wait queue @q as an exclusive entry.
 * @q:    head of the wait queue
 * @wait: entry to link in
 *
 * The entry is flagged WQ_FLAG_EXCLUSIVE and linked with
 * __add_wait_queue_tail() while the queue's lock is write-held through
 * the irqsave/irqrestore lock variants.
 */
void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
{
    unsigned long irq_state;

    wq_write_lock_irqsave(&q->lock, irq_state);

    wait->flags = WQ_FLAG_EXCLUSIVE;
    __add_wait_queue_tail(q, wait);

    wq_write_unlock_irqrestore(&q->lock, irq_state);
}
/*
 * remove_wait_queue - unlink @wait from wait queue @q.
 * @q:    head of the wait queue
 * @wait: entry to take off the queue
 *
 * __remove_wait_queue() is called with the queue's lock write-held
 * through the irqsave/irqrestore lock variants.
 */
void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
    unsigned long irq_state;

    wq_write_lock_irqsave(&q->lock, irq_state);

    __remove_wait_queue(q, wait);

    wq_write_unlock_irqrestore(&q->lock, irq_state);
}

这部分代码与进程的等待队列有关。Linux内核的等待队列以双向循环链表为基础数据结构，与进程调度机制紧密结合，能够用于实现核心的异步事件通知机制。等待队列定义在include/linux/wait.h中，这是一个通过list_head连接的典型双向循环链表，在这个链表中，有两种数据结构：等待队列头（wait_queue_head_t）和等待队列项（wait_queue_t）。等待队列头和等待队列项中都包含一个list_head类型的域作为"连接件"。由于我们只需要对队列进行添加和删除操作，并不会修改其中的对象（等待队列项），因此，我们只需要提供一把保护整个基础设施和所有对象的锁，这把锁保存在等待队列头中，为wq_lock_t类型。在实现中，可以支持读写锁（rwlock）或自旋锁（spinlock）两种类型，通过一个宏定义来切换。如果使用读写锁，将wq_lock_t定义为rwlock_t类型；如果是自旋锁，将wq_lock_t定义为spinlock_t类型。无论哪种情况，都会相应地设置wq_read_lock、wq_read_unlock、wq_read_lock_irqsave、wq_read_unlock_irqrestore、wq_write_lock_irq、wq_write_unlock、wq_write_lock_irqsave和wq_write_unlock_irqrestore等宏。上面三个函数的结构相同：先用wq_write_lock_irqsave加锁，再操作链表，最后用wq_write_unlock_irqrestore解锁。add_wait_queue将wait->flags清零，并调用__add_wait_queue把等待队列项加入队列；add_wait_queue_exclusive则把flags设置为WQ_FLAG_EXCLUSIVE，并调用__add_wait_queue_tail把等待队列项加到队列尾部。在__wait_queue中定义的WQ_FLAG_EXCLUSIVE表示节点对应的进程对临界资源具有排他性。remove_wait_queue函数用于将等待队列项wait从以q为等待队列头的等待队列中移除。

/*
 * fork_init - derive the system-wide thread limit from the page count.
 * @mempages: number of memory pages, as supplied by the caller
 *
 * A thread structure occupies THREAD_SIZE/PAGE_SIZE pages. The default
 * maximum number of threads is set to a safe value: the thread structures
 * can take up at most half of memory. init_task's RLIMIT_NPROC soft and
 * hard limits are both set to half of that maximum.
 */
void __init fork_init(unsigned long mempages)
{
    unsigned long pages_per_thread = THREAD_SIZE / PAGE_SIZE;

    max_threads = mempages / pages_per_thread / 2;

    init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
    init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
}

如注释所说，默认的最大线程数被设置为一个安全值：线程结构最多可以占用一半的内存。同时，init_task的RLIMIT_NPROC软、硬限制都被设置为max_threads的一半。__init定义在include/linux/init.h中，作用是将带有__init标识符的函数划分到.init.text段中，此段中的代码只在启动初始化时执行一次，初始化完成后所占内存可以被释放。

/*
 * Protects next_safe and last_pid: get_pid() takes this spinlock around
 * its updates of both variables.
 */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
 
/*
 * get_pid - choose the pid for a task that is being created.
 * @flags: clone flags; with CLONE_PID set the caller's own pid is returned
 *         and nothing is allocated.
 *
 * Otherwise last_pid is advanced by one under lastpid_lock. While the new
 * value stays below next_safe it is used without looking at the task list.
 * When it reaches next_safe, or wraps, every task is scanned with
 * tasklist_lock read-held: a candidate equal to any task's pid, pgrp or
 * session is bumped and the scan restarts, and next_safe is recomputed as
 * the smallest pid/pgrp/session value found above the candidate.
 *
 * Returns the chosen pid.
 */
static int get_pid(unsigned long flags)
{
    static int next_safe = PID_MAX;    /* candidates below this need no scan */
    struct task_struct *p;

    if (flags & CLONE_PID)
        return current->pid;

    spin_lock(&lastpid_lock);
    /* Any bit of 0xffff8000 set means the candidate went past 0x7fff: wrap. */
    if((++last_pid) & 0xffff8000) {
        last_pid = 300;      /* Skip daemons etc. */

        goto inside;    /* a wrapped candidate must always be verified */
    }
    if(last_pid >= next_safe) {
inside:
        next_safe = PID_MAX;
        read_lock(&tasklist_lock);
    repeat:
        for_each_task(p) {
            /* Candidate already in use as a pid, process group or session id. */
            if(p->pid == last_pid ||
               p->pgrp == last_pid ||
               p->session == last_pid) {
                if(++last_pid >= next_safe) {
                    if(last_pid & 0xffff8000)
                        last_pid = 300;
                    next_safe = PID_MAX;
                }
                /* Re-check the new candidate against every task from the start. */
                goto repeat;
            }
            /* Track the lowest id in use that lies above the candidate. */
            if(p->pid > last_pid && next_safe > p->pid)
                next_safe = p->pid;
            if(p->pgrp > last_pid && next_safe > p->pgrp)
                next_safe = p->pgrp;
            if(p->session > last_pid && next_safe > p->session)
                next_safe = p->session;
        }
        read_unlock(&tasklist_lock);
    }
    spin_unlock(&lastpid_lock);

    /* NOTE(review): last_pid is read here after lastpid_lock was released. */
    return last_pid;
}

这部分代码用来给进程分配pid。如果clone标志中带有CLONE_PID，则直接返回当前进程的pid。否则先获取自旋锁lastpid_lock来保护last_pid和next_safe这两个变量，在需要遍历进程链表时再对tasklist_lock加读锁，确保遍历期间进程链表不被修改。last_pid用于记录上一次分配给进程的pid值。分配的pid一般而言是last_pid+1，如果自增后的值超出了pid的上限（条件last_pid & 0xffff8000为真，即大于0x7fff=32767），那么pid值从300开始重新查找未用的。也就是说，回绕之后分配的pid值范围是[300, 32767]（300以下的值留给系统守护进程等，回绕时跳过）。变量next_safe的含义是，在[last_pid, next_safe)之间的值都没有被用作pid、进程组号（pgrp）或会话号（session）；一旦last_pid+1达到或超过了next_safe，也就是说pid值进入了不可靠空间，有可能这个值已被使用，这时需要遍历task来确认。这样遍历task找到一个没有用过的pid，同时重新确定next_safe，以保证从last_pid到next_safe的区间中pid是空闲的，这样只要再次分配pid时，其值小于next_safe就可以直接分配，而不需要遍历task来查找空闲的pid。

/*
 * dup_mmap - populate @mm with copies of the current process's memory areas.
 * @mm: the new mm_struct being filled in
 *
 * Resets @mm's mapping bookkeeping, then walks current->mm->mmap. For each
 * area not marked VM_DONTCOPY it allocates a vm_area_struct, copies the
 * parent's area into it, clears VM_LOCKED, takes a reference on the backing
 * file (if any), links the copy into that file's share list right after the
 * parent's area, and copies the page range.
 *
 * copy_mm() calls this with the parent's mmap_sem held.
 *
 * Returns 0 on success, -ENOMEM if a vm_area_struct cannot be allocated, or
 * the non-zero result of copy_page_range(). On failure the areas copied so
 * far stay linked into @mm so the caller can tear them down.
 */
static inline int dup_mmap(struct mm_struct * mm)
{
    struct vm_area_struct * mpnt, *tmp, **pprev;
    int retval;

    flush_cache_mm(current->mm);
    /* Start the new mm with an empty area list and zeroed bookkeeping. */
    mm->locked_vm = 0;
    mm->mmap = NULL;
    mm->mmap_avl = NULL;
    mm->mmap_cache = NULL;
    mm->map_count = 0;
    mm->cpu_vm_mask = 0;
    mm->swap_cnt = 0;
    mm->swap_address = 0;
    /* pprev always points at the link that will receive the next copied area. */
    pprev = &mm->mmap;
    for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
        struct file *file;

        retval = -ENOMEM;
        /* Areas flagged VM_DONTCOPY are not duplicated. */
        if(mpnt->vm_flags & VM_DONTCOPY)
            continue;
        tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!tmp)
            goto fail_nomem;
        /* Structure copy of the parent's area, then fix up the per-mm fields. */
        *tmp = *mpnt;
        tmp->vm_flags &= ~VM_LOCKED;
        tmp->vm_mm = mm;
        mm->map_count++;
        tmp->vm_next = NULL;
        file = tmp->vm_file;
        if (file) {
            struct inode *inode = file->f_dentry->d_inode;
            /* The copied area holds its own reference on the file. */
            get_file(file);
            /* One more VM_DENYWRITE mapping of this inode: drop i_writecount by one. */
            if (tmp->vm_flags & VM_DENYWRITE)
                atomic_dec(&inode->i_writecount);

            /* insert tmp into the share list, just after mpnt */
            spin_lock(&inode->i_mapping->i_shared_lock);
            if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                mpnt->vm_next_share->vm_pprev_share =

                    &tmp->vm_next_share;
            mpnt->vm_next_share = tmp;
            tmp->vm_pprev_share = &mpnt->vm_next_share;
            spin_unlock(&inode->i_mapping->i_shared_lock);
        }

        /* Copy the pages, but defer checking for errors */
        retval = copy_page_range(mm, current->mm, tmp);
        /* The area's open() hook runs only if the page copy succeeded. */
        if (!retval && tmp->vm_ops && tmp->vm_ops->open)
            tmp->vm_ops->open(tmp);

        /*
         * Link in the new vma even if an error occurred,
         * so that exit_mmap() can clean up the mess.
         */

        *pprev = tmp;
        pprev = &tmp->vm_next;

        if (retval)
            goto fail_nomem;
    }
    retval = 0;
    /* Build the AVL index once the area count reaches AVL_MIN_MAP_COUNT. */
    if (mm->map_count >= AVL_MIN_MAP_COUNT)
        build_mmap_avl(mm);

    /* Reached on success as well as on failure. */
fail_nomem:
    flush_tlb_mm(current->mm);
    return retval;
}
 
/* Spinlock held around updates of the mm->mmlist list (see copy_mm() and mmput()). */
spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;

/*
 * Allocate an mm_struct from the mm_cachep slab cache. The contents are set
 * up by the callers: mm_alloc() zero-fills it, copy_mm() copies the parent's.
 */
#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))

/* Hand an mm_struct back to the mm_cachep slab cache. */
#define free_mm(mm)  (kmem_cache_free(mm_cachep, (mm)))
 
 
 
/*
 * mm_init - set up the reference counts, locks and page directory of @mm.
 * @mm: mm_struct obtained from allocate_mm()
 *
 * Both mm_users and mm_count start at 1. Returns @mm on success. If no page
 * directory can be allocated, @mm is freed and NULL is returned, so the
 * caller must not touch @mm afterwards.
 */
static struct mm_struct * mm_init(struct mm_struct * mm)
{
    atomic_set(&mm->mm_users, 1);
    atomic_set(&mm->mm_count, 1);
    init_MUTEX(&mm->mmap_sem);
    mm->page_table_lock = SPIN_LOCK_UNLOCKED;

    mm->pgd = pgd_alloc();
    if (!mm->pgd) {
        /* No page directory: give the mm_struct back. */
        free_mm(mm);
        return NULL;
    }

    return mm;
}
   
 
/*
 * Allocate and initialize an mm_struct.
 *
 * The new structure is zero-filled before mm_init() runs on it. Returns the
 * initialised mm_struct, or NULL if the allocation or mm_init() fails.
 */

struct mm_struct * mm_alloc(void)
{
    struct mm_struct *fresh = allocate_mm();

    if (!fresh)
        return NULL;

    memset(fresh, 0, sizeof(*fresh));
    return mm_init(fresh);
}
 
/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 *
 * Dropping init_mm is treated as a kernel bug.
 */
inline void __mmdrop(struct mm_struct *mm)
{
    /* init_mm must never get here */
    if (mm == &init_mm)
        BUG();

    pgd_free(mm->pgd);      /* page directory first ... */
    destroy_context(mm);    /* ... then the context ... */
    free_mm(mm);            /* ... and finally the mm_struct itself */
}
 
/*
 * Decrement the use count and release all resources for an mm.
 *
 * Only the caller that drops mm_users to zero does the teardown: the mm is
 * taken off the mmlist, its mappings are released with exit_mmap(), and
 * the reference on the mm_struct is dropped with mmdrop().
 */

void mmput(struct mm_struct *mm)
{
    /* Not the last user: nothing to release, and mmlist_lock is not held. */
    if (!atomic_dec_and_lock(&mm->mm_users, &mmlist_lock))
        return;

    /* Last user: we hold mmlist_lock here. */
    list_del(&mm->mmlist);
    spin_unlock(&mmlist_lock);

    exit_mmap(mm);
    mmdrop(mm);
}
 
/*
 * mm_release - wake a parent that is blocked in vfork().
 *
 * If the current task carries PF_VFORK, the flag is cleared and the
 * semaphore published by its original parent (p_opptr->vfork_sem) is upped.
 * Tasks without PF_VFORK are left untouched.
 */
void mm_release(void)
{
    struct task_struct *me = current;

    if (!(me->flags & PF_VFORK))
        return;

    /* notify parent sleeping on vfork() */
    me->flags &= ~PF_VFORK;
    up(me->p_opptr->vfork_sem);
}

这部分代码为内存管理部分，代码中的注释向我们大致说明了本段代码的功能。

Linux内核通过一个被称为进程描述符的task_struct结构体来管理进程，这个结构体包含了一个进程所需的所有信息。它定义在include/linux/sched.h文件中。每一个进程都会有自己独立的mm_struct，这样每一个进程都会有自己独立的地址空间，这样才能互不干扰。在地址空间中，mmap为地址空间的内存区域（用vm_area_struct结构来表示）链表，表示起来更加方便。mm_struct的结构描述了进程的用户空间的结构，定义了用户空间的段分布：数据段，代码段，堆栈段。其中pgd是该进程用户空间地址映射到物理地址时使用的页目录指针（类型为pgd_t *）。vm_area_struct表示进程用户空间中已映射的一段虚拟地址区间，定义在include/linux/mm.h中。mmap是由这些区间组成的链表。vm_flags是描述该虚拟区间属性及允许操作的标志。

/*
 * copy_mm - give the new task @tsk an address space.
 * @clone_flags: clone flags; CLONE_VM makes @tsk share the caller's mm
 * @tsk:         the task being created
 *
 * Resets @tsk's fault and swap counters. If the caller has no mm (kernel
 * thread), @tsk is left without one. With CLONE_VM the caller's mm is
 * shared and its mm_users count is raised. Otherwise a new mm_struct is
 * allocated, seeded from the caller's, initialised with mm_init(), filled
 * by dup_mmap() under the caller's mmap_sem, and put on the mmlist right
 * after the caller's mm.
 *
 * Returns 0 on success (tsk->mm and tsk->active_mm are then set, or left
 * NULL for a kernel thread), -ENOMEM on allocation or context-setup
 * failure, or the error reported by dup_mmap(). On failure tsk->mm stays
 * NULL and any partially built mm has been released.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
    struct mm_struct * mm, *oldmm;
    int retval;

    tsk->min_flt = tsk->maj_flt = 0;
    tsk->cmin_flt = tsk->cmaj_flt = 0;
    tsk->nswap = tsk->cnswap = 0;

    tsk->mm = NULL;
    tsk->active_mm = NULL;

    /*
     * Are we cloning a kernel thread?
     *
     * We need to steal a active VM for that..
     */
    oldmm = current->mm;
    if (!oldmm)
        return 0;

    if (clone_flags & CLONE_VM) {
        /* Share the caller's address space: just take another user reference. */
        atomic_inc(&oldmm->mm_users);
        mm = oldmm;
        goto good_mm;
    }

    retval = -ENOMEM;
    mm = allocate_mm();
    if (!mm)
        goto fail_nomem;

    /* Copy the current MM stuff.. */
    memcpy(mm, oldmm, sizeof(*mm));
    /* mm_init() frees mm itself when it fails, so there is nothing to undo. */
    if (!mm_init(mm))
        goto fail_nomem;

    down(&oldmm->mmap_sem);
    retval = dup_mmap(mm);
    up(&oldmm->mmap_sem);

    /*
     * Add it to the mmlist after the parent.
     *
     * Doing it this way means that we can order
     * the list, and fork() won't mess up the
     * ordering significantly.
     */
    spin_lock(&mmlist_lock);
    list_add(&mm->mmlist, &oldmm->mmlist);
    spin_unlock(&mmlist_lock);

    /* dup_mmap()'s result is checked only now, after mm is on the mmlist
     * (mmput() below takes it off again). */
    if (retval)
        goto free_pt;

    /*
     * child gets a private LDT (if there was an LDT in the parent)
     */
    copy_segments(tsk, mm);

    if (init_new_context(tsk,mm)) {
        /*
         * retval is still 0 from the successful dup_mmap(); without an
         * explicit error code this path would release the mm and then
         * report success with tsk->mm left NULL.
         */
        retval = -ENOMEM;
        goto free_pt;
    }

good_mm:
    tsk->mm = mm;
    tsk->active_mm = mm;
    return 0;

free_pt:
    mmput(mm);
fail_nomem:
    return retval;
}
 
/*
 * __copy_fs_struct - allocate a private duplicate of @old.
 * @old: fs_struct to duplicate
 *
 * The duplicate starts with a reference count of 1 and an unlocked rwlock.
 * The umask is copied, and references are taken on the root, pwd and (when
 * present) altroot dentries and their mounts while @old's lock is
 * read-held.
 *
 * Returns the new fs_struct, or NULL if the allocation fails.
 */
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
    struct fs_struct *dup = kmem_cache_alloc(fs_cachep, GFP_KERNEL);

    if (!dup)
        return dup;

    /* We don't need to lock the new fs_struct - think why ;-) */
    atomic_set(&dup->count, 1);
    dup->lock = RW_LOCK_UNLOCKED;
    dup->umask = old->umask;

    read_lock(&old->lock);
    dup->rootmnt = mntget(old->rootmnt);
    dup->root = dget(old->root);
    dup->pwdmnt = mntget(old->pwdmnt);
    dup->pwd = dget(old->pwd);
    if (!old->altroot) {
        dup->altrootmnt = NULL;
        dup->altroot = NULL;
    } else {
        dup->altrootmnt = mntget(old->altrootmnt);
        dup->altroot = dget(old->altroot);
    }
    read_unlock(&old->lock);

    return dup;
}
 
/*
 * copy_fs_struct - non-inline entry point for duplicating an fs_struct.
 * @old: fs_struct to duplicate
 *
 * Returns whatever __copy_fs_struct() returns: the duplicate, or NULL if
 * the allocation fails.
 */
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
    struct fs_struct *dup = __copy_fs_struct(old);

    return dup;
}
 
/*
 * copy_fs - share or duplicate the caller's fs_struct for @tsk.
 * @clone_flags: clone flags; CLONE_FS requests sharing
 * @tsk:         the task being created
 *
 * With CLONE_FS only the reference count of the caller's fs_struct is
 * raised and tsk->fs is not written. Otherwise tsk->fs is set to a private
 * duplicate made by __copy_fs_struct().
 *
 * Returns 0 on success, -1 if the duplicate cannot be allocated.
 */
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
    if (!(clone_flags & CLONE_FS)) {
        tsk->fs = __copy_fs_struct(current->fs);
        return tsk->fs ? 0 : -1;
    }

    /* Sharing: one more user of the caller's fs_struct. */
    atomic_inc(&current->fs->count);
    return 0;
}
 
/*
 * count_open_files - size of the fd range that has to be copied.
 * @files: files_struct whose open_fds bitmap is examined
 * @size:  number of bits of the bitmap to consider
 *
 * Scans the bitmap one long word at a time, from the top down, for the
 * last word with any bit set.
 *
 * Returns the number of fds covered up to and including that word, always
 * a multiple of the bits in a long and at least one word's worth.
 */
static int count_open_files(struct files_struct *files, int size)
{
    int word = size/(8*sizeof(long));

    /* Find the last open fd */
    while (word > 0) {
        --word;
        if (files->open_fds->fds_bits[word])
            break;
    }

    return (word+1) * 8 * sizeof(long);
}
 
/*
 * copy_files - share or duplicate the caller's open-file table for @tsk.
 * @clone_flags: clone flags; CLONE_FILES requests sharing
 * @tsk:         the task being created
 *
 * If the caller has no files_struct nothing is done. With CLONE_FILES the
 * caller's files_struct gets one more reference and tsk->files is not
 * written. Otherwise a new files_struct is allocated, its fd sets and fd
 * array are grown when the embedded defaults are too small, and the open
 * and close-on-exec bitmaps plus the file pointers are copied (taking a
 * reference on each open file) with the caller's file_lock read-held.
 *
 * Returns 0 on success, -ENOMEM if the files_struct cannot be allocated,
 * or the error from expand_fdset() / expand_fd_array(). On those failures
 * tsk->files is left NULL.
 */
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
    struct files_struct *oldf, *newf;
    struct file **old_fds, **new_fds;
    int open_files, nfds, size, i, error = 0;

    /*
     * A background process may not have any files ...
     */
    oldf = current->files;
    if (!oldf)
        goto out;

    if (clone_flags & CLONE_FILES) {
        /* Share the table: just take another reference. */
        atomic_inc(&oldf->count);
        goto out;
    }

    tsk->files = NULL;
    error = -ENOMEM;
    newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
    if (!newf)
        goto out;

    atomic_set(&newf->count, 1);

    /* Start out on the fd sets and fd array embedded in the files_struct. */
    newf->file_lock      = RW_LOCK_UNLOCKED;
    newf->next_fd     = 0;
    newf->max_fds     = NR_OPEN_DEFAULT;
    newf->max_fdset      = __FD_SETSIZE;
    newf->close_on_exec = &newf->close_on_exec_init;
    newf->open_fds       = &newf->open_fds_init;
    newf->fd     = &newf->fd_array[0];

    /* We don't yet have the oldf readlock, but even if the old
           fdset gets grown now, we'll only copy up to "size" fds */
    size = oldf->max_fdset;
    if (size > __FD_SETSIZE) {
        /* The caller's fd sets are larger than the embedded ones: grow ours. */
        newf->max_fdset = 0;
        write_lock(&newf->file_lock);
        error = expand_fdset(newf, size);
        write_unlock(&newf->file_lock);
        if (error)
            goto out_release;
    }
    read_lock(&oldf->file_lock);

    open_files = count_open_files(oldf, size);

    /*
     * Check whether we need to allocate a larger fd array.
     * Note: we're not a clone task, so the open count won't
     * change.
     */
    nfds = NR_OPEN_DEFAULT;
    if (open_files > nfds) {
        /* oldf's lock is dropped around the expansion and re-taken afterwards. */
        read_unlock(&oldf->file_lock);
        newf->max_fds = 0;
        write_lock(&newf->file_lock);
        error = expand_fd_array(newf, open_files);
        write_unlock(&newf->file_lock);
        if (error)
            goto out_release;
        nfds = newf->max_fds;
        read_lock(&oldf->file_lock);
    }

    old_fds = oldf->fd;
    new_fds = newf->fd;

    /* open_files counts bits; the bitmaps are copied in bytes, hence the /8. */
    memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
    memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

    /* Copy the file pointers, taking a reference on every open file. */
    for (i = open_files; i != 0; i--) {
        struct file *f = *old_fds++;
        if (f)
            get_file(f);
        *new_fds++ = f;
    }
    read_unlock(&oldf->file_lock);

    /* compute the remainder to be cleared */
    size = (newf->max_fds - open_files) * sizeof(struct file *);

    /* This is long word aligned thus could use a optimized version */
    memset(new_fds, 0, size);

    /* Clear the part of both bitmaps that lies beyond the copied range. */
    if (newf->max_fdset > open_files) {
        int left = (newf->max_fdset-open_files)/8;
        int start = open_files / (8 * sizeof(unsigned long));

        memset(&newf->open_fds->fds_bits[start], 0, left);
        memset(&newf->close_on_exec->fds_bits[start], 0, left);
    }

    tsk->files = newf;
    error = 0;
out:
    return error;

    /* Failure after newf was allocated: free the fd sets and newf itself. */
out_release:
    free_fdset (newf->close_on_exec, newf->max_fdset);
    free_fdset (newf->open_fds, newf->max_fdset);
    kmem_cache_free(files_cachep, newf);
    goto out;
}
 
/*
 * copy_sighand - share or duplicate the caller's signal handler table.
 * @clone_flags: clone flags; CLONE_SIGHAND requests sharing
 * @tsk:         the task being created
 *
 * With CLONE_SIGHAND only the reference count of the caller's
 * signal_struct is raised. Otherwise a new signal_struct is allocated for
 * @tsk with a count of 1 and a copy of the caller's action array.
 *
 * Returns 0 on success, -1 if the allocation fails (tsk->sig is then NULL).
 */
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
    struct signal_struct *fresh;

    if (clone_flags & CLONE_SIGHAND) {
        atomic_inc(&current->sig->count);
        return 0;
    }

    fresh = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
    tsk->sig = fresh;       /* stored even when the allocation failed */
    if (!fresh)
        return -1;

    spin_lock_init(&fresh->siglock);
    atomic_set(&fresh->count, 1);
    memcpy(fresh->action, current->sig->action, sizeof(fresh->action));
    return 0;
}
 
/*
 * copy_flags - compute the per-process flags of the new task @p.
 * @clone_flags: clone flags given to do_fork()
 * @p:           the task being created (still holding the parent's flags)
 *
 * PF_SUPERPRIV, PF_USEDFPU and PF_VFORK are cleared and PF_FORKNOEXEC is
 * set. PF_VFORK is set again when CLONE_VFORK was requested. Unless
 * CLONE_PTRACE is given, the ptrace state is cleared.
 */
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
    unsigned long flags = p->flags & ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);

    flags |= PF_FORKNOEXEC;
    if (clone_flags & CLONE_VFORK)
        flags |= PF_VFORK;

    if (!(clone_flags & CLONE_PTRACE))
        p->ptrace = 0;

    p->flags = flags;
}

父进程调用fork()派生新进程，实际上相当于创建了自身的一个拷贝；复制出来的子进程有自己的task_struct结构和系统空间堆栈，其余资源是与父进程共享还是另外复制一份，则由clone_flags中的CLONE_VM、CLONE_FS、CLONE_FILES、CLONE_SIGHAND等标志决定。Linux为此提供了两个系统调用，一个是fork()，另一个是clone()。我们现在主要讨论fork()。fork()是全部复制，父进程所有的资源全部通过数据结构的复制传递给子进程，而完成这一操作的函数就是上方所写的代码段。调用fork时，内核会在copy_mm函数中处理子进程的mm_struct，在copy_files函数中复制父进程已打开文件的相关信息，在copy_fs中复制进程所在文件系统的根目录和当前目录信息，在copy_sighand中复制进程对信号的处理方式。

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It also
 * copies the data segment in its entirety.  The "stack_start" and
 * "stack_top" arguments are simply passed along to the platform
 * specific copy_thread() routine.  Most platforms ignore stack_top.
 * For an example that's using stack_top, see
 * arch/ia64/kernel/process.c.
 *
 * Parameters:
 *   clone_flags - CLONE_* flags selecting what is shared with the parent;
 *                 the bits under CSIGNAL become the child's exit signal.
 *   stack_start, regs, stack_size - handed unchanged to copy_thread().
 *
 * Returns the new task's pid on success. On failure returns -EPERM
 * (CLONE_PID from a task whose pid is non-zero), -EAGAIN (per-user process
 * limit or max_threads reached), -ENOMEM (task_struct allocation or one of
 * the copy_*() helpers failed) or the error from copy_thread().
 *
 * With CLONE_VFORK the caller sleeps on a local semaphore after a
 * successful fork until the child ups it (see mm_release()).
 */

int do_fork(unsigned long clone_flags, unsigned long stack_start,struct pt_regs *regs, unsigned long stack_size)
{
    int retval = -ENOMEM;
    struct task_struct *p;
    DECLARE_MUTEX_LOCKED(sem);

    if (clone_flags & CLONE_PID) {
        /* This is only allowed from the boot up thread */

        if (current->pid)
            return -EPERM;
    }

    /* Publish the on-stack semaphore; the child reaches it via p_opptr->vfork_sem. */
    current->vfork_sem = &sem;

    p = alloc_task_struct();
    if (!p)
        goto fork_out;

    /* Start from an exact copy of the parent's task_struct. */
    *p = *current;

    retval = -EAGAIN;
    /* Per-user limit: refuse once the user's process count has reached RLIMIT_NPROC. */
    if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
        goto bad_fork_free;
    atomic_inc(&p->user->__count);
    atomic_inc(&p->user->processes);

    /*
     * Counter increases are protected by
     * the kernel lock so nr_threads can't
     * increase under us (but it may decrease).
     */

    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    get_exec_domain(p->exec_domain);

    if (p->binfmt && p->binfmt->module)
        __MOD_INC_USE_COUNT(p->binfmt->module);

    p->did_exec = 0;
    p->swappable = 0;
    p->state = TASK_UNINTERRUPTIBLE;

    copy_flags(clone_flags, p);
    p->pid = get_pid(clone_flags);

    p->run_list.next = NULL;
    p->run_list.prev = NULL;

    /*
     * Unless CLONE_PARENT is given without CLONE_VFORK, the caller becomes
     * the child's original parent, and also its current parent when the
     * child is not being ptraced. Otherwise the pointers copied from the
     * caller's task_struct are kept.
     */
    if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
        p->p_opptr = current;
        if (!(p->ptrace & PT_PTRACED))
            p->p_pptr = current;
    }
    p->p_cptr = NULL;
    init_waitqueue_head(&p->wait_chldexit);
    p->vfork_sem = NULL;
    spin_lock_init(&p->alloc_lock);

    /* The child starts with no pending signals. */
    p->sigpending = 0;
    init_sigpending(&p->pending);

    /* Interval timers are not inherited. */
    p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
    p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
    init_timer(&p->real_timer);
    p->real_timer.data = (unsigned long) p;

    p->leader = 0;       /* session leadership doesn't inherit */
    p->tty_old_pgrp = 0;
    /* CPU time accounting starts from zero. */
    p->times.tms_utime = p->times.tms_stime = 0;
    p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef CONFIG_SMP
    {
        int i;
        p->has_cpu = 0;
        p->processor = current->processor;
        /* ?? should we just memset this ?? */

        for(i = 0; i < smp_num_cpus; i++)
            p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
        spin_lock_init(&p->sigmask_lock);
    }
#endif
    p->lock_depth = -1;       /* -1 = no lock */
    p->start_time = jiffies;

    retval = -ENOMEM;
    /* copy all the process information */

    /* The helpers' own return values are not propagated; failure is reported as -ENOMEM. */
    if (copy_files(clone_flags, p))
        goto bad_fork_cleanup;
    if (copy_fs(clone_flags, p))
        goto bad_fork_cleanup_files;
    if (copy_sighand(clone_flags, p))
        goto bad_fork_cleanup_fs;
    if (copy_mm(clone_flags, p))
        goto bad_fork_cleanup_sighand;
    retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
    /*
     * NOTE(review): the cleanup ladder below has no step for the mm set up
     * by copy_mm(), so this failure path does not appear to release it -
     * confirm.
     */
    if (retval)
        goto bad_fork_cleanup_sighand;
    p->semundo = NULL;

    /* Our parent execution domain becomes current domain
       These must match for thread signalling to apply */

    p->parent_exec_id = p->self_exec_id;

    /* ok, now we should be set up.. */
    p->swappable = 1;
    p->exit_signal = clone_flags & CSIGNAL;
    p->pdeath_signal = 0;

    /*
     * "share" dynamic priority between parent and child, thus the
     * total amount of dynamic priorities in the system doesnt change,
     * more scheduling fairness. This is only important in the first
     * timeslice, on the long run the scheduling behaviour is unchanged.
     */
    p->counter = (current->counter + 1) >> 1;
    current->counter >>= 1;
    if (!current->counter)
        current->need_resched = 1;

    /*
     * Ok, add it to the run-queues and make it
     * visible to the rest of the system.
     *
     * Let it rip!
     */
    retval = p->pid;
    p->tgid = retval;
    INIT_LIST_HEAD(&p->thread_group);
    write_lock_irq(&tasklist_lock);
    if (clone_flags & CLONE_THREAD) {
        /* Same thread group as the caller. */
        p->tgid = current->tgid;
        list_add(&p->thread_group, &current->thread_group);
    }
    SET_LINKS(p);
    hash_pid(p);
    nr_threads++;
    write_unlock_irq(&tasklist_lock);

    /* A traced child gets SIGSTOP queued before it is woken. */
    if (p->ptrace & PT_PTRACED)
        send_sig(SIGSTOP, p, 1);

    wake_up_process(p);      /* do this last */

    ++total_forks;

fork_out:
    /* vfork: after a successful fork, sleep until the semaphore is upped. */
    if ((clone_flags & CLONE_VFORK) && (retval > 0))
        down(&sem);
    return retval;

    /*
     * Error unwinding: each label undoes one setup step and falls through
     * to the labels for the steps taken before it.
     */
bad_fork_cleanup_sighand:
    exit_sighand(p);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup:
    put_exec_domain(p->exec_domain);
    if (p->binfmt && p->binfmt->module)
        __MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
    atomic_dec(&p->user->processes);
    free_uid(p->user);
bad_fork_free:
    free_task_struct(p);
    goto fork_out;
}

如开头注释第一句所说，这部分代码是fork.c中最主要的函数。

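do_fork首先进行参数及权限检查：CLONE_PID标志只允许由启动线程（pid为0的进程）使用，否则返回-EPERM。之后为子进程分配task_struct，并整体复制父进程的task_struct；接着检查该用户的进程数是否超过RLIMIT_NPROC限制、系统线程总数是否达到max_threads；然后对从父进程继承来的task_struct各字段进行初始化，通过get_pid获取新的pid，依次调用copy_files、copy_fs、copy_sighand、copy_mm和copy_thread复制或共享各项资源，并把父进程剩余的时间片分一半给子进程。最后在持有tasklist_lock写锁的情况下将子进程链入进程链表并放入pidhash表，nr_threads加一，解锁后就可以唤醒子进程了。中间任何一步失败，都会通过goto跳转到函数尾部对应的bad_fork_*标号，按相反的顺序释放已经获得的资源。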

/*
 * Slab caches used by the fork path. All five pointers are filled in by
 * proc_caches_init().
 */

/* SLAB cache for signal_struct structures (tsk->sig) */
kmem_cache_t *sigact_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
kmem_cache_t *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
kmem_cache_t *fs_cachep;

/* SLAB cache for vm_area_struct structures */
kmem_cache_t *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
kmem_cache_t *mm_cachep;
 
/*
 * proc_caches_init - create the slab caches used when forking.
 *
 * Creates, in this order, the caches for signal_struct, files_struct,
 * fs_struct, vm_area_struct and mm_struct, each with SLAB_HWCACHE_ALIGN
 * and no constructor or destructor. Failure to create any of them ends in
 * panic().
 */
void __init proc_caches_init(void)
{
    sigact_cachep = kmem_cache_create("signal_act", sizeof(struct signal_struct),
                                      0, SLAB_HWCACHE_ALIGN, NULL, NULL);
    if (sigact_cachep == NULL)
        panic("Cannot create signal action SLAB cache");

    files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct),
                                     0, SLAB_HWCACHE_ALIGN, NULL, NULL);
    if (files_cachep == NULL)
        panic("Cannot create files SLAB cache");

    fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct),
                                  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
    if (fs_cachep == NULL)
        panic("Cannot create fs_struct SLAB cache");

    vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct),
                                       0, SLAB_HWCACHE_ALIGN, NULL, NULL);
    if (vm_area_cachep == NULL)
        panic("vma_init: Cannot alloc vm_area_struct SLAB cache");

    mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct),
                                  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
    if (mm_cachep == NULL)
        panic("vma_init: Cannot alloc mm_struct SLAB cache");
}