1. fork
The parent process creates a child process by calling fork:
fork enters the kernel through the system-call mechanism and arrives at sys_fork; for the details of that path, refer to "Linux内核源代码情景分析 - 系统调用" (the system-call chapter).
asmlinkage int sys_fork(struct pt_regs regs)
{
    return do_fork(SIGCHLD, regs.esp, &regs, 0);
}
int do_fork(unsigned long clone_flags, unsigned long stack_start, // stack_start is the user-space stack pointer
struct pt_regs *regs, unsigned long stack_size)
{
int retval = -ENOMEM;
struct task_struct *p;
DECLARE_MUTEX_LOCKED(sem);
if (clone_flags & CLONE_PID) {
/* This is only allowed from the boot up thread */
if (current->pid)
return -EPERM;
}
current->vfork_sem = &sem;// if the CLONE_VFORK bit of clone_flags is set, this semaphore is later up()'ed to wake the parent
p = alloc_task_struct();// allocate two consecutive physical pages for the child; the low end holds the child's task_struct, the high end serves as its system-space stack
if (!p)
goto fork_out;
*p = *current;// the parent's entire task_struct is copied into the child's data structure
retval = -EAGAIN;
if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
goto bad_fork_free;
atomic_inc(&p->user->__count);
atomic_inc(&p->user->processes);
/*
* Counter increases are protected by
* the kernel lock so nr_threads can't
* increase under us (but it may decrease).
*/
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
get_exec_domain(p->exec_domain);
if (p->binfmt && p->binfmt->module)
__MOD_INC_USE_COUNT(p->binfmt->module);
p->did_exec = 0;
p->swappable = 0;
p->state = TASK_UNINTERRUPTIBLE;// uninterruptible wait state
copy_flags(clone_flags, p);// slightly adjust and transform the flag bits in clone_flags, then write them into p->flags
p->pid = get_pid(clone_flags);// obtain a pid for the new process
p->run_list.next = NULL;
p->run_list.prev = NULL;
if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
p->p_opptr = current;
if (!(p->ptrace & PT_PTRACED))
p->p_pptr = current;
}
p->p_cptr = NULL;
init_waitqueue_head(&p->wait_chldexit);
p->vfork_sem = NULL;
spin_lock_init(&p->alloc_lock);
p->sigpending = 0;
init_sigpending(&p->pending);
p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
init_timer(&p->real_timer);
p->real_timer.data = (unsigned long) p;
p->leader = 0; /* session leadership doesn't inherit */
p->tty_old_pgrp = 0;
p->times.tms_utime = p->times.tms_stime = 0;
p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef CONFIG_SMP
{
int i;
p->has_cpu = 0;
p->processor = current->processor;
/* ?? should we just memset this ?? */
for(i = 0; i < smp_num_cpus; i++)
p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
spin_lock_init(&p->sigmask_lock);
}
#endif
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
retval = -ENOMEM;
/* copy all the process information */
if (copy_files(clone_flags, p))// conditionally copy the files_struct that controls open files; a real copy is made only when the CLONE_FILES bit of clone_flags is 0, otherwise the parent's pointer is simply shared
goto bad_fork_cleanup;
if (copy_fs(clone_flags, p))// conditionally copy the file-system-related structure fs_struct; a real copy is made only when the CLONE_FS bit of clone_flags is 0, otherwise the parent's pointer is simply shared
goto bad_fork_cleanup_files;
if (copy_sighand(clone_flags, p))// conditionally copy the signal-handling structure signal_struct; a real copy is made only when the CLONE_SIGHAND bit of clone_flags is 0, otherwise the parent's pointer is simply shared
goto bad_fork_cleanup_fs;
if (copy_mm(clone_flags, p))// conditionally copy the memory-management structure mm_struct and its subordinate vm_area_structs; a real copy is made only when the CLONE_VM bit of clone_flags is 0, otherwise the parent's pointer is simply shared
goto bad_fork_cleanup_sighand;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);// despite the name, this essentially just copies (part of) the parent's system-space stack for the child
if (retval)
goto bad_fork_cleanup_sighand;
p->semundo = NULL;
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
p->parent_exec_id = p->self_exec_id;
/* ok, now we should be set up.. */
p->swappable = 1;
p->exit_signal = clone_flags & CSIGNAL;// the signal this process should send to its parent when it calls exit(), taken from the CSIGNAL part of clone_flags
p->pdeath_signal = 0;
/*
* "share" dynamic priority between parent and child, thus the
* total amount of dynamic priorities in the system doesnt change,
* more scheduling fairness. This is only important in the first
* timeslice, on the long run the scheduling behaviour is unchanged.
*/
p->counter = (current->counter + 1) >> 1;
current->counter >>= 1;// the counter field of task_struct is the process's run-time quota; here the parent's quota is split in half so that parent and child each get half of the original value
if (!current->counter)
current->need_resched = 1;
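/*
 * For example, if current->counter was 7 before the fork, the child
 * gets (7 + 1) >> 1 == 4 and the parent keeps 7 >> 1 == 3, so the
 * total run-time quota in the system stays at 7.
 */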
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
*
* Let it rip!
*/
retval = p->pid;
p->tgid = retval;
INIT_LIST_HEAD(&p->thread_group);
write_lock_irq(&tasklist_lock);
if (clone_flags & CLONE_THREAD) {
p->tgid = current->tgid;
list_add(&p->thread_group, &current->thread_group);
}
SET_LINKS(p);// link the child's task_struct into the kernel's process list
hash_pid(p);// insert it into the hash queue determined by its pid
nr_threads++;// one more process in the system
write_unlock_irq(&tasklist_lock);
if (p->ptrace & PT_PTRACED)
send_sig(SIGSTOP, p, 1);
wake_up_process(p); // "wake up" the child, i.e. put it on the run queue to wait to be scheduled
++total_forks;
fork_out:
if ((clone_flags & CLONE_VFORK) && (retval > 0))// if the CLONE_VFORK bit of clone_flags is set
down(&sem);// make the parent perform a down() on the semaphore, which holds the parent back
return retval;// return p->pid, i.e. the child's pid
bad_fork_cleanup_sighand:
exit_sighand(p);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup:
put_exec_domain(p->exec_domain);
if (p->binfmt && p->binfmt->module)
__MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
atomic_dec(&p->user->processes);
free_uid(p->user);
bad_fork_free:
free_task_struct(p);
goto fork_out;
}
Here regs is a pointer to the pt_regs structure saved on the parent's system-space stack, and stack_start is the user-space stack pointer.
alloc_task_struct() allocates two consecutive physical pages for the child; the low end is used for the child's task_struct, while the high end serves as its system-space stack. The code is as follows:
#define alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1))
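Because the task_struct sits at the low end of these two pages and the kernel stack grows down from their high end, the kernel can recover the current task_struct from the stack pointer alone. As a minimal sketch, this is roughly how the current macro is implemented on i386 in this kernel generation: it masks off the low 13 bits of %esp, i.e. rounds down to the 8 KB boundary of the two-page block.
static inline struct task_struct * get_current(void)
{
    struct task_struct *current;

    /* 8191 == 2 * PAGE_SIZE - 1; clearing these bits of %esp yields the
     * start of the two-page block, which is the address of the task_struct */
    __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
    return current;
}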
copy_flags(), which slightly adjusts the bits in clone_flags and writes the result into p->flags, is as follows:
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
    unsigned long new_flags = p->flags;

    new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
    new_flags |= PF_FORKNOEXEC;
    if (!(clone_flags & CLONE_PTRACE))
        p->ptrace = 0;
    if (clone_flags & CLONE_VFORK)
        new_flags |= PF_VFORK;
    p->flags = new_flags;
}
For fork, clone_flags is just SIGCHLD, which lies entirely within the CSIGNAL byte, so none of the CLONE_FILES, CLONE_FS, CLONE_SIGHAND or CLONE_VM bits are set; copy_files, copy_fs, copy_sighand and copy_mm therefore all perform a real copy.
copy_files() is as follows:
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
    struct files_struct *oldf, *newf;
    struct file **old_fds, **new_fds;
    int open_files, nfds, size, i, error = 0;

    /*
     * A background process may not have any files ...
     */
    oldf = current->files;
    if (!oldf)
        goto out;

    if (clone_flags & CLONE_FILES) {    // the CLONE_FILES bit of clone_flags is set
        atomic_inc(&oldf->count);       // just increment the reference count
        goto out;
    }

    tsk->files = NULL;
    error = -ENOMEM;
    newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
    if (!newf)
        goto out;

    atomic_set(&newf->count, 1);

    newf->file_lock     = RW_LOCK_UNLOCKED;
    newf->next_fd       = 0;
    newf->max_fds       = NR_OPEN_DEFAULT;
    newf->max_fdset     = __FD_SETSIZE;
    newf->close_on_exec = &newf->close_on_exec_init;
    newf->open_fds      = &newf->open_fds_init;
    newf->fd            = &newf->fd_array[0];

    /* We don't yet have the oldf readlock, but even if the old
       fdset gets grown now, we'll only copy up to "size" fds */
    size = oldf->max_fdset;
    if (size > __FD_SETSIZE) {
        newf->max_fdset = 0;
        write_lock(&newf->file_lock);
        error = expand_fdset(newf, size);
        write_unlock(&newf->file_lock);
        if (error)
            goto out_release;
    }
    read_lock(&oldf->file_lock);

    open_files = count_open_files(oldf, size);

    /*
     * Check whether we need to allocate a larger fd array.
     * Note: we're not a clone task, so the open count won't
     * change.
     */
    nfds = NR_OPEN_DEFAULT;
    if (open_files > nfds) {
        read_unlock(&oldf->file_lock);
        newf->max_fds = 0;
        write_lock(&newf->file_lock);
        error = expand_fd_array(newf, open_files);
        write_unlock(&newf->file_lock);
        if (error)
            goto out_release;
        nfds = newf->max_fds;
        read_lock(&oldf->file_lock);
    }

    old_fds = oldf->fd;
    new_fds = newf->fd;

    memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
    memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

    for (i = open_files; i != 0; i--) {
        struct file *f = *old_fds++;
        if (f)
            get_file(f);
        *new_fds++ = f;
    }
    read_unlock(&oldf->file_lock);

    /* compute the remainder to be cleared */
    size = (newf->max_fds - open_files) * sizeof(struct file *);

    /* This is long word aligned thus could use a optimized version */
    memset(new_fds, 0, size);

    if (newf->max_fdset > open_files) {
        int left = (newf->max_fdset-open_files)/8;
        int start = open_files / (8 * sizeof(unsigned long));

        memset(&newf->open_fds->fds_bits[start], 0, left);
        memset(&newf->close_on_exec->fds_bits[start], 0, left);
    }

    tsk->files = newf;
    error = 0;
out:
    return error;

out_release:
    free_fdset (newf->close_on_exec, newf->max_fdset);
    free_fdset (newf->open_fds, newf->max_fdset);
    kmem_cache_free(files_cachep, newf);
    goto out;
}
We will analyze this in more detail after we have studied the file system.
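Note that even for fork, where the files_struct is really copied, each struct file is only reference-counted via get_file(), not duplicated, so parent and child share the same file offsets. A small user-space sketch demonstrates this (it assumes a hypothetical file test.txt with at least two bytes exists):
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/wait.h>

int main(void)
{
    char c;
    int fd = open("test.txt", O_RDONLY);   /* opened before fork */

    if (fd < 0)
        return 1;
    if (fork() == 0) {
        read(fd, &c, 1);                   /* child advances the shared offset */
        _exit(0);
    }
    wait(NULL);
    read(fd, &c, 1);                       /* parent reads the second byte:
                                              the offset lives in the shared struct file */
    printf("parent read byte 2: %c\n", c);
    return 0;
}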
copy_fs() is as follows:
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
    if (clone_flags & CLONE_FS) {          // the CLONE_FS bit of clone_flags is set
        atomic_inc(&current->fs->count);   // just increment the reference count
        return 0;
    }
    tsk->fs = __copy_fs_struct(current->fs);
    if (!tsk->fs)
        return -1;
    return 0;
}
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
    struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);

    /* We don't need to lock fs - think why ;-) */
    if (fs) {
        atomic_set(&fs->count, 1);
        fs->lock = RW_LOCK_UNLOCKED;
        fs->umask = old->umask;
        read_lock(&old->lock);
        fs->rootmnt = mntget(old->rootmnt);
        fs->root = dget(old->root);
        fs->pwdmnt = mntget(old->pwdmnt);
        fs->pwd = dget(old->pwd);
        if (old->altroot) {
            fs->altrootmnt = mntget(old->altrootmnt);
            fs->altroot = dget(old->altroot);
        } else {
            fs->altrootmnt = NULL;
            fs->altroot = NULL;
        }
        read_unlock(&old->lock);
    }
    return fs;
}
As we can see, only the fs_struct structure itself is copied here; the deeper data structures are not duplicated. Instead, mntget() and dget() increment the share counts in the corresponding vfsmount and dentry structures.
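Because fork does not set CLONE_FS, the child gets its own fs_struct, so a chdir() in the child does not affect the parent's working directory. A minimal user-space sketch illustrating this:
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>
#include <limits.h>

int main(void)
{
    char buf[PATH_MAX];

    if (fork() == 0) {
        chdir("/tmp");                      /* changes only the child's fs_struct */
        _exit(0);
    }
    wait(NULL);
    printf("parent cwd is still: %s\n", getcwd(buf, sizeof(buf)));
    return 0;
}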
copy_sighand() is as follows:
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
    struct signal_struct *sig;

    if (clone_flags & CLONE_SIGHAND) {      // the CLONE_SIGHAND bit of clone_flags is set
        atomic_inc(&current->sig->count);   // just increment the reference count
        return 0;
    }
    sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
    tsk->sig = sig;
    if (!sig)
        return -1;
    spin_lock_init(&sig->siglock);
    atomic_set(&sig->count, 1);
    memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
    return 0;
}
struct signal_struct {
    atomic_t            count;
    struct k_sigaction  action[_NSIG];
    spinlock_t          siglock;
};
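Because the action[] array is copied with memcpy(), signal dispositions installed before fork() are inherited by the child. A small user-space sketch:
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

static void handler(int sig)
{
    /* write() is async-signal-safe, unlike printf() */
    write(1, "handler ran\n", 12);
}

int main(void)
{
    signal(SIGUSR1, handler);               /* installed before fork, so it is copied */
    pid_t pid = fork();
    if (pid == 0) {
        raise(SIGUSR1);                     /* the child runs the inherited handler */
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    return 0;
}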
copy_mm() is as follows:
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
    struct mm_struct * mm, *oldmm;
    int retval;

    tsk->min_flt = tsk->maj_flt = 0;
    tsk->cmin_flt = tsk->cmaj_flt = 0;
    tsk->nswap = tsk->cnswap = 0;

    tsk->mm = NULL;
    tsk->active_mm = NULL;

    /*
     * Are we cloning a kernel thread?
     *
     * We need to steal a active VM for that..
     */
    oldmm = current->mm;
    if (!oldmm)                             // for a kernel thread oldmm is NULL, so return right away
        return 0;

    if (clone_flags & CLONE_VM) {           // the CLONE_VM bit of clone_flags is set
        atomic_inc(&oldmm->mm_users);       // just increment the mm_users count
        mm = oldmm;
        goto good_mm;
    }

    retval = -ENOMEM;                       // the CLONE_VM bit of clone_flags is 0
    mm = allocate_mm();                     // allocate an mm_struct
    if (!mm)
        goto fail_nomem;

    /* Copy the current MM stuff.. */
    memcpy(mm, oldmm, sizeof(*mm));
    if (!mm_init(mm))                       // initialize the mm_struct
        goto fail_nomem;

    down(&oldmm->mmap_sem);
    retval = dup_mmap(mm);                  // copy the vm_area_struct structures and the page tables
    up(&oldmm->mmap_sem);

    /*
     * Add it to the mmlist after the parent.
     *
     * Doing it this way means that we can order
     * the list, and fork() won't mess up the
     * ordering significantly.
     */
    spin_lock(&mmlist_lock);
    list_add(&mm->mmlist, &oldmm->mmlist);
    spin_unlock(&mmlist_lock);

    if (retval)
        goto free_pt;

    /*
     * child gets a private LDT (if there was an LDT in the parent)
     */
    copy_segments(tsk, mm);                 // LDT handling, which we are not concerned with here

    if (init_new_context(tsk,mm))           // an empty statement on i386
        goto free_pt;

good_mm:
    tsk->mm = mm;
    tsk->active_mm = mm;
    return 0;

free_pt:
    mmput(mm);
fail_nomem:
    return retval;
}
Clearly, the mm_struct is really copied only when the CLONE_VM bit of clone_flags is 0; otherwise the already-copied pointer is used to share the parent's user space. Copying the mm_struct is not limited to the structure itself; it also extends to the deeper data structures, the most important of which are the vm_area_struct structures and the page tables, copied by dup_mmap().
#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
static struct mm_struct * mm_init(struct mm_struct * mm)
{
    atomic_set(&mm->mm_users, 1);
    atomic_set(&mm->mm_count, 1);
    init_MUTEX(&mm->mmap_sem);
    mm->page_table_lock = SPIN_LOCK_UNLOCKED;
    mm->pgd = pgd_alloc();                  // points to a newly allocated page directory
    if (mm->pgd)
        return mm;
    free_mm(mm);
    return NULL;
}
static inline int dup_mmap(struct mm_struct * mm)
{
    struct vm_area_struct * mpnt, *tmp, **pprev;
    int retval;

    flush_cache_mm(current->mm);
    mm->locked_vm = 0;
    mm->mmap = NULL;
    mm->mmap_avl = NULL;
    mm->mmap_cache = NULL;
    mm->map_count = 0;
    mm->cpu_vm_mask = 0;
    mm->swap_cnt = 0;
    mm->swap_address = 0;
    pprev = &mm->mmap;
    for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {  // iterate over all of the parent's virtual areas
        struct file *file;

        retval = -ENOMEM;
        if(mpnt->vm_flags & VM_DONTCOPY)
            continue;
        tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);    // allocate a vm_area_struct for the child
        if (!tmp)
            goto fail_nomem;
        *tmp = *mpnt;                       // copy the parent's vm_area_struct into the child's
        tmp->vm_flags &= ~VM_LOCKED;
        tmp->vm_mm = mm;
        mm->map_count++;                    // one more virtual area
        tmp->vm_next = NULL;
        file = tmp->vm_file;
        if (file) {                         // assume NULL for our discussion
            struct inode *inode = file->f_dentry->d_inode;
            get_file(file);
            if (tmp->vm_flags & VM_DENYWRITE)
                atomic_dec(&inode->i_writecount);

            /* insert tmp into the share list, just after mpnt */
            spin_lock(&inode->i_mapping->i_shared_lock);
            if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                mpnt->vm_next_share->vm_pprev_share = &tmp->vm_next_share;
            mpnt->vm_next_share = tmp;
            tmp->vm_pprev_share = &mpnt->vm_next_share;
            spin_unlock(&inode->i_mapping->i_shared_lock);
        }

        /* Copy the pages, but defer checking for errors */
        retval = copy_page_range(mm, current->mm, tmp);     // copy the page-directory and page-table entries covering this area
        if (!retval && tmp->vm_ops && tmp->vm_ops->open)
            tmp->vm_ops->open(tmp);

        /*
         * Link in the new vma even if an error occurred,
         * so that exit_mmap() can clean up the mess.
         */
        *pprev = tmp;                       // move on to the next virtual area
        pprev = &tmp->vm_next;

        if (retval)
            goto fail_nomem;
    }
    retval = 0;
    if (mm->map_count >= AVL_MIN_MAP_COUNT)     // once the number of areas reaches AVL_MIN_MAP_COUNT
        build_mmap_avl(mm);                     // build an AVL tree to speed up lookups

fail_nomem:
    flush_tlb_mm(current->mm);
    return retval;
}
copy_page_range() is as follows:
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
            struct vm_area_struct *vma)
{
    pgd_t * src_pgd, * dst_pgd;
    unsigned long address = vma->vm_start;
    unsigned long end = vma->vm_end;
    unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; // writable, but not shared

    src_pgd = pgd_offset(src, address)-1;
    dst_pgd = pgd_offset(dst, address)-1;

    for (;;) {
        pmd_t * src_pmd, * dst_pmd;

        src_pgd++; dst_pgd++;

        /* copy_pmd_range */

        if (pgd_none(*src_pgd))
            goto skip_copy_pmd_range;
        if (pgd_bad(*src_pgd)) {
            pgd_ERROR(*src_pgd);
            pgd_clear(src_pgd);
skip_copy_pmd_range:    address = (address + PGDIR_SIZE) & PGDIR_MASK;
            if (!address || (address >= end))
                goto out;
            continue;
        }
        if (pgd_none(*dst_pgd)) {
            if (!pmd_alloc(dst_pgd, 0))
                goto nomem;
        }

        src_pmd = pmd_offset(src_pgd, address);
        dst_pmd = pmd_offset(dst_pgd, address);

        do {
            pte_t * src_pte, * dst_pte;

            /* copy_pte_range */

            if (pmd_none(*src_pmd))
                goto skip_copy_pte_range;
            if (pmd_bad(*src_pmd)) {
                pmd_ERROR(*src_pmd);
                pmd_clear(src_pmd);
skip_copy_pte_range:        address = (address + PMD_SIZE) & PMD_MASK;
                if (address >= end)
                    goto out;
                goto cont_copy_pmd_range;
            }
            if (pmd_none(*dst_pmd)) {
                if (!pte_alloc(dst_pmd, 0))
                    goto nomem;
            }

            src_pte = pte_offset(src_pmd, address);
            dst_pte = pte_offset(dst_pmd, address);

            do {
                pte_t pte = *src_pte;
                struct page *ptepage;

                /* copy_one_pte */

                if (pte_none(pte))                  // case 1
                    goto cont_copy_pte_range_noset;
                if (!pte_present(pte)) {            // case 2
                    swap_duplicate(pte_to_swp_entry(pte));
                    goto cont_copy_pte_range;
                }
                ptepage = pte_page(pte);            // the page the entry points to
                if ((!VALID_PAGE(ptepage)) ||       // case 3
                    PageReserved(ptepage))
                    goto cont_copy_pte_range;

                /* If it's a COW mapping, write protect it both in the parent and the child */
                if (cow) {                          // case 4
                    ptep_set_wrprotect(src_pte);    // make it read-only
                    pte = *src_pte;
                }

                /* If it's a shared mapping, mark it clean in the child */
                if (vma->vm_flags & VM_SHARED)
                    pte = pte_mkclean(pte);
                pte = pte_mkold(pte);
                get_page(ptepage);                  // increment the page's use count
                                                    // when cow is 0 this is a read-only page: case 5

cont_copy_pte_range:        set_pte(dst_pte, pte);  // copy this entry into the child's page table
cont_copy_pte_range_noset:  address += PAGE_SIZE;
                if (address >= end)
                    goto out;
                src_pte++;
                dst_pte++;
            } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:    src_pmd++;
            dst_pmd++;
        } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
    }
out:
    return 0;

nomem:
    return -ENOMEM;
}
The outermost loop iterates over page-directory entries, the middle loop over middle-directory entries, and the innermost loop over page-table entries. We concentrate on the innermost loop, i.e. the loop over page-table entries.
The loop examines every entry in one of the parent's page tables and decides what to do based on the entry's content, which can only be one of the following cases:
1. The entry is all zeros, so pte_none() returns 1. The mapping for this page has not been established yet, or in other words it is a "hole", so nothing needs to be done.
2. The lowest bit of the entry, the _PAGE_PRESENT flag, is 0, so pte_present() returns 0. The mapping has been established, but the page is currently not in memory; it has been swapped out to the swap device. The entry now describes the location of the page on that device, and the on-device page has gained one more user, so swap_duplicate() is called to increment its share count; control then transfers to cont_copy_pte_range, which copies the entry into the child's page table.
3. The mapping has been established, but the physical page is not a valid memory page, so VALID_PAGE() returns 0. Recall from earlier that some physical pages live on peripheral interface cards; their addresses are "bus addresses" rather than memory pages. For such pages, control goes to cont_copy_pte_range, which copies the entry into the child's page table.
4. A writable page that needs to be copied from the parent. In principle, a free memory page should be allocated now, the contents copied over from the parent's page, and a mapping set up for it. That is obviously not cheap. And would the child actually use such a laboriously copied page, in particular would it ever write to it? If the child only reads the page, then as long as the parent never writes it again, the page can be shared simply by copying the pointer, which saves a great deal of work. The Linux kernel therefore uses a technique called "copy on write": it first shares the page temporarily by copying only the page-table entry, and defers allocating and copying a page until the child really writes to it. The variable cow is short for "copy on write": writable, but not shared. In practice, cow is 1 for the vast majority of writable virtual areas. Two important things are done when a page-table entry is copied to share a page temporarily: first the parent's entry is changed to write-protected (read-only), and then the already write-protected entry is installed in the child's page table. As a result the page becomes "read-only" in both processes, and any attempt by either the parent or the child to write to it raises a page fault. The page-fault handler responds by allocating another physical page, genuinely copying the contents into it so that parent and child each own their own physical page, and then changing the corresponding entry in the two page tables back to writable. Copy-on-write can only work, however, when parent and child each have their own page tables. When the CLONE_VM bit is set, parent and child share the user space through the pointer, so copy-on-write does not apply; in that case the two processes truly share their user space, and whatever the parent writes into its user space is simultaneously "written" into the child's. (The user-space sketch after this list shows copy-on-write in action.)
5. A read-only page of the parent. Such a page does not need to be copied at all, so the page-table entry can simply be copied and the physical page shared.
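The effect of copy-on-write can be observed directly from user space: after fork() the parent and the child behave as if they had independent copies of every writable page, even though the pages are physically shared until one side writes. A minimal sketch:
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int global = 1;                             /* lives in a writable, non-shared page, so cow == 1 */

int main(void)
{
    pid_t pid = fork();
    if (pid == 0) {
        global = 42;                        /* write fault: the kernel now gives the child its own page */
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    printf("parent still sees global == %d\n", global);   /* prints 1 */
    return 0;
}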
Back in do_fork(), execution continues with copy_thread(), whose code is as follows:
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, unsigned long unused,
    struct task_struct * p, struct pt_regs * regs)
{
    struct pt_regs * childregs;

    childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1; // points to the pt_regs structure on the child's system-space stack
    struct_cpy(childregs, regs);        // copy the pt_regs structure from the current process's system-space stack
    childregs->eax = 0;                 // eax in the child's pt_regs is set to 0
    childregs->esp = esp;               // esp in the child's pt_regs is set to the esp parameter; for fork this comes from regs.esp just before do_fork(), so it is effectively unchanged

    p->thread.esp = (unsigned long) childregs;      // start address of the pt_regs structure on the child's system-space stack
    p->thread.esp0 = (unsigned long) (childregs+1); // points to the top of the child's system-space stack

    p->thread.eip = (unsigned long) ret_from_fork;

    savesegment(fs,p->thread.fs);
    savesegment(gs,p->thread.gs);

    unlazy_fpu(current);
    struct_cpy(&p->thread.i387, &current->thread.i387);

    return 0;
}
The end result is the layout shown in the figure of the original post: a copy of the parent's pt_regs sits at the top of the child's system-space stack, thread.esp0 points to the stack top, thread.esp points to that pt_regs structure, and thread.eip points to ret_from_fork.
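Setting childregs->eax to 0 is what makes fork() appear to return twice with different values: the parent gets the child's pid as do_fork()'s return value, while the child, resuming at ret_from_fork with the copied pt_regs, finds 0 in eax. The classic user-space pattern relies on exactly this:
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
    pid_t pid = fork();
    if (pid == 0) {
        /* child: eax in its pt_regs was set to 0 by copy_thread() */
        printf("child:  fork() returned 0, my pid is %d\n", getpid());
        _exit(0);
    }
    /* parent: do_fork() returned the child's pid */
    printf("parent: fork() returned %d\n", pid);
    waitpid(pid, NULL, 0);
    return 0;
}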
2. clone and vfork
The user-space interface of clone is: int clone(int (*fn)(void *arg), void *child_stack, int flags, void *arg).
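To make the kernel-side handling below more concrete, here is a minimal user-space sketch of calling clone() with CLONE_VM: the child runs child_fn on its own stack but shares the parent's address space. The function name child_fn and the 64 KB stack size are illustrative choices, not taken from the original post.
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>

static int child_fn(void *arg)
{
    *(int *)arg = 42;                       /* visible to the parent because of CLONE_VM */
    return 0;
}

int main(void)
{
    int value = 1;
    char *stack = malloc(64 * 1024);
    if (!stack)
        return 1;
    /* the stack grows downward on i386/x86-64, so pass the high end of the buffer */
    pid_t pid = clone(child_fn, stack + 64 * 1024,
                      CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD, &value);
    if (pid < 0)
        return 1;
    waitpid(pid, NULL, 0);                  /* SIGCHLD makes the child waitable like a fork child */
    printf("parent sees value == %d\n", value);    /* prints 42 */
    free(stack);
    return 0;
}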
asmlinkage int sys_fork(struct pt_regs regs)
{
    return do_fork(SIGCHLD, regs.esp, &regs, 0);
}

asmlinkage int sys_clone(struct pt_regs regs)
{
    unsigned long clone_flags;
    unsigned long newsp;

    clone_flags = regs.ebx;     // the flags argument passed from user space
    newsp = regs.ecx;           // the child_stack argument passed from user space
    if (!newsp)
        newsp = regs.esp;
    return do_fork(clone_flags, newsp, &regs, 0);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage int sys_vfork(struct pt_regs regs)
{
    return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);   // the main difference is the two extra flags CLONE_VFORK and CLONE_VM
}
When the CLONE_VM bit is set, copy_mm() does not copy anything; it merely shares the parent's mm_struct:
if (clone_flags & CLONE_VM) {           // the CLONE_VM bit of clone_flags is set
    atomic_inc(&oldmm->mm_users);       // just increment the mm_users count
    mm = oldmm;
    goto good_mm;
}
And at the end of do_fork():
fork_out:
    if ((clone_flags & CLONE_VFORK) && (retval > 0))    // if the CLONE_VFORK bit of clone_flags is set
        down(&sem);     // make the parent perform a down() on the semaphore, which holds the parent back
    return retval;
When the CLONE_VFORK bit in the clone_flags passed to do_fork() is set, the child must be allowed to run first, and the parent may not resume until the child either executes a new program via the execve system call or leaves the system via exit(). Why? When the child is created with CLONE_VM set, the pointer to the mm_struct in the parent's task_struct is simply copied and shared. Parent and child then share their user space in the literal sense: whatever the parent writes into its user space is simultaneously "written" into the child's user space. The two processes must never both return to user space and run concurrently; otherwise both would inevitably end up in chaos or die from illegal out-of-bounds accesses. The only solution is to "detain" one of the processes and let only one return to user space, until the two no longer share their user space or one of them has exited.
The parent is woken up in mm_release(), which is invoked when the child releases its user space (on execve or exit):
void mm_release(void)
{
    struct task_struct *tsk = current;

    /* notify parent sleeping on vfork() */
    if (tsk->flags & PF_VFORK) {
        tsk->flags &= ~PF_VFORK;
        up(tsk->p_opptr->vfork_sem);
    }
}
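The canonical use of vfork() therefore looks like the following user-space sketch: the child does nothing but exec (or _exit), and the parent stays blocked until mm_release() performs the up():
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
    pid_t pid = vfork();                    /* parent is held on the vfork semaphore */
    if (pid == 0) {
        execlp("echo", "echo", "hello from the child", (char *)NULL);
        _exit(127);                         /* must _exit(), never return, if exec fails */
    }
    waitpid(pid, NULL, 0);
    printf("parent resumed only after the child exec'ed or exited\n");
    return 0;
}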
Kernel threads are created with kernel_thread(), which issues the clone system call directly from kernel mode:
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
    long retval, d0;

    __asm__ __volatile__(
        "movl %%esp,%%esi\n\t"      // save the stack pointer before the system call in esi
        "int $0x80\n\t"             // perform the system call
        "cmpl %%esp,%%esi\n\t"      // compare the stack pointer after the call with the one before: different means we are the child, equal means we are the parent
        "je 1f\n\t"                 // parent: jump forward
        "movl %4,%%eax\n\t"         // child: move arg into eax and push it onto the stack as the argument
        "pushl %%eax\n\t"
        "call *%5\n\t"              // call fn
        "movl %3,%0\n\t"            // eax = __NR_exit
        "int $0x80\n"               // perform the exit system call
        "1:\t"
        :"=&a" (retval), "=&S" (d0)
        :"0" (__NR_clone), "i" (__NR_exit),     // eax = __NR_clone
         "r" (arg), "r" (fn),
         "b" (flags | CLONE_VM)                 // ebx = flags | CLONE_VM
        : "memory");
    return retval;
}
asmlinkage int sys_clone(struct pt_regs regs)
{
    unsigned long clone_flags;
    unsigned long newsp;

    clone_flags = regs.ebx;     // this is the flags | CLONE_VM placed in ebx by kernel_thread()
    newsp = regs.ecx;           // newsp is NULL here
    if (!newsp)
        newsp = regs.esp;
    return do_fork(clone_flags, newsp, &regs, 0);
}
A thread created this way by kernel_thread() is a kernel thread whose mm pointer is NULL; this case is handled in copy_mm() by the following code:
oldmm = current->mm;
if (!oldmm)     // for a kernel thread oldmm is NULL, so return right away
    return 0;
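As a rough illustration of how kernel_thread() is typically used in 2.4-era code, the sketch below creates a thread that sleeps and does some periodic work. The name my_thread_fn, the "mythread" label and the exact flag combination are illustrative assumptions, not taken from the original post.
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/string.h>

static int my_thread_fn(void *unused)
{
    daemonize();                            /* detach from inherited user-space resources (2.4 style) */
    strcpy(current->comm, "mythread");

    for (;;) {
        /* ... do the periodic work of the thread here ... */
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(HZ);               /* sleep for roughly one second */
        if (signal_pending(current))
            break;
    }
    return 0;
}

/* somewhere in initialization code:
 * kernel_thread(my_thread_fn, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 */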
Finally, here is what all the flag bits mean:
#define CSIGNAL         0x000000ff  /* signal mask to be sent at exit */
#define CLONE_VM        0x00000100  /* set if VM shared between processes */
#define CLONE_FS        0x00000200  /* set if fs info shared between processes */
#define CLONE_FILES     0x00000400  /* set if open files shared between processes */
#define CLONE_SIGHAND   0x00000800  /* set if signal handlers and blocked signals shared */
#define CLONE_PID       0x00001000  /* set if pid shared */
#define CLONE_PTRACE    0x00002000  /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK     0x00004000  /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT    0x00008000  /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD    0x00010000  /* Same thread group? */
#define CLONE_SIGNAL    (CLONE_SIGHAND | CLONE_THREAD)
Original article: http://blog.csdn.net/jltxgcy/article/details/44451933