标签:
本节主要研究进程(线程)创建的过程,下文将不区分进程和线程;
在linux系统中,第一个进程是系统固有的,是由内核的设计者安排好的;一个新的进程一定要由一个已存在的进程复制出来,而不是创造出来的,其实linux系统并不提供直接创建进
程的方法;创建了子进程以后,父进程有三种选择:第一,继续走自己的路,与子进程分道扬镳,但是如果子进程先行exit(),那么将要向父进程发一个信号;第二,父进程也可以选择睡眠,等子进程
exit()(去世)以后再被唤醒继续执行,可使用wait4()等待某个特定的子进程,wait3()等待任何一个子进程;第三,自己exit()(exit()是每一个可执行程序映像所必有的,因此在子进程中执行完
后,不会返回);linux将进程的创建与目标程序的执行分成两步;
(1)从一个已存在的父进程像细胞分裂一样地复制出一个子进程;实际复制出来的子进程有自己的task_struct和系统空间堆栈,但是与父进程共享其他资源;例如,要是父进程打开了5个文件,那么子进程也打开了这5个文件,而且这些文件的读写位置处于相同的位置;fork()是全部复制,父进程的所有资源全部通过数据结构复制给子进程,但进程号不一样;clone()则是带有参数的选择性复制,可复制出一个线程,其他资源通过指针与父进程共享;vfork()是除了task_struct和系统空间堆栈外,其余资源全部通过指针与父进程共享,因此复制出来的是个线程,效率很高;
(2)目标程序的执行:创建一个子进程是因为有不同的目标程序要让新的进程去执行,所以复制完以后,子进程就要与父进程分道扬镳了,用execve()执行以文件形式存在的可执行程序映像;
在(1)中,复制时只复制进程基本资源,如task_struct,系统空间堆栈,页面表等,不包括父进程的代码和全局变量,这些通过只读方式的共享,在需要写的时候,通过copy_on_write()为所涉及的页面建立一个新的副本;
(1)clone()主要是用来创建一个线程,包括用户线程和内核线程;创建用户线程时,可以给定子线程用户空间堆栈位置,它也可以用来创建进程,有选择性的复制父进程的资源;fork()则是全面的复制;vfork()是为了提高创建时的效率,减少系统开销;
(2)Linux内核中确实有一个创建内核线程的函数kernel_thread(),供内核线程调用;它是对clone()的包装,并不执行execve(),而是执行内核中的某一个函数;该函数会返回,因此返回后要执行一个exit()系统调用;
(3)fork,vfork,clone这三个系统调用都调用do_fork(),只不过调用的参数不一样,下面主要来讲解do_fork();
int sys_fork(struct pt_regs *regs) { //clone_flags中的SIGCHLD return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); }
int sys_vfork(struct pt_regs *regs) { //共享CLONE_VFORK和VM return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); }
//clone负责建立起轻量级进程(可以与其他进程共享地址空间,或打开文件等),newsp是指用户堆栈指针,parent_tid表示父进程的 //的用户变量地址,child_tid表示新的轻量级进程的用户变量地址: long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) newsp = regs->sp; //有新的用户栈地址 //其中clone_flags一般有参数SIGCHLD,占用一个字节,剩余的3个字节可制定,如共享内存描述符,页表,文件目录,信号处理标,跟踪等 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); }
说明几点
(1)newsp为子进程新的栈,该栈可能在另一个地址空间;
/* * Create a kernel thread */ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct pt_regs regs; memset(®s, 0, sizeof(regs)); regs.si = (unsigned long) fn; regs.di = (unsigned long) arg; #ifdef CONFIG_X86_32 regs.ds = __USER_DS; regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU;: regs.gs = __KERNEL_STACK_CANARY; #else regs.ss = __KERNEL_DS; #endif regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. */ //其中CLONE_VM避免调用进程的页表,内核线程是不用访问用户态的地址空间;不会被跟踪的 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); }
/* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ //sys_clone //regs是指通用寄存器指针,它是一个轻量级进程在用户态切换到内核态,保存到内核堆栈中 long do_fork(unsigned long clone_flags, unsigned long stack_start, //用户状态下栈的起始地址 struct pt_regs *regs, //指向寄存器集合的指针 unsigned long stack_size, //用户状态下,栈的大小 int __user *parent_tidptr, //指向用户空间中地址的两个指针 int __user *child_tidptr) { struct task_struct *p; int trace = 0; long nr; /* * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ if (clone_flags & CLONE_NEWUSER) { //创建新的用户 if (clone_flags & CLONE_THREAD) //但是没有创建新的线程 return -EINVAL; /* hopefully this check will go away when userns support is * complete */ if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || !capable(CAP_SETGID)) return -EPERM; } /* * When called from kernel_thread, don't do user tracing stuff. */ if (likely(user_mode(regs))) trace = tracehook_prepare_clone(clone_flags); //执行生成新进程的实际工作 p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ if (!IS_ERR(p)) { struct completion vfork; trace_sched_process_fork(current, p); nr = task_pid_vnr(p); //获得当前的局部nr if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); //将nr复制到对应的用户空间指向的地址 if (clone_flags & CLONE_VFORK) { //如果是执行vfork这个函数,父进程会睡眠下去 p->vfork_done = &vfork; init_completion(&vfork); //睡眠,此时父进程等子进程 } //schedule_tail audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that * hasn't gotten to tracehook_report_clone() yet. Now we * clear it and set the child going. 
*/ p->flags &= ~PF_STARTING; wake_up_new_task(p, clone_flags); //将子进程的task_struct放入到新调度器队列中 tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); //如果设置了CLONE_VFORK, //就把父进程插入到等待队列中,直到子进程释放了自己的内存地址空间(也就是子进程结束或执行新的程序) if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); //父进程将在改变量上进入睡眠状态 freezer_count(); tracehook_report_vfork_done(p, nr); } } else { nr = PTR_ERR(p); } return nr; }说明几点
(1)p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace);执行实际的进程复制工作;
(2)if (clone_flags & CLONE_VFORK) 表示如果是执行vfork这个函数,父进程会睡眠下去;
copy_process中关键代码1
设置task_struct和系统堆栈
//task_struct可以在内存中的任何位置 p = dup_task_struct(current); //为子进程获取并设置进程描述符,并且设置好了thread_info if (!p) goto fork_out;
static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; int err; prepare_to_copy(orig); //保存FPU等寄存器内容到thread_info中 tsk = alloc_task_struct(); //kem, 获取新的进程描述符task_struct的内存 if (!tsk) return NULL; ti = alloc_thread_info(tsk); //task无用处,使用get_free_pages获得两个页大小的内存 if (!ti) { //ti若分配失败,还要释放原内存 free_task_struct(tsk); return NULL; } err = arch_dup_task_struct(tsk, orig); //将旧的task_struct复制给新的task_struct if (err) goto out; tsk->stack = ti; //改变新进程的stack指向到新的thread_info中 err = prop_local_init_single(&tsk->dirties); if (err) goto out; setup_thread_stack(tsk, orig); //链接task_struct和thread_info,确定内存布局,相互指向 clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); #endif /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); //要将新进程的使用计数置为2 atomic_set(&tsk->fs_excl, 0); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; account_kernel_stack(ti, 1); return tsk; out: free_thread_info(ti); free_task_struct(tsk); return NULL; }
/*
 * arch_dup_task_struct - copy @src into @dst, giving @dst its own FPU area.
 * @dst: destination task_struct
 * @src: source task_struct
 *
 * Returns 0 on success, or the error from fpu_alloc() if the FPU area for
 * @dst cannot be allocated.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	int ret;

	/* Structure copy of the whole process descriptor. */
	*dst = *src;

	/*
	 * If the source has an FPU area, the structure copy above left dst
	 * with the same fpu contents; clear them, allocate a separate area
	 * and copy the FPU state across.
	 */
	if (fpu_allocated(&src->thread.fpu)) {
		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
		ret = fpu_alloc(&dst->thread.fpu);
		if (ret)
			return ret;
		fpu_copy(&dst->thread.fpu, &src->thread.fpu);
	}
	return 0;
}
copy_process中关键代码2
一些字段的设置
p->did_exec = 0; //记录进程发出execve()的次数 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); //初始化好信号处理 //初始化cpu的统计字段 p->utime = cputime_zero; p->stime = cputime_zero; p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero;
copy_process中关键代码2
设置子进程的调度信息
/* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); //完成对新进程调度程序数据结构的初始化
copy_process中关键代码3
复制和共享进程的各个部分
if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) goto bad_fork_cleanup_audit; if ((retval = copy_files(clone_flags, p))) goto bad_fork_cleanup_semundo; if ((retval = copy_fs(clone_flags, p))) goto bad_fork_cleanup_files; if ((retval = copy_sighand(clone_flags, p))) goto bad_fork_cleanup_fs; if ((retval = copy_signal(clone_flags, p))) goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) //进程地址空间的处理 goto bad_fork_cleanup_signal; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; //设置子进程的内核栈 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
copy_files
static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; int error = 0; /* * A background process may not have any files ... */ oldf = current->files; //原进程的files_struct if (!oldf) goto out; if (clone_flags & CLONE_FILES) { //共享打开的文件表 atomic_inc(&oldf->count); //增加引用计数 goto out; } newf = dup_fd(oldf, &error); if (!newf) goto out; tsk->files = newf; error = 0; out: return error; }
copy_thread
int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { struct pt_regs *childregs; struct task_struct *tsk; int err; //填充包含了所有的寄存器 childregs = task_pt_regs(p); *childregs = *regs; childregs->ax = 0; //子进程的返回值,为0 childregs->sp = sp; //子进程的用户空间栈地址 p->thread.sp = (unsigned long) childregs; //指向子进程的用户空间 p->thread.sp0 = (unsigned long) (childregs+1); //指向子进程系统空间堆栈中的pt_regs p->thread.ip = (unsigned long) ret_from_fork; //子进程开始调用的函数 task_user_gs(p) = get_user_gs(regs); p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; return -ENOMEM; } set_tsk_thread_flag(p, TIF_IO_BITMAP); } err = 0; /* * Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } return err; }
copy_process中关键代码4
获得子进程pid
if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns); //分配好一个pid if (!pid) goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); if (retval < 0) goto bad_fork_free_pid; } } p->pid = pid_nr(pid); //得到全局的nr p->tgid = p->pid; if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; //设置好线程组id if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) goto bad_fork_free_pid; } //改变子进程用户地址空间的child_tidptr的内存值 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; //也保存在对应的值上面去 /* * Clear TID on mm_release()? */ //在mm_release时,将0写到child_tidptr中去 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
copy_process中关键代码5
线程还是进程
/* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { //在父进程的同一进程组,同一个父亲 p->real_parent = current->real_parent; //当前线程和创建的线程的父亲同一个线程 p->parent_exec_id = current->parent_exec_id; } else { //否则real_parent指向本进程 p->real_parent = current; p->parent_exec_id = current->self_exec_id; }
线程
if (clone_flags & CLONE_THREAD) { //子进程放入到同一线程组去 current->signal->nr_threads++; atomic_inc(&current->signal->live); atomic_inc(&current->signal->sigcnt); p->group_leader = current->group_leader; //指向线程组组长 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); //加入到线程组中去 }
/*
 * sys_execve() executes a new program.
 *
 * @name: user-space pointer to the program's path name
 * @argv: user-space argument vector
 * @envp: user-space environment vector
 * @regs: caller's register set, handed on to do_execve()
 *
 * Returns PTR_ERR() of the getname() result if that fails, otherwise the
 * result of do_execve().
 */
long sys_execve(const char __user *name,
		const char __user *const __user *argv,
		const char __user *const __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	/* Obtain the executable's file name from the user-space pointer. */
	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);

#ifdef CONFIG_X86_32
	if (error == 0) {
		/* Make sure we don't return using sysenter.. */
		set_thread_flag(TIF_IRET);
	}
#endif

	putname(filename);
	return error;
}
do_execve中关键代码
file = open_exec(filename); //找到file retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; sched_exec(); bprm->file = file; //设置相应的参数 bprm->filename = filename; //名称 bprm->interp = filename; retval = bprm_mm_init(bprm); if (retval) goto out_file; bprm->argc = count(argv, MAX_ARG_STRINGS); //计算长度
retval = search_binary_handler(bprm,regs); //用于找到一种适当的二进制格式,如a.out, elf格式等
int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) { unsigned int depth = bprm->recursion_depth; int try,retval; struct linux_binfmt *fmt; retval = security_bprm_check(bprm); if (retval) return retval; /* kernel module loader fixup */ /* so we don't try to load run modprobe in kernel space. */ set_fs(USER_DS); retval = audit_bprm(bprm); if (retval) return retval; retval = -ENOENT; for (try=0; try<2; try++) { //装入模块后,需再尝试一次 read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; if (!fn) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fn(bprm, regs); //执行对应的装入函数,例如load_aout_binary
说明几点
(1)load_aout_binary为a.out可执行文件格式的装入,此外还支持elf和脚本等格式文件的装入;
标签:
原文地址:http://blog.csdn.net/skyuppour/article/details/45725055