本文主要介绍linux内核中进程地址空间的数据结构描述(包括mm_struct/vm_area_struct)以及进程线性地址区间的分配流程,并对相应的源代码做了注释。
内核中的函数以相当直截了当的方式获得动态内存。当给用户态进程分配内存时,情况就完全不同了。进程对动态内存的请求被认为是不紧迫的,一般来说,内核总是尽量推迟给用户态进程分配内存。由于用户进程是不可信任的,因此,内核必须能随时准备捕获用户态进程引起的所有寻址错误。当用户态进程请求动态内存时,并没有获得请求的页框,而仅仅获得对一个新的线性地址区间的使用权,而这一线性地址区间就成为进程地址空间的一部分。
进程地址空间由允许进程使用的全部线性地址组成。内核可以通过增加或删除某些线性地址区间来动态地修改进程的地址空间。内核通过所谓线性区的资源来表示线性地址区间,线性区是由起始线性地址、长度和一些访问权限来描述的。进程获得新线性区的一些典型情况:
1.当用户在控制台输入一条命令时,shell进程创建一个新的进程去执行这个命令。结果是,一个全新的地址空间(也就是一组线性区)分配给新进程。
2.正在运行的进程有可能决定装入一个完全不同的程序。这时,进程描述符不变,可是在装入这个程序以前所有的线性区却被释放,并有一组新的线性区被分配给这个进程。
3.正在运行的进程可能对一个文件执行内存映射。
4.进程可能持续向它的用户态堆栈增加数据,直到映射这个堆栈的线性区用完为止,此时,内核也许会决定扩展这个线性区的大小。
5.进程可能创建一个IPC共享线性区来与其他合作进程共享数据。此时,内核给这个进程分配一个新的线性区以实现这个方案。
6.进程可能通过调用类似malloc这样的函数扩展自己的动态堆。结果是,内核可能决定扩展给这个堆所分配的线性区。
数据结构描述
进程描述符task_struct中的mm字段描述了进程地址空间
- struct mm_struct { /* Memory descriptor for one process address space; task_struct's mm field refers to it. */
- struct vm_area_struct * mmap; /* NOTE(review): presumably the head of the VMA list chained through vm_next -- confirm */
- struct rb_root mm_rb; /* root of the rb-tree of VMAs that find_vma() walks */
- struct vm_area_struct * mmap_cache; /* VMA found by the last successful find_vma(); tried first on the next lookup */
- unsigned long (*get_unmapped_area) (struct file *filp, /* hook that chooses a free linear address range; */
- unsigned long addr, unsigned long len, /* installed by arch_pick_mmap_layout(), */
- unsigned long pgoff, unsigned long flags); /* invoked from get_unmapped_area() */
- void (*unmap_area) (struct mm_struct *mm, unsigned long addr); /* companion hook, also installed by arch_pick_mmap_layout() */
- unsigned long mmap_base; /* set by arch_pick_mmap_layout() from mmap_legacy_base() or mmap_base() */
- unsigned long task_size; /* NOTE(review): presumably the size of the user address space -- confirm */
- unsigned long cached_hole_size; /* largest hole arch_get_unmapped_area() skipped while scanning; reset to 0 when it restarts */
- unsigned long free_area_cache; /* where arch_get_unmapped_area() resumes searching (end of its last allocation) */
- pgd_t * pgd; /* NOTE(review): presumably the page global directory of this address space -- confirm */
- atomic_t mm_users; /* number of lightweight processes sharing this mm_struct */
- atomic_t mm_count; /* primary usage counter: all mm_users together count as one; the descriptor is released when it hits 0 */
- int map_count; /* checked against sysctl_max_map_count in do_mmap_pgoff(); presumably the number of VMAs */
- struct rw_semaphore mmap_sem; /* NOTE(review): presumably protects the VMA list/tree -- confirm */
- spinlock_t page_table_lock; /* NOTE(review): presumably protects the page tables -- confirm */
-
- struct list_head mmlist; /* list linkage; the list it belongs to is not visible in this excerpt */
-
- /* Resident-set counters (by name: file-backed and anonymous pages) -- semantics not visible here. */
- mm_counter_t _file_rss;
- mm_counter_t _anon_rss;
-
- unsigned long hiwater_rss; /* NOTE(review): presumably high-water mark of RSS -- confirm */
- unsigned long hiwater_vm; /* NOTE(review): presumably high-water mark of virtual memory -- confirm */
-
- unsigned long total_vm, locked_vm, shared_vm, exec_vm; /* page counts; mmap_region() adds to total_vm and locked_vm */
- unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; /* def_flags is OR-ed into the vm_flags of every new mapping in do_mmap_pgoff() */
- unsigned long start_code, end_code, start_data, end_data; /* by name: bounds of the code and data segments */
- unsigned long start_brk, brk, start_stack; /* by name: heap start, current heap end, stack start */
- unsigned long arg_start, arg_end, env_start, env_end; /* by name: bounds of the argument and environment areas */
-
- unsigned long saved_auxv[AT_VECTOR_SIZE];
-
- struct linux_binfmt *binfmt;
-
- cpumask_t cpu_vm_mask;
-
- /* Architecture-specific part of the descriptor (type is defined outside this excerpt). */
- mm_context_t context;
-
-
-
- unsigned int faultstamp;
- unsigned int token_priority;
- unsigned int last_interval;
-
- unsigned long flags;
-
- struct core_state *core_state;
- #ifdef CONFIG_AIO
- spinlock_t ioctx_lock;
- struct hlist_head ioctx_list;
- #endif
- #ifdef CONFIG_MM_OWNER
- /* Only present with CONFIG_MM_OWNER; by name, the task that owns this mm. */
- struct task_struct *owner;
- #endif
-
- #ifdef CONFIG_PROC_FS
- /* Only present with CONFIG_PROC_FS. */
- struct file *exe_file;
- unsigned long num_exe_file_vmas; /* NOTE(review): presumably maintained by added_exe_file_vma(), which mmap_region() calls for VM_EXECUTABLE -- confirm */
- #endif
- #ifdef CONFIG_MMU_NOTIFIER
- struct mmu_notifier_mm *mmu_notifier_mm;
- #endif
- };
关于mm_users字段和mm_count字段
mm_users字段存放共享mm_struct数据结构的轻量级进程的个数。mm_count字段是内存描述符的主使用计数器,在mm_users次使用计数器中的所有用户在mm_count中只作为一个单位,每当mm_count递减时,内核都要检查它是否变为0,如果是,就要释放这个内存描述符,因为不再有用户使用它。
用一个例子解释mm_users和mm_count之间的不同。考虑一个内存描述符由两个轻量级进程共享。它的mm_users字段通常存放的值为2,而mm_count字段存放的值为1(两个所有者进程算作一个)。如果要确保内存描述符在一个长操作的中间不被释放,那么,就应该增加mm_users字段而不是mm_count字段的值。最终结果是相同的,因为mm_users的增加确保了mm_count不变为0,即使拥有这个内存描述符的所有轻量级进程全部死亡。
内核线程仅运行在内核态,因此,他们永远不会访问低于TASK_SIZE(等于PAGE_OFFSET,通常为0xc0000000)的地址。与普通进程相反,内核线程不用线性区,因此,内存描述符的很多字段对内核线程是没有意义的。也就是说,当创建内核线程时,内核线程的active_mm共享父进程的mm,但是只使用mm中部分数据与变量。
线性区
linux通过类型为vm_area_struct的对象实现线性区,它的字段为
- struct vm_area_struct { /* Describes one linear region (VMA) of a process address space. */
- struct mm_struct * vm_mm; /* address space this region belongs to (set to current->mm in mmap_region()) */
- unsigned long vm_start; /* first address of the region */
- unsigned long vm_end; /* end address, exclusive: find_vma() treats the region as [vm_start, vm_end) */
-
- /* Next region in the per-mm list; arch_get_unmapped_area() follows it while scanning for a gap. */
- struct vm_area_struct *vm_next;
-
- pgprot_t vm_page_prot; /* page protection, computed from vm_flags by vm_get_page_prot() in mmap_region() */
- unsigned long vm_flags; /* VM_* flags of the region */
-
- struct rb_node vm_rb; /* node linking this region into mm->mm_rb (see rb_entry() in find_vma()) */
-
- /* Linkage for shared mappings; the users of this union are not visible in this excerpt. */
- union {
- struct {
- struct list_head list;
- void *parent;
- struct vm_area_struct *head;
- } vm_set;
-
- struct raw_prio_tree_node prio_tree_node;
- } shared;
-
- /* Anonymous-memory linkage (by name); not used by the code shown in this excerpt. */
- struct list_head anon_vma_node;
- struct anon_vma *anon_vma;
-
- /* Operations table for this region; not referenced by the code shown in this excerpt. */
- const struct vm_operations_struct *vm_ops;
-
- /* Backing-store information. */
- unsigned long vm_pgoff; /* offset in pages (do_mmap() converts with offset >> PAGE_SHIFT) */
- struct file * vm_file; /* mapped file; left NULL by mmap_region() when there is none */
- void * vm_private_data;
- unsigned long vm_truncate_count;
-
- #ifndef CONFIG_MMU
- struct vm_region *vm_region;
- #endif
- #ifdef CONFIG_NUMA
- struct mempolicy *vm_policy;
- #endif
- };
进程所拥有的线性区从来不重叠,并且内核尽力把新分配的线性区与邻接的现有线性区进行合并。如果两个相邻区的访问权限相匹配,就能把他们合并在一起。
操作
线性区的处理
我们以常用的find_vma函数为例,它从rb树中查找第一个结束地址大于指定地址的线性区。其他的函数不再举例。
- struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) /* Return the first VMA of mm with vm_end > addr, or NULL if mm is NULL or no such VMA exists. */
- {
- struct vm_area_struct *vma = NULL;
- if (mm) {
- /* Check the one-entry cache first: the VMA found by the previous lookup. */
- /* On a miss, the rb-tree rooted at mm->mm_rb is searched instead. */
- /* A VMA found by the tree walk need not contain addr: only vm_end > addr */
- /* is guaranteed, so callers must check vm_start themselves. */
- vma = mm->mmap_cache;
-
- /* Cache hit only when addr lies inside [vm_start, vm_end). */
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node * rb_node;
- /* Miss: start again from the root of the tree. */
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
- while (rb_node) {
- struct vm_area_struct * vma_tmp;
- vma_tmp = rb_entry(rb_node, /* recover the VMA that embeds this tree node */
- struct vm_area_struct, vm_rb);
- if (vma_tmp->vm_end > addr) { /* candidate: ends above addr */
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr) /* it actually contains addr: done */
- break;
- rb_node = rb_node->rb_left; /* look for a lower candidate */
- } else
- rb_node = rb_node->rb_right; /* ends at or below addr: go higher */
- }
- /* Remember a successful lookup for the next call. */
- if (vma)
- mm->mmap_cache = vma;
- }
- }
- return vma;
- }
分配线性地址区间
do_mmap函数为当前进程创建并初始化一个新的线性区。不过,分配成功之后,可以把这个新的线性区与进程已有的其他线性区进行合并。
- static inline unsigned long do_mmap(struct file *file, unsigned long addr, /* Byte-offset front end to do_mmap_pgoff(): validates offset and converts it to a page offset. */
- unsigned long len, unsigned long prot,
- unsigned long flag, unsigned long offset)
- {
- unsigned long ret = -EINVAL; /* result when either check below fails */
- /* offset plus the page-aligned length must not wrap around. */
- if ((offset + PAGE_ALIGN(len)) < offset)
- goto out;
- if (!(offset & ~PAGE_MASK)) /* offset must be page-aligned, otherwise ret stays -EINVAL */
- ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); /* hand over the offset in pages */
- out:
- return ret;
- }
我们看do_mmap_pgoff函数做的实际工作
- unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, /* Validate an mmap request for current->mm, pick the address, compute vm_flags, then call mmap_region(). */
- unsigned long len, unsigned long prot, /* Returns the mapped address, or a negative errno value cast to unsigned long. */
- unsigned long flags, unsigned long pgoff)
- {
- struct mm_struct * mm = current->mm;
- struct inode *inode;
- unsigned int vm_flags;
- int error;
- unsigned long reqprot = prot; /* protection as originally requested, passed to security_file_mmap() */
- /* With READ_IMPLIES_EXEC in the personality, a readable mapping also becomes executable, */
- /* unless it is backed by a file on a mount flagged MNT_NOEXEC. */
- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
- prot |= PROT_EXEC;
-
- if (!len) /* zero-length mappings are invalid */
- return -EINVAL;
- /* Without MAP_FIXED, addr is passed through round_hint_to_min() first. */
- if (!(flags & MAP_FIXED))
- addr = round_hint_to_min(addr);
-
- error = arch_mmap_check(addr, len, flags); /* architecture-specific veto */
- if (error)
- return error;
-
- /* Round the length up to whole pages; a wrap to 0 or a length above TASK_SIZE is refused. */
- len = PAGE_ALIGN(len);
- if (!len || len > TASK_SIZE)
- return -ENOMEM;
-
- /* The page offset plus the mapping length in pages must not overflow. */
- if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EOVERFLOW;
-
- /* Refuse once the mm's map_count already exceeds sysctl_max_map_count. */
- if (mm->map_count > sysctl_max_map_count)
- return -ENOMEM;
-
- if (flags & MAP_HUGETLB) {
- struct user_struct *user = NULL;
- if (file) /* MAP_HUGETLB is only accepted here without a caller-supplied file */
- return -EINVAL;
-
- /* Round len up to the default huge page size and back the mapping with a file from hugetlb_file_setup(). */
- len = ALIGN(len, huge_page_size(&default_hstate));
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE);
- if (IS_ERR(file))
- return PTR_ERR(file);
- }
-
- /* Obtain the linear address range for the mapping. */
- /* A result that is not page-aligned is treated as an error code and returned unchanged. */
- addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (addr & ~PAGE_MASK)
- return addr;
-
- /* Build the VMA flags from prot, the mmap flags and the mm's default flags. */
- /* VM_MAYREAD/VM_MAYWRITE/VM_MAYEXEC start out set and may be cleared further down. */
- vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
- mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
- if (flags & MAP_LOCKED)
- if (!can_do_mlock())
- return -EPERM;
-
- /* Locked mapping: already-locked pages plus this request must stay within RLIMIT_MEMLOCK unless the caller has CAP_IPC_LOCK. */
- if (vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- lock_limit >>= PAGE_SHIFT; /* rlimit value converted to pages */
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
-
- inode = file ? file->f_path.dentry->d_inode : NULL;
-
- if (file) {
- switch (flags & MAP_TYPE) {
- case MAP_SHARED:
- if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) /* writable shared mapping needs a file opened for writing */
- return -EACCES;
-
- /* Refuse when IS_APPEND(inode) holds and the file is open for writing. */
- if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
- return -EACCES;
-
- /* Refuse when locks_verify_locked() reports a problem (presumably mandatory locks -- confirm). */
- if (locks_verify_locked(inode))
- return -EAGAIN;
-
- vm_flags |= VM_SHARED | VM_MAYSHARE;
- if (!(file->f_mode & FMODE_WRITE)) /* file not open for writing: drop VM_MAYWRITE and VM_SHARED again */
- vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
- /* fall through: the MAP_PRIVATE checks below also apply to shared file mappings */
- case MAP_PRIVATE:
- if (!(file->f_mode & FMODE_READ)) /* the file must be open for reading */
- return -EACCES;
- if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
- if (vm_flags & VM_EXEC) /* executable mapping from a MNT_NOEXEC mount */
- return -EPERM;
- vm_flags &= ~VM_MAYEXEC;
- }
-
- if (!file->f_op || !file->f_op->mmap) /* the file must provide an mmap method */
- return -ENODEV;
- break;
-
- default:
- return -EINVAL;
- }
- } else {
- switch (flags & MAP_TYPE) {
- case MAP_SHARED:
- /* Shared mapping without a file: the caller's pgoff is ignored. */
- pgoff = 0;
- vm_flags |= VM_SHARED | VM_MAYSHARE;
- break;
- case MAP_PRIVATE:
- /* Private mapping without a file: pgoff is derived from the address. */
- pgoff = addr >> PAGE_SHIFT;
- break;
- default:
- return -EINVAL;
- }
- }
- /* The security and IMA hooks may still veto the mapping. */
- error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
- if (error)
- return error;
- error = ima_file_mmap(file, prot);
- if (error)
- return error;
- /* All checks passed: create the mapping. */
- return mmap_region(file, addr, len, flags, vm_flags, pgoff);
- }
我们通过get_unmapped_area函数获得新的线性地址区间
- unsigned long /* Choose a linear address range for a new mapping; returns the address or a negative errno value. */
- get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
- {
- unsigned long (*get_area)(struct file *, unsigned long,
- unsigned long, unsigned long, unsigned long);
- /* Start with the mm's allocator; a get_unmapped_area method provided by the file's f_op overrides it. */
- get_area = current->mm->get_unmapped_area;
- if (file && file->f_op && file->f_op->get_unmapped_area)
- get_area = file->f_op->get_unmapped_area;
- addr = get_area(file, addr, len, pgoff, flags);
- if (IS_ERR_VALUE(addr)) /* the allocator failed: pass its error value through */
- return addr;
- /* The range [addr, addr + len) must end at or below TASK_SIZE, and addr must be page-aligned. */
- if (addr > TASK_SIZE - len)
- return -ENOMEM;
- if (addr & ~PAGE_MASK)
- return -EINVAL;
- /* The value handed back to the caller is whatever arch_rebalance_pgtables() returns. */
- return arch_rebalance_pgtables(addr, len);
- }
这里只分析不使用文件的情况;与文件相关的情况,留到文件系统部分再来分析
对于内存相关的get_unmapped_area函数在如下函数中设置
- void arch_pick_mmap_layout(struct mm_struct *mm) /* Select the mmap layout for mm: sets mmap_base and installs the get_unmapped_area/unmap_area hooks. */
- {
- if (mmap_is_legacy()) { /* legacy layout: base from mmap_legacy_base(), plain arch_* hooks */
- mm->mmap_base = mmap_legacy_base();
- mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
- } else { /* otherwise: base from mmap_base(), *_topdown hooks */
- mm->mmap_base = mmap_base();
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
- }
- }
我们直接看arch_get_unmapped_area,另一个(arch_get_unmapped_area_topdown)类似。
- unsigned long /* Find a free linear address range of len bytes in current->mm; returns its start address or -ENOMEM. */
- arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
- {
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long start_addr; /* where the current scan began; used to decide whether a retry is worthwhile */
-
- if (len > TASK_SIZE)
- return -ENOMEM;
- /* MAP_FIXED: the caller's address is returned unchanged. */
- if (flags & MAP_FIXED)
- return addr;
- /* A non-zero addr is a hint: use it (page-aligned) if the range fits below TASK_SIZE and does not reach the next VMA. */
- if (addr) {
- addr = PAGE_ALIGN(addr);
-
- vma = find_vma(mm, addr); /* first VMA ending above addr, or NULL */
-
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
- return addr;
- }
- /* Choose where to start scanning: at free_area_cache when the request is larger than the biggest */
- /* hole recorded so far, otherwise at TASK_UNMAPPED_BASE with the recorded hole size reset. */
- if (len > mm->cached_hole_size) {
- start_addr = addr = mm->free_area_cache;
- } else {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- }
-
- full_search:
- /* Walk the VMA list via vm_next, starting at the first VMA ending above addr, looking for a gap of len bytes. */
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* The candidate range would extend past TASK_SIZE. */
- if (TASK_SIZE - len < addr) {
- /* If this scan did not begin at TASK_UNMAPPED_BASE, retry once from there before giving up. */
- if (start_addr != TASK_UNMAPPED_BASE) {
- addr = TASK_UNMAPPED_BASE;
- start_addr = addr;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
-
- if (!vma || addr + len <= vma->vm_start) {
- /* Gap found: remember where the next search may start. */
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start) /* this too-small gap is the largest seen so far */
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end; /* continue just after this VMA */
- }
- }
接着上面的调用mmap_region函数
- unsigned long mmap_region(struct file *file, unsigned long addr, /* Create (or merge into a neighbour) the VMA for [addr, addr + len) in current->mm. */
- unsigned long len, unsigned long flags, /* Returns addr on success or a negative errno value. */
- unsigned int vm_flags, unsigned long pgoff)
- {
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- int correct_wcount = 0; /* set once deny_write_access() succeeded, so i_writecount gets adjusted back later */
- int error;
- struct rb_node **rb_link, *rb_parent;
- unsigned long charged = 0; /* pages charged via security_vm_enough_memory(); uncharged on the error path */
- struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
- /* Clear old mappings: anything overlapping the requested range is unmapped first, */
- /* after which the lookup is repeated. */
- error = -ENOMEM;
- munmap_back:
- /* Locate the insertion point (prev, rb_link, rb_parent) for the new range. */
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
-
- if (vma && vma->vm_start < addr + len) {
- if (do_munmap(mm, addr, len))
- return -ENOMEM;
- goto munmap_back;
- }
-
- /* Ask may_expand_vm() whether the mm may grow by this many pages. */
-
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
- /* MAP_NORESERVE: VM_NORESERVE is set unless the overcommit policy is OVERCOMMIT_NEVER; */
- /* for hugetlb files it is set regardless of that policy. */
- if ((flags & MAP_NORESERVE)) {
-
- if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
- vm_flags |= VM_NORESERVE;
-
-
- if (file && is_file_hugepages(file))
- vm_flags |= VM_NORESERVE;
- }
-
- /* Accountable mapping: charge its pages up front and mark the VMA VM_ACCOUNT. */
- if (accountable_mapping(file, vm_flags)) {
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
-
- /* Try to merge the new range into an existing neighbouring VMA; */
- /* on success no new VMA is allocated and the function skips to the accounting at out. */
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
- if (vma)
- goto out;
-
- /* No merge possible: allocate a zero-filled VMA and fill in its fields. */
-
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
-
- vma->vm_mm = mm;
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_flags = vm_flags;
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
-
- if (file) {
- error = -EINVAL;
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) /* growable regions cannot be file-backed */
- goto free_vma;
- if (vm_flags & VM_DENYWRITE) {
- error = deny_write_access(file);
- if (error)
- goto free_vma;
- correct_wcount = 1;
- }
- vma->vm_file = file;
- get_file(file); /* reference held through vma->vm_file; dropped with fput() on the error path */
- error = file->f_op->mmap(file, vma); /* let the file's own mmap method set up the region */
- if (error)
- goto unmap_and_free_vma;
- if (vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
-
- /* ->mmap() was handed the vma, so re-read start, pgoff and flags in case it changed them. */
- addr = vma->vm_start;
- pgoff = vma->vm_pgoff;
- vm_flags = vma->vm_flags;
-
- }
-
- else if (vm_flags & VM_SHARED) {
- /* Shared mapping without a file: set it up through shmem_zero_setup(). */
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- }
-
- if (vma_wants_writenotify(vma)) /* recompute the page protection as if the mapping were not VM_SHARED */
- vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
- /* Insert the new VMA into the mm at the position found by find_vma_prepare(). */
- vma_link(mm, vma, prev, rb_link, rb_parent);
- file = vma->vm_file; /* use the VMA's file for the accounting below */
-
- /* NOTE(review): presumably reverts the i_writecount change made by deny_write_access() above -- confirm. */
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- out:
- perf_event_mmap(vma);
- /* Account the new pages in the mm statistics. */
- mm->total_vm += len >> PAGE_SHIFT;
- vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
- if (vm_flags & VM_LOCKED) {
- /* mlock_vma_pages_range() returns a negative error (returned to the caller as is) */
- /* or a page count that is subtracted from the pages added to locked_vm. */
- long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
- if (nr_pages < 0)
- return nr_pages;
- mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
- } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
- /* MAP_POPULATE without MAP_NONBLOCK: make the pages present right away. */
- make_pages_present(addr, addr + len);
- return addr;
- /* Error unwinding: each label undoes one more step, falling through to the next. */
- unmap_and_free_vma:
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- vma->vm_file = NULL;
- fput(file);
-
- /* Tear down whatever the failed ->mmap() call may already have mapped in the range. */
- unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- charged = 0; /* skips vm_unacct_memory() below; NOTE(review): presumably unmap_region() already dropped the charge -- confirm */
- free_vma:
- kmem_cache_free(vm_area_cachep, vma);
- unacct_error:
- if (charged)
- vm_unacct_memory(charged);
- return error;
- }
到这里分配线性地址空间就算走完了,主要完成的工作依次是:根据地址和长度在进程地址空间中查找一个尚未被占用的线性区间;如果这个区间可以和当前进程线性地址空间中已有的线性区间合并,则合并之;如果不能合并,则创建一个新的线性区间vma,并把它插入到进程现有的线性地址空间里,作为其线性地址空间的一部分;最后,如果设置了VM_LOCKED,或者指定了MAP_POPULATE且未指定MAP_NONBLOCK,则为该线性区间分配实际的物理页面,并返回基地址。