码迷,mamicode.com
首页 > 系统相关 > 详细

Linux内核源代码情景分析-内存管理之用户堆栈的扩展

时间:2015-03-01 13:16:01      阅读:305      评论:0      收藏:0      [点我收藏+]

标签:

    在下面几种情况下会发生页面出错异常(也叫缺页中断):

    1、相应的页面目录项或者页面表项为空,也就是该线性地址与物理地址的映射关系尚未建立,或者已经撤销。本文讨论的就是这种情况。

    2、相应的物理页面不在内存中。

    3、指令中规定的访问方式与页面的权限不符,例如企图写一个“只读”的页面。


    首先看下进程地址空间示意图:

技术分享


    假设现在需要调用某个子程序,因此CPU需将返回地址压入堆栈,也就是要将返回地址写入虚拟空间地址为(%esp-4)的地方。可是,在我们这个情景中地址(%esp-4)落入了空洞中,这是尚未映射的地址,因此必然要引起一次页面出错异常。

  

    这里假定CPU的运行已经到达了页面异常服务程序的主体do_page_fault()的入口处。代码如下:

    arch/i386/mm/fault.c

/*
 * Page-fault handler (excerpt from arch/i386/mm/fault.c; the tail of the
 * function, including the targets of most gotos, is elided below).
 *
 * regs:       saved CPU registers from just before the fault.
 * error_code: bit 0 == 0 no page found, 1 protection fault;
 *             bit 1 == 0 read, 1 write;
 *             bit 2 == 0 kernel, 1 user-mode.
 *
 * The visible part looks up the region for the faulting address, grows the
 * stack region if the address fell into the hole just below it, checks the
 * access against the region flags and then calls handle_mm_fault().
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	unsigned long fixup;
	int write;
	siginfo_t info;

	/* get the address */
	__asm__("movl %%cr2,%0":"=r" (address));// address whose mapping failed; %esp-4 in the article's scenario

	tsk = current;// task_struct of the current process

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (address >= TASK_SIZE)
		goto vmalloc_fault;

	mm = tsk->mm;// mm_struct of the current process
	info.si_code = SEGV_MAPERR;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

	down(&mm->mmap_sem);

	vma = find_vma(mm, address);// per the article: first region whose end address is above the given address
	if (!vma)// none found: no region ends above the address, i.e. the address lies above every region
		goto bad_area;
	if (vma->vm_start <= address)// region starts at or below address: a mapping exists, go check why the access failed
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))// address is in a hole below this region; only a VM_GROWSDOWN (stack) region may be grown to cover it
		goto bad_area;
	if (error_code & 4) {// fault happened in user mode
		/*
		 * accessing the stack below %esp is always a bug.
		 * The "+ 32" is there due to some instructions (like
		 * pusha) doing post-decrement on the stack and that
		 * doesn't show up until later..
		 */
		if (address + 32 < regs->esp)// make sure this is a push: a push writes 4 bytes, pusha at most 32 bytes
			goto bad_area;
	}
	if (expand_stack(vma, address))// non-zero return means the stack region could not be grown
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {// in the article's scenario error_code is binary 110, so 110 & 011 = 2
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;// the article's scenario ends up here
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:
		tsk->min_flt++;
		break;
	case 2:
		tsk->maj_flt++;
		break;
	case 0:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up(&mm->mmap_sem);
	return;
        .......
}

    内核的中断/异常响应机制还传过来两个参数。一个是pt_regs结构指针regs,它指向异常发生前夕CPU中各寄存器内容的一份副本;另一个是error_code,它进一步指明映射失败的具体原因。

    error_code:
  bit 0 == 0 means no page found, 1 means protection fault
  bit 1 == 0 means read, 1 means write
bit 2 == 0 means kernel, 1 means user-mode

    此时,error_code为二进制110:bit 2为1表示发生在用户态,bit 1为1表示写操作,bit 0为0表示页面尚未映射。


    expand_stack函数,代码如下:

/*
 * Grow a region downward so that it starts at the page containing `address`.
 *
 * Returns -ENOMEM without touching the region when either
 *   - the enlarged region (vm_end - new start) would exceed the current
 *     task's RLIMIT_STACK rlim_cur, or
 *   - total_vm plus the added pages, converted to bytes, would exceed the
 *     current task's RLIMIT_AS rlim_cur.
 * Otherwise updates vm_start, vm_pgoff, total_vm (and locked_vm for
 * VM_LOCKED regions) and returns 0.
 */
static inline int expand_stack(struct vm_area_struct * vma, unsigned long address)
{
	/* Round the faulting address down to its page boundary. */
	unsigned long new_start = address & PAGE_MASK;
	/* Number of pages between the new and the old region start. */
	unsigned long nr_pages = (vma->vm_start - new_start) >> PAGE_SHIFT;

	if (vma->vm_end - new_start > current->rlim[RLIMIT_STACK].rlim_cur)
		return -ENOMEM;
	if (((vma->vm_mm->total_vm + nr_pages) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

	/* Move the region start down and account for the added pages. */
	vma->vm_start = new_start;
	vma->vm_pgoff -= nr_pages;
	vma->vm_mm->total_vm += nr_pages;
	if (vma->vm_flags & VM_LOCKED)
		vma->vm_mm->locked_vm += nr_pages;

	return 0;
}

    handle_mm_fault函数,代码如下:

/*
 * Locate (allocating where needed) the page-table entry for `address` in
 * `mm` and hand the fault to handle_pte_fault().
 *
 * Returns whatever handle_pte_fault() returns, or -1 when pmd_alloc() or
 * pte_alloc() yields a null pointer.
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
	unsigned long address, int write_access)
{
	/* Pointer to the page-directory entry covering address. */
	pgd_t *dir_entry = pgd_offset(mm, address);
	/* Middle level; the pmd_alloc() shown in this article just casts the pgd pointer. */
	pmd_t *mid_entry = pmd_alloc(dir_entry, address);
	pte_t *table_entry;

	if (!mid_entry)
		return -1;

	/* Pointer to the page-table entry for address. */
	table_entry = pte_alloc(mid_entry, address);
	if (!table_entry)
		return -1;

	return handle_pte_fault(mm, vma, address, write_access, table_entry);
}

    pgd_offset函数,如下:

/* Pointer to the entry for `address` in mm's page directory: base (mm)->pgd plus pgd_index(address). */
#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))

    pmd_alloc函数,如下:

/*
 * Return the middle-level entry for a page-directory entry.
 *
 * In this version nothing is allocated: the pgd pointer itself is returned,
 * reinterpreted as a pmd pointer. `address` is not used. A null pgd
 * triggers BUG().
 */
extern inline pmd_t * pmd_alloc(pgd_t *pgd, unsigned long address)
{
	pmd_t *same_entry = (pmd_t *) pgd;

	if (!pgd)
		BUG();

	return same_entry;
}

    pte_alloc函数,如下:

/*
 * Return a pointer to the page-table entry for `address` under `pmd`,
 * setting up a page table first when the pmd entry is empty.
 *
 * pmd:     page-directory (middle-level) entry covering the address.
 * address: faulting address; reduced below to the entry index within the
 *          page table.
 *
 * Returns the entry pointer, the result of get_pte_slow() when
 * get_pte_fast() yields nothing, or NULL after __handle_bad_pmd() when the
 * pmd entry is bad.
 */
extern inline pte_t * pte_alloc(pmd_t * pmd, unsigned long address)
{
	/* Index of the entry inside the page table. */
	address = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);

	if (pmd_none(*pmd))	/* no page table behind this pmd entry yet */
		goto getnew;	/* go set one up */
	if (pmd_bad(*pmd))
		goto fix;
	/* A page table exists: return the pointer to the entry in it. */
	return (pte_t *)pmd_page(*pmd) + address;
getnew:
{
	/* Try to obtain a page for the new page table. */
	unsigned long page = (unsigned long) get_pte_fast();
	
	if (!page)
		return get_pte_slow(pmd, address);
	/* Make the pmd entry point at the new page table. */
	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(page)));
	return (pte_t *)page + address;
}
fix:
	__handle_bad_pmd(pmd);
	return NULL;
}

    handle_pte_fault函数,如下:

/*
 * Handle a fault on a single page-table entry.
 *
 * Under mm->page_table_lock the entry is read, then:
 *   - not present and empty      -> do_no_page()
 *   - not present but non-empty  -> do_swap_page()
 *   - present, write to a non-writable entry -> do_wp_page()
 *   - otherwise the entry is marked young (and dirty on write),
 *     re-established, and 1 is returned.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	entry = *pte;// snapshot of the page-table entry contents
	if (!pte_present(entry)) {// the entry is not marked present
		/*
		 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		spin_unlock(&mm->page_table_lock);
		if (pte_none(entry))// the entry is empty (the article's scenario)
			return do_no_page(mm, vma, address, write_access, pte);
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
	}

	if (write_access) {
		/*
		 * NOTE(review): page_table_lock is still held on this return
		 * path in the visible code; presumably do_wp_page() releases
		 * it - confirm against its definition.
		 */
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}

    do_no_page函数,如下:

/*
 * Handle a fault on an empty page-table entry (excerpt; the path that goes
 * through vma->vm_ops->nopage is elided below).
 *
 * When the region has no vm_ops or no nopage operation, the fault is
 * passed to do_anonymous_page() and its result is returned.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	struct page * new_page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)// per the article, this condition holds in the stack-expansion scenario
		return do_anonymous_page(mm, vma, page_table, write_access, address);

	.......
	return 2;	/* Major fault */
}

    do_anonymous_page函数,如下:

/*
 * Install a page-table entry for an anonymous page.
 *
 * For a read fault the entry is a write-protected mapping of
 * ZERO_PAGE(addr). For a write fault a page is allocated with
 * GFP_HIGHUSER, cleared, mapped writable and dirty, and mm->rss is
 * incremented.
 *
 * Returns 1 (minor fault) on success, -1 when the page allocation fails.
 */
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
	struct page *fresh_page;
	/* Default: write-protected entry for the zero page. */
	pte_t new_entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));

	if (!write_access)
		goto install;

	/* Write fault (the article's scenario): back the address with a new page. */
	fresh_page = alloc_page(GFP_HIGHUSER);
	if (!fresh_page)
		return -1;
	clear_user_highpage(fresh_page, addr);
	/* Entry for the new page, marked dirty and writable. */
	new_entry = pte_mkwrite(pte_mkdirty(mk_pte(fresh_page, vma->vm_page_prot)));
	mm->rss++;
	flush_page_to_ram(fresh_page);

install:
	/* Store the prepared entry into the page table. */
	set_pte(page_table, new_entry);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, new_entry);
	return 1;	/* Minor fault */
}

    依次返回,从异常处理返回以后,堆栈区已经扩展了,再重新执行一遍以前夭折的那条压栈指令,然后就可以继续往下执行了。对于用户程序来说,这整个过程都是“透明”的,就像什么事也没有发生,而堆栈区间就仿佛从一开始就已经分配好了足够大的空间一样。

Linux内核源代码情景分析-内存管理之用户堆栈的扩展

标签:

原文地址:http://blog.csdn.net/jltxgcy/article/details/44001245

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!