This section covers page-fault handling.
The paging mechanism translates every linear (virtual) address into a physical address through the page directory and page tables. Not every access finds a valid translation to a physical memory cell, however; when the translation fails, the CPU raises a page-fault exception.
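Before walking through the handler, it helps to recall what the hardware does with a linear address. The following is only an illustrative sketch, assuming classic 32-bit x86 two-level paging with 4 KiB pages (the usual 10/10/12 bit split; PAE and 64-bit modes add more levels). Nothing in it is taken from the kernel code below.

#include <stdio.h>
#include <stdint.h>

/* Split a 32-bit linear address under classic two-level x86 paging:
 * bits 31..22 index the page directory, bits 21..12 index the page
 * table, bits 11..0 are the offset inside the 4 KiB page. */
int main(void)
{
    uint32_t linear = 0x0804a123u;               /* arbitrary example address */
    unsigned pgd_index = (linear >> 22) & 0x3ff; /* 10 bits */
    unsigned pte_index = (linear >> 12) & 0x3ff; /* 10 bits */
    unsigned offset    = linear & 0xfff;         /* 12 bits */

    printf("pgd index %u, pte index %u, offset 0x%x\n",
           pgd_index, pte_index, offset);
    return 0;
}

The kernel entry point that runs when this translation fails is do_page_fault():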
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
//error_code: hardware error code describing the cause of the fault
struct vm_area_struct *vma;
struct task_struct *tsk;
unsigned long address;
struct mm_struct *mm;
int fault;
int write = error_code & PF_WRITE;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
(write ? FAULT_FLAG_WRITE : 0);
tsk = current; //the current process
mm = tsk->mm;
/* Get the faulting address: */
address = read_cr2(); //read the faulting address from CR2
/*
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
*/
if (kmemcheck_active(regs))
kmemcheck_hide(regs);
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) { //the address is beyond the user-space range, so this may be a vmalloc fault
if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
if (vmalloc_fault(address) >= 0) //synchronize this page table with the master (init_mm) page table
return;
if (kmemcheck_fault(regs, address, error_code))
return;
}
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (notify_page_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address);
return;
}
/* kprobes don't want to hook the spurious faults: */
if (unlikely(notify_page_fault(regs)))
return;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
* User-mode registers count as a user access even for any
* potential system fault or CPU buglet:
*/
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
/*
* If we're in an interrupt, have no user context or are running
* in an atomic region then we must not take the fault:
*/
if (unlikely(in_atomic() || !mm)) { //in interrupt context, without a user context, or inside an atomic region: the fault cannot be handled here
bad_area_nosemaphore(regs, error_code, address);
return;
}
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
* case of an erroneous fault occurring in a code path which already
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a
* deadlock. Attempt to lock the address space, if we cannot we then
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
retry:
down_read(&mm->mmap_sem);
} else {
/*
* The above down_read_trylock() might have succeeded in
* which case we'll have missed the might_sleep() from
* down_read():
*/
might_sleep();
}
vma = find_vma(mm, address); //find the first VMA whose vm_end lies above the address
if (unlikely(!vma)) { //no such VMA at all: out-of-range access
//vma is NULL, report a bad area
bad_area(regs, error_code, address);
return;
}
if (likely(vma->vm_start <= address)) //the address falls inside the VMA: the mapping exists
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { //the hole is not just below a downward-growing stack VMA
bad_area(regs, error_code, address);
return;
}
if (error_code & PF_USER) { //the fault happened while the CPU was in user mode
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
* and pusha to work. ("enter $65535, $31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { //is the address within the cushion just below the stack pointer?
bad_area(regs, error_code, address); //too far below %sp: not a legitimate stack access
return;
}
}
if (unlikely(expand_stack(vma, address))) { //grow the stack VMA so that it covers the faulting address
bad_area(regs, error_code, address);
return;
}
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area: //a valid VMA covers the address
if (unlikely(access_error(error_code, vma))) { //check the access type against the VMA's permissions
bad_area_access_error(regs, error_code, address);
return;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
fault = handle_mm_fault(mm, vma, address, flags); //allocate the pmd/pte levels as needed and fill in the PTE
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
/*
* Major/minor page fault accounting is only done on the
* initial attempt. If we go through a retry, it is extremely
* likely that the page will be found in page cache at that point.
*/
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
regs, address);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
regs, address);
}
if (fault & VM_FAULT_RETRY) {
/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
* of starvation. */
flags &= ~FAULT_FLAG_ALLOW_RETRY;
goto retry;
}
}
check_v8086_mode(regs, address, tsk);
up_read(&mm->mmap_sem);
}
A few notes:
(1) From the current process's task_struct we obtain its mm_struct. find_vma() returns the first VMA whose vm_end lies above the given address; if that VMA's vm_start is also above the address, the address lies in a hole. Whether the hole is the gap just below the stack is indicated by VM_GROWSDOWN in vma->vm_flags (the stack VMA, which can grow downward toward the space handed out dynamically by brk(), carries that flag).
(2) Out-of-range access: user space touching kernel space, or touching a region whose mapping has been removed. mmap() can map a file into memory and munmap() removes the mapping, but the user may still access the unmapped range. If the region above the hole is not the stack but an ordinary mapping, the access is simply invalid. Bit 2 of error_code indicates that the fault happened while the CPU was in user mode; in that case the relevant fields of task_struct are filled in and SIGSEGV is raised, which is the familiar "Segmentation fault" the programmer sees (a small demonstration follows after this list).
(3) User stack expansion: the stack VMA is marked VM_GROWSDOWN, so a fault just below it can turn out to be benign and simply grow the stack. A push normally writes 4 bytes, but i386 also has pusha, which pushes 32 bytes at once, so the plausibility check allows a cushion below %esp (the code above is even more generous, to accommodate enter). An address further below %esp than that cannot be a stack access. A legitimate expansion goes through expand_stack(): task_struct carries an rlim array recording the various resource limits, and if the requested size exceeds the cap RLIMIT_STACK the stack is not grown and -ENOMEM is returned, i.e. expand_stack() returns non-zero and do_page_fault() falls through to bad_area. expand_stack() only enlarges the stack's vm_area_struct; it does not map the newly covered pages to physical memory. That part is done in the good_area path.
(4) Continuing at good_area: the error_code passed in by the exception machinery is examined further to determine why the translation failed, and the appropriate action is taken. For the stack case the segment is writable and the page is simply not present (bit 1 = 1, write; bit 0 = 0, not present), so handle_mm_fault() is called: pmd_alloc() allocates a page middle directory entry and the pte allocation prepares a page table for the new mapping; handle_pte_fault() eventually performs set_pte(), after which the mapping is in place.
(5) Causes of a failed translation: the page-directory or page-table entry is empty because the mapping was never established or has been removed; the corresponding physical page is not resident in memory; or the access conflicts with the page's permissions, for example writing to a read-only page.
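As a small user-space illustration of note (2), here is a minimal sketch (plain mmap/munmap, assuming a Linux system and a 4096-byte page) that maps a page, unmaps it, and then touches the stale pointer; the second access finds no VMA, so do_page_fault() goes through bad_area() and the process gets SIGSEGV.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    /* Map one anonymous, writable page. */
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    p[0] = 'x';            /* fine: the mapping exists */

    munmap(p, 4096);       /* remove the mapping */

    p[0] = 'y';            /* fault: no VMA covers this address any more,
                              so do_page_fault() -> bad_area() -> SIGSEGV
                              ("Segmentation fault") */
    printf("never reached\n");
    return 0;
}

The reporting path used in the handler above is bad_area_nosemaphore()/__bad_area_nosemaphore():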
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int si_code)
{
struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */
if (error_code & PF_USER) {
/*
* It's possible to have interrupts off here:
*/
local_irq_enable();
/*
* Valid to do another page fault here because this one came
* from user space:
*/
if (is_prefetch(regs, error_code, address))
return;
if (is_errata100(regs, address))
return;
if (unlikely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
/* Kernel addresses are always protection faults: */
tsk->thread.cr2 = address;
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
tsk->thread.trap_no = 14;
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); //queue SIGSEGV for the task
return;
}
if (is_f00f_bug(regs, address))
return;
no_context(regs, error_code, address);
}
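The SIGSEGV queued above carries the faulting address and an si_code (SEGV_MAPERR for a missing mapping, SEGV_ACCERR for a permission problem). A minimal user-space sketch, using only the standard sigaction API (nothing here comes from the kernel listing), that catches the signal and prints those fields:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* SA_SIGINFO handler: si_addr is the faulting address, si_code tells
 * whether the mapping was missing (SEGV_MAPERR) or the access was
 * forbidden (SEGV_ACCERR). */
static void on_segv(int sig, siginfo_t *info, void *ctx)
{
    (void)sig; (void)ctx;
    fprintf(stderr, "SIGSEGV at %p, si_code=%d\n", info->si_addr, info->si_code);
    _exit(1);   /* do not return: the faulting instruction would just retry */
}

int main(void)
{
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = on_segv;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);

    *(volatile int *)0 = 1;   /* NULL dereference: no VMA -> SEGV_MAPERR */
    return 0;
}

Back in the kernel, the thin wrappers around __bad_area_nosemaphore() are: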
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int si_code)
{
struct mm_struct *mm = current->mm;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
up_read(&mm->mmap_sem);
__bad_area_nosemaphore(regs, error_code, address, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
__bad_area(regs, error_code, address, SEGV_MAPERR);
}
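/*
 * Note: the expand_stack() listing below is the CONFIG_STACK_GROWSUP
 * variant. On x86, where the user stack grows downward (VM_GROWSDOWN),
 * expand_stack() calls expand_downwards() instead; both paths do their
 * limit checking in acct_stack_growth().
 */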
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
return expand_upwards(vma, address);
}

int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
int error;
if (!(vma->vm_flags & VM_GROWSUP)) //this VMA is not allowed to grow upward
return -EFAULT;
/*
* We must make sure the anon_vma is allocated
* so that the anon_vma locking is not a noop.
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
* Also guard against wrapping around to address 0.
*/
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else {
vma_unlock_anon_vma(vma);
return -ENOMEM;
}
error = 0;
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) { //the region really does need to grow
unsigned long size, grow;
size = address - vma->vm_start;
grow = (address - vma->vm_end) >> PAGE_SHIFT; //number of pages to grow by
error = acct_stack_growth(vma, size, grow); //check limits (e.g. RLIMIT_STACK) and account for the growth
if (!error) {
vma->vm_end = address; //record the new end of the VMA
perf_event_mmap(vma);
}
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma);
return error;
}
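acct_stack_growth() is where the stack resource limit mentioned in note (3) is enforced: if the grown stack would exceed RLIMIT_STACK, the expansion fails with -ENOMEM and do_page_fault() falls through to bad_area. A minimal sketch, using only the standard getrlimit() API, of inspecting that limit from user space:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;

    /* RLIMIT_STACK is the per-process cap that acct_stack_growth()
     * compares the grown stack size against. */
    if (getrlimit(RLIMIT_STACK, &rl) != 0) {
        perror("getrlimit");
        return 1;
    }
    printf("stack soft limit: %llu bytes, hard limit: %llu bytes\n",
           (unsigned long long)rl.rlim_cur,
           (unsigned long long)rl.rlim_max);   /* RLIM_INFINITY prints huge */
    return 0;
}

The permission check performed at good_area is access_error():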
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma) //check whether the access type is allowed by the VMA
{
if (error_code & PF_WRITE) { //a write fault
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE))) //the VMA does not allow writing
return 1;
return 0; //a writable VMA (the stack, for instance) returns 0, so handle_mm_fault() will be called
}
/* read, present: */
if (unlikely(error_code & PF_PROT)) //the page is present but protected: still a bad access
return 1;
/* read, not present: */
if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
return 1;
return 0;
}
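A protection fault is exactly what access_error() screens for: the mapping exists but the access type is not allowed. A minimal sketch (plain mmap/mprotect, assuming a 4096-byte page) that makes a page read-only and then writes to it, so access_error() returns 1 and the process receives SIGSEGV with SEGV_ACCERR:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    p[0] = 'x';                         /* allowed: VM_WRITE is set */

    mprotect(p, 4096, PROT_READ);       /* drop write permission */

    p[0] = 'y';                         /* write to a VMA without VM_WRITE:
                                           access_error() returns 1 and
                                           SIGSEGV (SEGV_ACCERR) is raised */
    return 0;
}

When the access is legal, handle_mm_fault() walks, and where necessary allocates, the page-table hierarchy: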
/*
* By the time we get here, we already hold the mm semaphore
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
pgd = pgd_offset(mm, address); //page global directory entry for this address
pud = pud_alloc(mm, pgd, address); //page upper directory; on 32-bit x86 it folds back into the pgd
if (!pud)
return VM_FAULT_OOM;
pmd = pmd_alloc(mm, pud, address); //page middle directory
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
if (!vma->vm_ops)
return do_huge_pmd_anonymous_page(mm, vma, address,
pmd, flags);
} else {
pmd_t orig_pmd = *pmd;
barrier();
if (pmd_trans_huge(orig_pmd)) {
if (flags & FAULT_FLAG_WRITE &&
!pmd_write(orig_pmd) &&
!pmd_trans_splitting(orig_pmd))
return do_huge_pmd_wp_page(mm, vma, address,
pmd, orig_pmd);
return 0;
}
}
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
if (unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
* from under us anymore at this point because we hold the mmap_sem
* read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
pte = pte_offset_map(pmd, address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
//pte points to the page-table entry (pte_t) for the faulting address
pte_t entry;
spinlock_t *ptl;
entry = *pte;
if (!pte_present(entry)) { //the page is not present in physical memory and has to be brought in
if (pte_none(entry)) { //the entry has never been set up
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault)) //a file-backed mapping provides a fault handler
return do_linear_fault(mm, vma, address, //demand-fault the page through vma->vm_ops->fault
pte, pmd, flags, entry);
}
return do_anonymous_page(mm, vma, address, //anonymous mapping: allocate a fresh page and set up a new PTE
pte, pmd, flags);
}
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
pte, pmd, flags, entry);
//the entry is not empty but the page is not present: it has been swapped out
return do_swap_page(mm, vma, address, //read the page back in from swap space
pte, pmd, flags, entry);
}
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry)) //the VMA allows writing but the hardware PTE does not: copy-on-write
return do_wp_page(mm, vma, address, //do_wp_page() makes a private, writable copy of the page for this process
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, pte);
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
return 0;
}
A few notes:
(1) do_anonymous_page() obtains a fresh page frame and sets up a new page-table entry for an anonymous mapping.
(2) Copy-on-write: the VMA grants write permission to the page, but the hardware PTE does not. do_wp_page() creates a private copy of the page and installs it in the process's page table with write permission at the hardware level (do_swap_page(), by contrast, reads a swapped-out page back into memory). See the sketch below.
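A minimal user-space sketch of that copy-on-write behaviour, using only the standard fork()/wait() APIs (nothing here is taken from the kernel listing): after fork() the private writable pages of parent and child are write-protected in the PTEs, and the first write from either side faults into do_wp_page(), which gives the writer its own copy.

#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    static char buf[4096] = "original";   /* shared copy-on-write after fork() */
    pid_t pid = fork();

    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        /* Child: this write faults, do_wp_page() copies the page,
         * and only the child's copy changes. */
        strcpy(buf, "child's private copy");
        printf("child sees:  %s\n", buf);
        _exit(0);
    }
    wait(NULL);
    /* Parent still sees the original contents in its own page. */
    printf("parent sees: %s\n", buf);
    return 0;
}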
Original article: http://blog.csdn.net/skyuppour/article/details/46011801