This section looks at how page fault exceptions are handled.
The paging mechanism translates every linear (virtual) address into a physical address through the page directory and page tables. The CPU cannot always reach the corresponding physical memory cell, however; when the translation fails, a page fault exception is raised.
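To make the translation path concrete, here is a small user-space sketch (illustrative only, not kernel code) that splits a 64-bit x86-64 virtual address into the four table indices and the page offset used by the hardware walk; the bit widths (9 bits per level, 12-bit offset) are the standard 4 KiB-page, 4-level layout.

#include <stdio.h>
#include <stdint.h>

/* Split a canonical x86-64 virtual address into the indices used by the
 * 4-level page-table walk (PGD -> PUD -> PMD -> PTE) with 4 KiB pages. */
int main(void)
{
    uint64_t addr = 0x00007f1234567abcULL;   /* an arbitrary example address */

    unsigned pgd_idx = (addr >> 39) & 0x1ff; /* bits 47..39 index the PGD  */
    unsigned pud_idx = (addr >> 30) & 0x1ff; /* bits 38..30 index the PUD  */
    unsigned pmd_idx = (addr >> 21) & 0x1ff; /* bits 29..21 index the PMD  */
    unsigned pte_idx = (addr >> 12) & 0x1ff; /* bits 20..12 index the PTE  */
    unsigned offset  = addr & 0xfff;         /* bits 11..0: offset in page */

    printf("pgd=%u pud=%u pmd=%u pte=%u offset=0x%x\n",
           pgd_idx, pud_idx, pmd_idx, pte_idx, offset);
    return 0;
}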
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
    /* error_code carries the bits describing the cause of the fault */
    struct vm_area_struct *vma;
    struct task_struct *tsk;
    unsigned long address;
    struct mm_struct *mm;
    int fault;
    int write = error_code & PF_WRITE;
    unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
                         (write ? FAULT_FLAG_WRITE : 0);

    tsk = current;                  /* the current process */
    mm = tsk->mm;

    /* Get the faulting address: */
    address = read_cr2();           /* the faulting address is read from CR2 */

    /*
     * Detect and handle instructions that would cause a page fault for
     * both a tracked kernel page and a userspace page.
     */
    if (kmemcheck_active(regs))
        kmemcheck_hide(regs);
    prefetchw(&mm->mmap_sem);

    if (unlikely(kmmio_fault(regs, address)))
        return;

    /*
     * We fault-in kernel-space virtual memory on-demand. The
     * 'reference' page table is init_mm.pgd.
     *
     * NOTE! We MUST NOT take any locks for this case. We may
     * be in an interrupt or a critical region, and should
     * only copy the information from the master page table,
     * nothing more.
     *
     * This verifies that the fault happens in kernel space
     * (error_code & 4) == 0, and that the fault was not a
     * protection error (error_code & 9) == 0.
     */
    if (unlikely(fault_in_kernel_space(address))) {
        /* the address lies above the user address range: a vmalloc fault */
        if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
            if (vmalloc_fault(address) >= 0)    /* synchronize the page tables */
                return;

            if (kmemcheck_fault(regs, address, error_code))
                return;
        }

        /* Can handle a stale RO->RW TLB: */
        if (spurious_fault(error_code, address))
            return;

        /* kprobes don't want to hook the spurious faults: */
        if (notify_page_fault(regs))
            return;
        /*
         * Don't take the mm semaphore here. If we fixup a prefetch
         * fault we could otherwise deadlock:
         */
        bad_area_nosemaphore(regs, error_code, address);

        return;
    }

    /* kprobes don't want to hook the spurious faults: */
    if (unlikely(notify_page_fault(regs)))
        return;
    /*
     * It's safe to allow irq's after cr2 has been saved and the
     * vmalloc fault has been handled.
     *
     * User-mode registers count as a user access even for any
     * potential system fault or CPU buglet:
     */
    if (user_mode_vm(regs)) {
        local_irq_enable();
        error_code |= PF_USER;
    } else {
        if (regs->flags & X86_EFLAGS_IF)
            local_irq_enable();
    }

    if (unlikely(error_code & PF_RSVD))
        pgtable_bad(regs, error_code, address);

    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

    /*
     * If we're in an interrupt, have no user context or are running
     * in an atomic region then we must not take the fault:
     */
    if (unlikely(in_atomic() || !mm)) {
        /* in an interrupt, with no user context, or inside an atomic
           region: the fault cannot be handled here */
        bad_area_nosemaphore(regs, error_code, address);
        return;
    }

    /*
     * When running in the kernel we expect faults to occur only to
     * addresses in user space.  All other faults represent errors in
     * the kernel and should generate an OOPS.  Unfortunately, in the
     * case of an erroneous fault occurring in a code path which already
     * holds mmap_sem we will deadlock attempting to validate the fault
     * against the address space.  Luckily the kernel only validly
     * references user space from well defined areas of code, which are
     * listed in the exceptions table.
     *
     * As the vast majority of faults will be valid we will only perform
     * the source reference check when there is a possibility of a
     * deadlock. Attempt to lock the address space, if we cannot we then
     * validate the source. If this is invalid we can skip the address
     * space check, thus avoiding the deadlock:
     */
    if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
        if ((error_code & PF_USER) == 0 &&
            !search_exception_tables(regs->ip)) {
            bad_area_nosemaphore(regs, error_code, address);
            return;
        }
retry:
        down_read(&mm->mmap_sem);
    } else {
        /*
         * The above down_read_trylock() might have succeeded in
         * which case we'll have missed the might_sleep() from
         * down_read():
         */
        might_sleep();
    }

    vma = find_vma(mm, address);          /* locate the vma covering the address */
    if (unlikely(!vma)) {                 /* no vma at all: out-of-range access */
        bad_area(regs, error_code, address);
        return;
    }
    if (likely(vma->vm_start <= address)) /* the mapping has been established */
        goto good_area;
    if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { /* not the hole below a stack area */
        bad_area(regs, error_code, address);
        return;
    }
    if (error_code & PF_USER) {           /* the fault happened in user mode */
        /*
         * Accessing the stack below %sp is always a bug.
         * The large cushion allows instructions like enter
         * and pusha to work. ("enter $65535, $31" pushes
         * 32 pointers and then decrements %sp by 65535.)
         */
        if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
            /* the address is not in the neighbourhood of the stack */
            bad_area(regs, error_code, address);
            return;
        }
    }
    if (unlikely(expand_stack(vma, address))) { /* adjust the vma to grow the stack */
        bad_area(regs, error_code, address);
        return;
    }

    /*
     * Ok, we have a good vm_area for this memory access, so
     * we can handle it..
     */
good_area:                                /* a valid vma covers the address */
    if (unlikely(access_error(error_code, vma))) { /* check the access against the vma permissions */
        bad_area_access_error(regs, error_code, address);
        return;
    }

    /*
     * If for any reason at all we couldn't handle the fault,
     * make sure we exit gracefully rather than endlessly redo
     * the fault:
     */
    fault = handle_mm_fault(mm, vma, address, flags); /* allocate/fill the pmd and pte entries */

    if (unlikely(fault & VM_FAULT_ERROR)) {
        mm_fault_error(regs, error_code, address, fault);
        return;
    }

    /*
     * Major/minor page fault accounting is only done on the
     * initial attempt. If we go through a retry, it is extremely
     * likely that the page will be found in page cache at that point.
     */
    if (flags & FAULT_FLAG_ALLOW_RETRY) {
        if (fault & VM_FAULT_MAJOR) {
            tsk->maj_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                          regs, address);
        } else {
            tsk->min_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                          regs, address);
        }
        if (fault & VM_FAULT_RETRY) {
            /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
             * of starvation. */
            flags &= ~FAULT_FLAG_ALLOW_RETRY;
            goto retry;
        }
    }

    check_v8086_mode(regs, address, tsk);

    up_read(&mm->mmap_sem);
}

A few points to note:
(1) From the current process's task_struct we can reach its mm_struct; find_vma() then returns the first vma whose end lies above the given address. If the start of that vma is still above the address, the address falls in a hole between mappings (the space just below the stack, which the process may claim dynamically as the stack grows, is identified by the VM_GROWSDOWN flag in the stack vma's vm_flags);
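The vma list that find_vma() searches is visible from user space through /proc/self/maps: each line corresponds to one vm_area_struct, and the gaps between lines are exactly the "holes" discussed above. A minimal sketch to dump it (standard Linux procfs, nothing beyond that assumed):

#include <stdio.h>

/* Print this process's vma layout; the [stack] line is the VM_GROWSDOWN
 * area that do_page_fault() may extend when a fault lands just below it. */
int main(void)
{
    char line[512];
    FILE *f = fopen("/proc/self/maps", "r");
    if (!f) {
        perror("fopen");
        return 1;
    }
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout);
    fclose(f);
    return 0;
}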
(2) Out-of-range access: user space touching the kernel's address range (a virtual address above the user-space limit); or a stale mapping: mmap() maps a file into memory and munmap() removes the mapping, but the user may still access the region after it has been unmapped. If the vma above the hole is not a stack area, the hole was simply left by an unmapped region. Bit 2 of error_code (PF_USER) records that the fault occurred while the CPU was in user mode; in that case the relevant fields of the process's task_struct are filled in and SIGSEGV is raised — the "Segmentation fault" programmers see so often;
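The "access to an already unmapped region" case is easy to reproduce from user space. The sketch below maps an anonymous page, unmaps it, and then dereferences the stale pointer; the resulting fault takes the bad_area path above and the process dies with SIGSEGV:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    /* Map one anonymous, writable page. */
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    p[0] = 'x';                 /* fine: the mapping exists           */
    munmap(p, 4096);            /* the vma is now gone                */
    p[0] = 'y';                 /* page fault with no vma -> SIGSEGV  */
    printf("never reached\n");
    return 0;
}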
(3) Growing the user stack (as noted above, a downward-growing stack vma carries VM_GROWSDOWN): if the user stack is too small, the fault is really an opportunity to grow it. A push normally stores 4 bytes, but the i386 pusha instruction pushes 32 bytes at once, so the check boundary is %esp - 32; an address below that limit cannot be a legitimate stack access. If this is a normal stack extension, expand_stack() is called to grow the stack. expand_stack(): the task's rlim array records the limits on the various resources; if the expanded size would exceed the stack limit (RLIMIT_STACK), the stack is not grown and -ENOMEM is returned, meaning no more space may be allocated. When expand_stack() returns nonzero (-ENOMEM), do_page_fault() branches to bad_area. expand_stack() only adjusts the vm_area_struct describing the stack; it does not map the newly extended pages to physical memory — that is done in the good_area path;
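The limit that the stack growth is checked against is the ordinary RLIMIT_STACK resource limit, which a process can inspect (or lower) itself. A small sketch using the standard getrlimit() call:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;

    /* Read the current stack size limit; growing the stack vma past
     * rl.rlim_cur makes expand_stack() fail with -ENOMEM, and the fault
     * is then turned into a SIGSEGV via bad_area(). */
    if (getrlimit(RLIMIT_STACK, &rl) != 0) {
        perror("getrlimit");
        return 1;
    }
    printf("stack soft limit: %llu bytes, hard limit: %llu bytes\n",
           (unsigned long long)rl.rlim_cur,
           (unsigned long long)rl.rlim_max);
    return 0;
}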
(4) Growing the user stack, continued: in good_area, the error_code handed over by the exception mechanism is examined further to determine why the translation failed and to act accordingly. Since a stack segment permits writes (bit 1 = 1, bit 0 = 0: a write to a not-present page), access_error() returns 0 and the virtual memory layer's handle_mm_fault() is invoked; pmd_alloc() allocates a page middle directory entry and pte_alloc()/__pte_alloc() prepares a page table for the new mapping. Inside handle_pte_fault() a set_pte() finally installs the entry, and the mapping is in place;
(5) Why a translation can fail: the page directory or page table entry is empty — the mapping was never established or has been torn down; the corresponding physical page is not resident in memory; or the access mode conflicts with the page's permissions, for example writing to a read-only page;
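The last cause — an access that conflicts with the page's permissions — can also be demonstrated from user space. The sketch below makes a page read-only with mprotect() and then writes to it; access_error() sees PF_WRITE against a vma without VM_WRITE, and the process gets SIGSEGV:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    p[0] = 'x';                        /* allowed: the vma is writable    */

    /* Drop write permission: the vma loses VM_WRITE. */
    if (mprotect(p, 4096, PROT_READ) != 0) {
        perror("mprotect");
        return 1;
    }
    p[0] = 'y';                        /* write fault on a read-only vma
                                          -> access_error() -> SIGSEGV    */
    return 0;
}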
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address, int si_code)
{
    struct task_struct *tsk = current;

    /* User mode accesses just cause a SIGSEGV */
    if (error_code & PF_USER) {
        /*
         * It's possible to have interrupts off here:
         */
        local_irq_enable();

        /*
         * Valid to do another page fault here because this one came
         * from user space:
         */
        if (is_prefetch(regs, error_code, address))
            return;

        if (is_errata100(regs, address))
            return;

        if (unlikely(show_unhandled_signals))
            show_signal_msg(regs, error_code, address, tsk);

        /* Kernel addresses are always protection faults: */
        tsk->thread.cr2        = address;
        tsk->thread.error_code = error_code | (address >= TASK_SIZE);
        tsk->thread.trap_no    = 14;

        force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); /* queue SIGSEGV for the task */

        return;
    }

    if (is_f00f_bug(regs, address))
        return;

    no_context(regs, error_code, address);
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address)
{
    __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, int si_code)
{
    struct mm_struct *mm = current->mm;

    /*
     * Something tried to access memory that isn't in our memory map..
     * Fix it, but check if it's kernel or user first..
     */
    up_read(&mm->mmap_sem);

    __bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
    __bad_area(regs, error_code, address, SEGV_MAPERR);
}
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
    return expand_upwards(vma, address);
}
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
    int error;

    if (!(vma->vm_flags & VM_GROWSUP))   /* this vma does not grow upwards */
        return -EFAULT;

    /*
     * We must make sure the anon_vma is allocated
     * so that the anon_vma locking is not a noop.
     */
    if (unlikely(anon_vma_prepare(vma)))
        return -ENOMEM;
    vma_lock_anon_vma(vma);

    /*
     * vma->vm_start/vm_end cannot change under us because the caller
     * is required to hold the mmap_sem in read mode.  We need the
     * anon_vma lock to serialize against concurrent expand_stacks.
     * Also guard against wrapping around to address 0.
     */
    if (address < PAGE_ALIGN(address+4))
        address = PAGE_ALIGN(address+4);
    else {
        vma_unlock_anon_vma(vma);
        return -ENOMEM;
    }
    error = 0;

    /* Somebody else might have raced and expanded it already */
    if (address > vma->vm_end) {
        unsigned long size, grow;

        size = address - vma->vm_start;
        grow = (address - vma->vm_end) >> PAGE_SHIFT;  /* growth in pages */

        error = acct_stack_growth(vma, size, grow);    /* account for and check the growth */
        if (!error) {
            vma->vm_end = address;                     /* the new end address */
            perf_event_mmap(vma);
        }
    }
    vma_unlock_anon_vma(vma);
    khugepaged_enter_vma_merge(vma);
    return error;
}
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
    /* determine more precisely why the access failed */
    if (error_code & PF_WRITE) {                   /* a write fault */
        /* write, present and write, not present: */
        if (unlikely(!(vma->vm_flags & VM_WRITE))) /* the vma does not allow writes */
            return 1;
        return 0;  /* a stack vma is writable, so 0 is returned and
                      handle_mm_fault() will be called */
    }

    /* read, present: */
    if (unlikely(error_code & PF_PROT))            /* a protection fault: still a bad area */
        return 1;

    /* read, not present: */
    if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
        return 1;

    return 0;
}
/*
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                    unsigned long address, unsigned int flags)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    __set_current_state(TASK_RUNNING);

    count_vm_event(PGFAULT);

    /* do counter updates before entering really critical section. */
    check_sync_rss_stat(current);

    if (unlikely(is_vm_hugetlb_page(vma)))
        return hugetlb_fault(mm, vma, address, flags);

    pgd = pgd_offset(mm, address);    /* get the page global directory entry */
    pud = pud_alloc(mm, pgd, address); /* get the page upper directory; folded into the pgd on x86 */
    if (!pud)
        return VM_FAULT_OOM;
    pmd = pmd_alloc(mm, pud, address); /* get the page middle directory */
    if (!pmd)
        return VM_FAULT_OOM;
    if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
        if (!vma->vm_ops)
            return do_huge_pmd_anonymous_page(mm, vma, address,
                                              pmd, flags);
    } else {
        pmd_t orig_pmd = *pmd;
        barrier();
        if (pmd_trans_huge(orig_pmd)) {
            if (flags & FAULT_FLAG_WRITE &&
                !pmd_write(orig_pmd) &&
                !pmd_trans_splitting(orig_pmd))
                return do_huge_pmd_wp_page(mm, vma, address,
                                           pmd, orig_pmd);
            return 0;
        }
    }

    /*
     * Use __pte_alloc instead of pte_alloc_map, because we can't
     * run pte_offset_map on the pmd, if an huge pmd could
     * materialize from under us from a different thread.
     */
    if (unlikely(__pte_alloc(mm, vma, pmd, address)))
        return VM_FAULT_OOM;
    /* if an huge pmd materialized from under us just retry later */
    if (unlikely(pmd_trans_huge(*pmd)))
        return 0;
    /*
     * A regular pmd is established and it can't morph into a huge pmd
     * from under us anymore at this point because we hold the mmap_sem
     * read mode and khugepaged takes it in write mode. So now it's
     * safe to run pte_offset_map().
     */
    pte = pte_offset_map(pmd, address);

    return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
int handle_pte_fault(struct mm_struct *mm,
                     struct vm_area_struct *vma, unsigned long address,
                     pte_t *pte, pmd_t *pmd, unsigned int flags)
{
    /* pte points to the page table entry in question */
    pte_t entry;
    spinlock_t *ptl;

    entry = *pte;
    if (!pte_present(entry)) {       /* the page is not in physical memory */
        if (pte_none(entry)) {       /* no page table entry has been set up yet */
            if (vma->vm_ops) {
                if (likely(vma->vm_ops->fault))
                    return do_linear_fault(mm, vma, address, /* demand paging of a file-backed page */
                                           pte, pmd, flags, entry);
            }
            return do_anonymous_page(mm, vma, address, /* set up a fresh pte for an anonymous page */
                                     pte, pmd, flags);
        }
        if (pte_file(entry))
            return do_nonlinear_fault(mm, vma, address,
                                      pte, pmd, flags, entry);
        return do_swap_page(mm, vma, address, /* the page was swapped out: read it back in */
                            pte, pmd, flags, entry);
    }

    ptl = pte_lockptr(mm, pmd);
    spin_lock(ptl);
    if (unlikely(!pte_same(*pte, entry)))
        goto unlock;
    if (flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry))
            /* the vma grants write permission but the hardware pte does not:
               do_wp_page() makes a private copy of the page and inserts it
               into the process page table (copy on write) */
            return do_wp_page(mm, vma, address,
                              pte, pmd, ptl, entry);
        entry = pte_mkdirty(entry);
    }
    entry = pte_mkyoung(entry);
    if (ptep_set_access_flags(vma, address, pte, entry,
                              flags & FAULT_FLAG_WRITE)) {
        update_mmu_cache(vma, address, pte);
    } else {
        /*
         * This is needed only for protection faults but the arch code
         * is not yet telling us if this is a protection fault or not.
         * This still avoids useless tlb flushes for .text page faults
         * with threads.
         */
        if (flags & FAULT_FLAG_WRITE)
            flush_tlb_fix_spurious_fault(vma, address);
    }
unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
}

A few notes:
(1) do_anonymous_page() sets up a new page table entry and obtains a fresh page frame;
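Demand paging of anonymous memory can be observed through the min_flt counter that do_page_fault() increments: touching freshly mmap'ed anonymous pages raises the minor-fault count reported by getrusage(). A small sketch (the mapping size is arbitrary; transparent huge pages may make the count smaller than the page count):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
    struct rusage before, after;
    size_t len = 64 * 4096;            /* 64 pages, chosen arbitrarily */

    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    getrusage(RUSAGE_SELF, &before);
    memset(p, 0xaa, len);              /* first touch of each page is handled
                                          by do_anonymous_page()           */
    getrusage(RUSAGE_SELF, &after);

    printf("minor faults while touching the pages: %ld\n",
           after.ru_minflt - before.ru_minflt);
    munmap(p, len);
    return 0;
}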
(2) Copy-on-write: the vma grants write permission to the page, but the hardware page table entry does not; do_wp_page() creates a private copy of the page and inserts it into the process's page table (with write permission at the hardware level).
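A quick way to see copy-on-write in action is fork(): parent and child initially share the same hardware-read-only page, and the first write in the child goes through do_wp_page(), which gives the child its own copy, so the parent's data is unchanged. A minimal sketch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static char buf[4096] = "parent data";

int main(void)
{
    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        /* The write faults: the pte is not writable although the vma is,
         * so do_wp_page() copies the page for the child. */
        strcpy(buf, "child data");
        printf("child sees : %s\n", buf);
        exit(0);
    }
    wait(NULL);
    printf("parent sees: %s\n", buf);   /* still "parent data" */
    return 0;
}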
Original article: http://blog.csdn.net/skyuppour/article/details/46011801