This section covers page-fault handling.
The paging mechanism translates every linear (virtual) address into a physical address through the page directory and page tables. Not every access finds a valid translation to a physical memory cell, however; when the translation fails, the CPU raises a page-fault exception.
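Before walking through the handler, it helps to recall what the hardware does with a linear address. The following is only an illustrative sketch, assuming classic 32-bit x86 two-level paging with 4 KiB pages (the usual 10/10/12 bit split; PAE and 64-bit modes add more levels). Nothing in it is taken from the kernel code below.

#include <stdio.h>
#include <stdint.h>

/* Split a 32-bit linear address under classic two-level x86 paging:
 * bits 31..22 index the page directory, bits 21..12 index the page
 * table, bits 11..0 are the offset inside the 4 KiB page. */
int main(void)
{
    uint32_t linear = 0x0804a123u;               /* arbitrary example address */
    unsigned pgd_index = (linear >> 22) & 0x3ff; /* 10 bits */
    unsigned pte_index = (linear >> 12) & 0x3ff; /* 10 bits */
    unsigned offset    = linear & 0xfff;         /* 12 bits */

    printf("pgd index %u, pte index %u, offset 0x%x\n",
           pgd_index, pte_index, offset);
    return 0;
}

The kernel entry point that runs when this translation fails is do_page_fault():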
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
//error_code: hardware error code describing the cause of the fault
struct vm_area_struct *vma;
struct task_struct *tsk;
unsigned long address;
struct mm_struct *mm;
int fault;
int write = error_code & PF_WRITE;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
(write ? FAULT_FLAG_WRITE : 0);
tsk = current; //the current process
mm = tsk->mm;
/* Get the faulting address: */
address = read_cr2(); //read the faulting address from CR2
/*
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
*/
if (kmemcheck_active(regs))
kmemcheck_hide(regs);
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) { //the address is beyond the user-space range, so this may be a vmalloc fault
if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
if (vmalloc_fault(address) >= 0) //synchronize this page table with the master (init_mm) page table
return;
if (kmemcheck_fault(regs, address, error_code))
return;
}
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (notify_page_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address);
return;
}
/* kprobes don't want to hook the spurious faults: */
if (unlikely(notify_page_fault(regs)))
return;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
* User-mode registers count as a user access even for any
* potential system fault or CPU buglet:
*/
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
/*
* If we're in an interrupt, have no user context or are running
* in an atomic region then we must not take the fault:
*/
if (unlikely(in_atomic() || !mm)) { //in interrupt context, without a user context, or inside an atomic region: the fault cannot be handled here
bad_area_nosemaphore(regs, error_code, address);
return;
}
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
* case of an erroneous fault occurring in a code path which already
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a
* deadlock. Attempt to lock the address space, if we cannot we then
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
retry:
down_read(&mm->mmap_sem);
} else {
/*
* The above down_read_trylock() might have succeeded in
* which case we'll have missed the might_sleep() from
* down_read():
*/
might_sleep();
}
vma = find_vma(mm, address); //find the first VMA whose vm_end lies above the address
if (unlikely(!vma)) { //no such VMA at all: out-of-range access
//vma is NULL, report a bad area
bad_area(regs, error_code, address);
return;
}
if (likely(vma->vm_start <= address)) //the address falls inside the VMA: the mapping exists
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { //the hole is not just below a downward-growing stack VMA
bad_area(regs, error_code, address);
return;
}
if (error_code & PF_USER) { //the fault happened while the CPU was in user mode
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
* and pusha to work. ("enter $65535, $31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { //is the address within the cushion just below the stack pointer?
bad_area(regs, error_code, address); //too far below %sp: not a legitimate stack access
return;
}
}
if (unlikely(expand_stack(vma, address))) { //grow the stack VMA so that it covers the faulting address
bad_area(regs, error_code, address);
return;
}
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area: //a valid VMA covers the address
if (unlikely(access_error(error_code, vma))) { //check the access type against the VMA's permissions
bad_area_access_error(regs, error_code, address);
return;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
fault = handle_mm_fault(mm, vma, address, flags); //allocate the pmd/pte levels as needed and fill in the PTE
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
/*
* Major/minor page fault accounting is only done on the
* initial attempt. If we go through a retry, it is extremely
* likely that the page will be found in page cache at that point.
*/
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
regs, address);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
regs, address);
}
if (fault & VM_FAULT_RETRY) {
/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
* of starvation. */
flags &= ~FAULT_FLAG_ALLOW_RETRY;
goto retry;
}
}
check_v8086_mode(regs, address, tsk);
up_read(&mm->mmap_sem);
}
A few notes:
(1) From the current process's task_struct we obtain its mm_struct. find_vma() returns the first VMA whose vm_end lies above the given address; if that VMA's vm_start is also above the address, the address lies in a hole. Whether the hole is the gap just below the stack is indicated by VM_GROWSDOWN in vma->vm_flags (the stack VMA, which can grow downward toward the space handed out dynamically by brk(), carries that flag).
(2) Out-of-range access: user space touching kernel space, or touching a region whose mapping has been removed. mmap() can map a file into memory and munmap() removes the mapping, but the user may still access the unmapped range. If the region above the hole is not the stack but an ordinary mapping, the access is simply invalid. Bit 2 of error_code indicates that the fault happened while the CPU was in user mode; in that case the relevant fields of task_struct are filled in and SIGSEGV is raised, which is the familiar "Segmentation fault" the programmer sees (a small demonstration follows after this list).
(3) User stack expansion: the stack VMA is marked VM_GROWSDOWN, so a fault just below it can turn out to be benign and simply grow the stack. A push normally writes 4 bytes, but i386 also has pusha, which pushes 32 bytes at once, so the plausibility check allows a cushion below %esp (the code above is even more generous, to accommodate enter). An address further below %esp than that cannot be a stack access. A legitimate expansion goes through expand_stack(): task_struct carries an rlim array recording the various resource limits, and if the requested size exceeds the cap RLIMIT_STACK the stack is not grown and -ENOMEM is returned, i.e. expand_stack() returns non-zero and do_page_fault() falls through to bad_area. expand_stack() only enlarges the stack's vm_area_struct; it does not map the newly covered pages to physical memory. That part is done in the good_area path.
(4) Continuing at good_area: the error_code passed in by the exception machinery is examined further to determine why the translation failed, and the appropriate action is taken. For the stack case the segment is writable and the page is simply not present (bit 1 = 1, write; bit 0 = 0, not present), so handle_mm_fault() is called: pmd_alloc() allocates a page middle directory entry and the pte allocation prepares a page table for the new mapping; handle_pte_fault() eventually performs set_pte(), after which the mapping is in place.
(5) Causes of a failed translation: the page-directory or page-table entry is empty because the mapping was never established or has been removed; the corresponding physical page is not resident in memory; or the access conflicts with the page's permissions, for example writing to a read-only page.
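As a small user-space illustration of note (2), here is a minimal sketch (plain mmap/munmap, assuming a Linux system and a 4096-byte page) that maps a page, unmaps it, and then touches the stale pointer; the second access finds no VMA, so do_page_fault() goes through bad_area() and the process gets SIGSEGV.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    /* Map one anonymous, writable page. */
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    p[0] = 'x';            /* fine: the mapping exists */

    munmap(p, 4096);       /* remove the mapping */

    p[0] = 'y';            /* fault: no VMA covers this address any more,
                              so do_page_fault() -> bad_area() -> SIGSEGV
                              ("Segmentation fault") */
    printf("never reached\n");
    return 0;
}

The reporting path used in the handler above is bad_area_nosemaphore()/__bad_area_nosemaphore():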
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int si_code)
{
struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */
if (error_code & PF_USER) {
/*
* It's possible to have interrupts off here:
*/
local_irq_enable();
/*
* Valid to do another page fault here because this one came
* from user space:
*/
if (is_prefetch(regs, error_code, address))
return;
if (is_errata100(regs, address))
return;
if (unlikely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
/* Kernel addresses are always protection faults: */
tsk->thread.cr2 = address;
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
tsk->thread.trap_no = 14;
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); //queue SIGSEGV for the task
return;
}
if (is_f00f_bug(regs, address))
return;
no_context(regs, error_code, address);
}
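The SIGSEGV queued above carries the faulting address and an si_code (SEGV_MAPERR for a missing mapping, SEGV_ACCERR for a permission problem). A minimal user-space sketch, using only the standard sigaction API (nothing here comes from the kernel listing), that catches the signal and prints those fields:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* SA_SIGINFO handler: si_addr is the faulting address, si_code tells
 * whether the mapping was missing (SEGV_MAPERR) or the access was
 * forbidden (SEGV_ACCERR). */
static void on_segv(int sig, siginfo_t *info, void *ctx)
{
    (void)sig; (void)ctx;
    fprintf(stderr, "SIGSEGV at %p, si_code=%d\n", info->si_addr, info->si_code);
    _exit(1);   /* do not return: the faulting instruction would just retry */
}

int main(void)
{
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = on_segv;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);

    *(volatile int *)0 = 1;   /* NULL dereference: no VMA -> SEGV_MAPERR */
    return 0;
}

Back in the kernel, the thin wrappers around __bad_area_nosemaphore() are: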
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int si_code)
{
struct mm_struct *mm = current->mm;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
up_read(&mm->mmap_sem);
__bad_area_nosemaphore(regs, error_code, address, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
__bad_area(regs, error_code, address, SEGV_MAPERR);
}
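/*
 * Note: the expand_stack() listing below is the CONFIG_STACK_GROWSUP
 * variant. On x86, where the user stack grows downward (VM_GROWSDOWN),
 * expand_stack() calls expand_downwards() instead; both paths do their
 * limit checking in acct_stack_growth().
 */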
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
return expand_upwards(vma, address);
}

int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
int error;
if (!(vma->vm_flags & VM_GROWSUP)) //this VMA is not allowed to grow upward
return -EFAULT;
/*
* We must make sure the anon_vma is allocated
* so that the anon_vma locking is not a noop.
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
* Also guard against wrapping around to address 0.
*/
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else {
vma_unlock_anon_vma(vma);
return -ENOMEM;
}
error = 0;
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) { //the region really does need to grow
unsigned long size, grow;
size = address - vma->vm_start;
grow = (address - vma->vm_end) >> PAGE_SHIFT; //number of pages to grow by
error = acct_stack_growth(vma, size, grow); //check limits (e.g. RLIMIT_STACK) and account for the growth
if (!error) {
vma->vm_end = address; //record the new end of the VMA
perf_event_mmap(vma);
}
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma);
return error;
}
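acct_stack_growth() is where the stack resource limit mentioned in note (3) is enforced: if the grown stack would exceed RLIMIT_STACK, the expansion fails with -ENOMEM and do_page_fault() falls through to bad_area. A minimal sketch, using only the standard getrlimit() API, of inspecting that limit from user space:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;

    /* RLIMIT_STACK is the per-process cap that acct_stack_growth()
     * compares the grown stack size against. */
    if (getrlimit(RLIMIT_STACK, &rl) != 0) {
        perror("getrlimit");
        return 1;
    }
    printf("stack soft limit: %llu bytes, hard limit: %llu bytes\n",
           (unsigned long long)rl.rlim_cur,
           (unsigned long long)rl.rlim_max);   /* RLIM_INFINITY prints huge */
    return 0;
}

The permission check performed at good_area is access_error():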
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma) //check whether the access type is allowed by the VMA
{
if (error_code & PF_WRITE) { //a write fault
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE))) //the VMA does not allow writing
return 1;
return 0; //a writable VMA (the stack, for instance) returns 0, so handle_mm_fault() will be called
}
/* read, present: */
if (unlikely(error_code & PF_PROT)) //the page is present but protected: still a bad access
return 1;
/* read, not present: */
if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
return 1;
return 0;
}
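A protection fault is exactly what access_error() screens for: the mapping exists but the access type is not allowed. A minimal sketch (plain mmap/mprotect, assuming a 4096-byte page) that makes a page read-only and then writes to it, so access_error() returns 1 and the process receives SIGSEGV with SEGV_ACCERR:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    p[0] = 'x';                         /* allowed: VM_WRITE is set */

    mprotect(p, 4096, PROT_READ);       /* drop write permission */

    p[0] = 'y';                         /* write to a VMA without VM_WRITE:
                                           access_error() returns 1 and
                                           SIGSEGV (SEGV_ACCERR) is raised */
    return 0;
}

When the access is legal, handle_mm_fault() walks, and where necessary allocates, the page-table hierarchy: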
/*
* By the time we get here, we already hold the mm semaphore
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
pgd = pgd_offset(mm, address); //page global directory entry for this address
pud = pud_alloc(mm, pgd, address); //page upper directory; on 32-bit x86 it folds back into the pgd
if (!pud)
return VM_FAULT_OOM;
pmd = pmd_alloc(mm, pud, address); //page middle directory
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
if (!vma->vm_ops)
return do_huge_pmd_anonymous_page(mm, vma, address,
pmd, flags);
} else {
pmd_t orig_pmd = *pmd;
barrier();
if (pmd_trans_huge(orig_pmd)) {
if (flags & FAULT_FLAG_WRITE &&
!pmd_write(orig_pmd) &&
!pmd_trans_splitting(orig_pmd))
return do_huge_pmd_wp_page(mm, vma, address,
pmd, orig_pmd);
return 0;
}
}
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
if (unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
* from under us anymore at this point because we hold the mmap_sem
* read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
pte = pte_offset_map(pmd, address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
//pte points to the page-table entry (pte_t) for the faulting address
pte_t entry;
spinlock_t *ptl;
entry = *pte;
if (!pte_present(entry)) { //the page is not present in physical memory and has to be brought in
if (pte_none(entry)) { //the entry has never been set up
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault)) //a file-backed mapping provides a fault handler
return do_linear_fault(mm, vma, address, //demand-fault the page through vma->vm_ops->fault
pte, pmd, flags, entry);
}
return do_anonymous_page(mm, vma, address, //anonymous mapping: allocate a fresh page and set up a new PTE
pte, pmd, flags);
}
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
pte, pmd, flags, entry);
//the entry is not empty but the page is not present: it has been swapped out
return do_swap_page(mm, vma, address, //read the page back in from swap space
pte, pmd, flags, entry);
}
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry)) //the VMA allows writing but the hardware PTE does not: copy-on-write
return do_wp_page(mm, vma, address, //do_wp_page() makes a private, writable copy of the page for this process
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, pte);
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
return 0;
}
A few notes:
(1) do_anonymous_page() obtains a fresh page frame and sets up a new page-table entry for an anonymous mapping.
(2) Copy-on-write: the VMA grants write permission to the page, but the hardware PTE does not. do_wp_page() creates a private copy of the page and installs it in the process's page table with write permission at the hardware level (do_swap_page(), by contrast, reads a swapped-out page back into memory). See the sketch below.
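A minimal user-space sketch of that copy-on-write behaviour, using only the standard fork()/wait() APIs (nothing here is taken from the kernel listing): after fork() the private writable pages of parent and child are write-protected in the PTEs, and the first write from either side faults into do_wp_page(), which gives the writer its own copy.

#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    static char buf[4096] = "original";   /* shared copy-on-write after fork() */
    pid_t pid = fork();

    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        /* Child: this write faults, do_wp_page() copies the page,
         * and only the child's copy changes. */
        strcpy(buf, "child's private copy");
        printf("child sees:  %s\n", buf);
        _exit(0);
    }
    wait(NULL);
    /* Parent still sees the original contents in its own page. */
    printf("parent sees: %s\n", buf);
    return 0;
}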
Original article: http://blog.csdn.net/skyuppour/article/details/46011801