11.1 do_page_fault(): the core page-fault handling function

The core function of page-fault handling is do_page_fault(); its implementation is architecture-specific.
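
On ARM, do_page_fault() is not called directly from the exception vector. The data-abort entry code calls do_DataAbort(), which uses the fault status register (FSR) to index the fsr_info[] table and dispatch to the registered handler; for translation and permission faults that handler is do_page_fault(). The following is a simplified sketch of that dispatch path, paraphrased from arch/arm/mm/fault.c of a kernel of roughly this vintage, so details may differ from your tree:

asmlinkage void __exception
do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
    const struct fsr_info *inf = fsr_info + fsr_fs(fsr);
    struct siginfo info;

    /* For translation/permission faults, inf->fn is do_page_fault(). */
    if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs))
        return;

    /* The handler could not resolve the fault: report it and signal the task. */
    pr_alert("Unhandled fault: %s (0x%03x) at 0x%08lx\n", inf->name, fsr, addr);
    info.si_signo = inf->sig;
    info.si_errno = 0;
    info.si_code  = inf->code;
    info.si_addr  = (void __user *)addr;
    arm_notify_die("", regs, &info, fsr, 0);
}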

[arch/arm/mm/fault.c]

static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
    struct task_struct *tsk;
    struct mm_struct *mm;
    int fault, sig, code;
    unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

    if (notify_page_fault(regs, fsr))
        return 0;

    tsk = current;
    mm  = tsk->mm;

    /* Enable interrupts if they were enabled in the parent context. */
    if (interrupts_enabled(regs))
        local_irq_enable();

    /*
     * If we're in an interrupt or have no user
     * context, we must not take the fault..
     */
    /* in_atomic() checks whether we are in interrupt context or running with
     * preemption disabled, i.e. in atomic context; if so, jump to the
     * no_context label and let __do_kernel_fault() deal with the fault.
     * If the current task has no struct mm_struct, it is a kernel thread,
     * which also goes to __do_kernel_fault(). In other words, the full
     * VMA-based handling below only serves faults against a user address
     * space; faults taken in atomic context or by kernel threads cannot
     * sleep to bring a page in and must be fixed up (or reported) by
     * __do_kernel_fault(). */
    if (in_atomic() || !mm)
        goto no_context; /* exception-table fixup or kernel oops */

    /* If the fault came from user mode, set FAULT_FLAG_USER */
    if (user_mode(regs))
        flags |= FAULT_FLAG_USER;
    if (fsr & FSR_WRITE)
        flags |= FAULT_FLAG_WRITE;

    /*
     * As per x86, we may deadlock here.  However, since the kernel only
     * validly references user space from well defined areas of the code,
     * we can bug out early if this is from code which shouldn't.
     */
    /* down_read_trylock() tries to take the mm->mmap_sem read lock without
     * sleeping: it returns 1 when the lock was acquired and 0 when it is
     * already held by someone else. In the contended case two situations
     * must be distinguished. A fault from user space may simply call
     * down_read() and sleep until the holder releases the lock. A fault from
     * kernel space may only do so if the faulting instruction (regs->ARM_pc)
     * is found in the exception tables; otherwise jump to no_context. */
    if (!down_read_trylock(&mm->mmap_sem)) {
        /* search_exception_tables() checks whether the faulting kernel PC has
         * a fixup entry, i.e. whether it is a whitelisted user-access site
         * such as copy_from_user(); if not, a kernel-mode fault here is a bug
         * and we must not sleep on mmap_sem. */
        if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
            goto no_context;
retry:
        down_read(&mm->mmap_sem);
    } else {
        /*
         * The above down_read_trylock() might have succeeded in
         * which case, we'll have missed the might_sleep() from
         * down_read()
         */
        might_sleep();
#ifdef CONFIG_DEBUG_VM
        if (!user_mode(regs) &&
            !search_exception_tables(regs->ARM_pc))
            goto no_context;
#endif
    }
    /* __do_page_fault(), analysed right after this function, returns a
     * VM_FAULT_* code (see the list further below). It: 1. checks that addr
     * falls inside a VMA; 2. checks the access type against the VMA
     * permissions; 3. calls handle_mm_fault(). */
    fault = __do_page_fault(mm, addr, fsr, flags, tsk);

    /* If we need to retry but a fatal signal is pending, handle the
     * signal first. We do not need to release the mmap_sem because
     * it would already be released in __lock_page_or_retry in
     * mm/filemap.c. */
    if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
        return 0;

    /*
     * Major/minor page fault accounting is only done on the
     * initial attempt. If we go through a retry, it is extremely
     * likely that the page will be found in page cache at that point.
     */

    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
    if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) {
        if (fault & VM_FAULT_MAJOR) {
            tsk->maj_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
                    regs, addr);
        } else {
            tsk->min_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
                    regs, addr);
        }
        if (fault & VM_FAULT_RETRY) {
            /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
            * of starvation. */
            flags &= ~FAULT_FLAG_ALLOW_RETRY;
            flags |= FAULT_FLAG_TRIED;
            goto retry;
        }
    }

    up_read(&mm->mmap_sem);

    /*
     * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
     */
    /* If none of VM_FAULT_ERROR, VM_FAULT_BADMAP or VM_FAULT_BADACCESS is
     * set, the fault has been handled successfully. */
    if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
        return 0;

    /*
     * If we are in kernel mode at this point, we
     * have no context to handle this fault with.
     */
    /* __do_page_fault() returned an error and we are in kernel mode, so jump
     * to __do_kernel_fault() to handle it. */
    if (!user_mode(regs))
        goto no_context;

    /* VM_FAULT_OOM means the system is out of memory; call
     * pagefault_out_of_memory() to invoke the OOM killer. */
    if (fault & VM_FAULT_OOM) {
        /*
         * We ran out of memory, call the OOM killer, and return to
         * userspace (which will retry the fault, or kill us if we
         * got oom-killed)
         */
        pagefault_out_of_memory();
        return 0;
    }

    if (fault & VM_FAULT_SIGBUS) {
        /*
         * We had some memory, but were unable to
         * successfully fix up this page fault.
         */
        sig = SIGBUS;
        code = BUS_ADRERR;
    } else {
        /*
         * Something tried to access memory that
         * isn't in our memory map..
         */
        sig = SIGSEGV;
        code = fault == VM_FAULT_BADACCESS ?
            SEGV_ACCERR : SEGV_MAPERR;
    }
    /* The kernel cannot repair this fault on behalf of the process, so
     * __do_user_fault() delivers a signal (typically a segmentation fault).
     * The function is shown below. */
    __do_user_fault(tsk, addr, fsr, sig, code, regs);
    return 0;

no_context:
    /* The fault happened in kernel mode. __do_kernel_fault() first tries the
     * exception-table fixup and otherwise reports an Oops; see the function
     * below. */
    __do_kernel_fault(mm, addr, fsr, regs);
    return 0;
}
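
The maj_flt/min_flt counters bumped above are visible from user space through getrusage(). Below is a minimal user-space sketch (written for this walkthrough, not part of the original text) that shows the minor-fault counter jumping when freshly allocated memory is touched for the first time:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>

int main(void)
{
    struct rusage before, after;
    size_t len = 64 * 1024 * 1024;
    char *buf = malloc(len);

    getrusage(RUSAGE_SELF, &before);
    memset(buf, 1, len);               /* first touch: one minor fault per page */
    getrusage(RUSAGE_SELF, &after);

    printf("minor faults: %ld -> %ld, major faults: %ld -> %ld\n",
           before.ru_minflt, after.ru_minflt,
           before.ru_majflt, after.ru_majflt);
    free(buf);
    return 0;
}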

The __do_page_fault() function:

[arch/arm/mm/fault.c]

[do_page_fault()->__do_page_fault()]

static int __kprobes
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
        unsigned int flags, struct task_struct *tsk)
{
    struct vm_area_struct *vma;
    int fault;

    /* Look up the VMA covering the faulting address. find_vma() returns the
     * first VMA whose vm_end lies above addr; if there is none, the address
     * is not part of the process address space, so return VM_FAULT_BADMAP.
     * If the VMA found starts above addr, the address falls in a hole,
     * possibly just below a downward-growing stack, so go to check_stack. */
    vma = find_vma(mm, addr);
    fault = VM_FAULT_BADMAP;
    if (unlikely(!vma))
        goto out;
    if (unlikely(vma->vm_start > addr))
        goto check_stack;

    /*
     * Ok, we have a good vm_area for this
     * memory access, so we can handle it.
     */
good_area:
    /* access_error() checks the access type against the VMA permissions,
     * e.g. a write fault against a VMA without write permission; on a
     * mismatch return VM_FAULT_BADACCESS. */
    if (access_error(fsr, vma)) {
        fault = VM_FAULT_BADACCESS;
        goto out;
    }
    
    /* handle_mm_fault() is the core of the generic fault handling; it is
     * covered later in this section. */
    return handle_mm_fault(mm, vma, addr & PAGE_MASK, flags);

check_stack:
    /* Don't allow expansion below FIRST_USER_ADDRESS */
    if (vma->vm_flags & VM_GROWSDOWN &&
        addr >= FIRST_USER_ADDRESS && !expand_stack(vma, addr))
        goto good_area;
out:
    return fault;
}
Back to do_page_fault().
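
The check_stack path above is what lets the process stack grow on demand: a fault just below a VM_GROWSDOWN VMA calls expand_stack() and then retries as a normal fault. A small user-space sketch (my own illustration, not from the original text) that forces stack growth with large stack frames and reports the stack VMA size before and after:

#include <stdio.h>
#include <string.h>

static void print_vmstk(const char *when)
{
    char line[128];
    FILE *f = fopen("/proc/self/status", "r");

    while (fgets(line, sizeof(line), f))
        if (!strncmp(line, "VmStk:", 6))
            printf("%s %s", when, line);
    fclose(f);
}

static void recurse(int depth)
{
    volatile char frame[32 * 1024];    /* a large frame forces new stack pages */
    size_t i;

    for (i = 0; i < sizeof(frame); i += 4096)
        frame[i] = (char)depth;        /* each untouched page faults -> expand_stack() */
    if (depth > 0)
        recurse(depth - 1);
}

int main(void)
{
    print_vmstk("before:");
    recurse(100);                      /* ~3 MB of extra stack, below the 8 MB default limit */
    print_vmstk("after: ");
    return 0;
}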

VM_FAULT_* fault types

[include/linux/mm.h]

/*
 * Different kinds of faults, as returned by handle_mm_fault().
 * Used to decide whether a process gets delivered SIGBUS or
 * just gets major/minor fault counters bumped up.
 */

#define VM_FAULT_MINOR  0 /* For backwards compat. Remove me quickly. */

#define VM_FAULT_OOM    0x0001
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR  0x0004
#define VM_FAULT_WRITE  0x0008  /* Special case for get_user_pages */
#define VM_FAULT_HWPOISON 0x0010    /* Hit poisoned small page */
#define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
#define VM_FAULT_SIGSEGV 0x0040

#define VM_FAULT_NOPAGE 0x0100  /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200  /* ->fault locked the returned page */
#define VM_FAULT_RETRY  0x0400  /* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800    /* huge page fault failed, fall back to small */

#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */

#define VM_FAULT_ERROR  (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
             VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
             VM_FAULT_FALLBACK)
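
Most of these codes are produced by the generic memory-management code, but a driver or filesystem ->fault handler returns them as well. Below is a minimal sketch of such a handler for kernels of roughly this vintage; my_dev_page() is a hypothetical lookup helper, not a real kernel API, and the ->fault prototype should be checked against your kernel version:

static int my_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    /* my_dev_page() is hypothetical: it finds the page backing vmf->pgoff. */
    struct page *page = my_dev_page(vma->vm_private_data, vmf->pgoff);

    if (!page)
        return VM_FAULT_SIGBUS;   /* fault beyond the backing object */

    get_page(page);               /* reference handed over to the core MM */
    vmf->page = page;             /* the core MM installs the PTE for it */
    return 0;
}

static const struct vm_operations_struct my_vm_ops = {
    .fault = my_vm_fault,
};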

The __do_user_fault() function: delivering a segmentation fault to user space

[do_page_fault()->__do_user_fault()]

/*
 * Something tried to access memory that isn't in our memory map..
 * User mode accesses just cause a SIGSEGV
 */
static void
__do_user_fault(struct task_struct *tsk, unsigned long addr,
        unsigned int fsr, unsigned int sig, int code,
        struct pt_regs *regs)
{
    struct siginfo si;

#ifdef CONFIG_DEBUG_USER
    if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) ||
        ((user_debug & UDBG_BUS)  && (sig == SIGBUS))) {
        printk(KERN_DEBUG "%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n",
               tsk->comm, sig, addr, fsr);
        show_pte(tsk->mm, addr);
        show_regs(regs);
    }
#endif

    tsk->thread.address = addr;
    tsk->thread.error_code = fsr;
    tsk->thread.trap_no = 14;
    si.si_signo = sig;
    si.si_errno = 0;
    si.si_code = code;
    si.si_addr = (void __user *)addr;
    force_sig_info(sig, &si, tsk);
}
Back to do_page_fault().
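
From user space, the signal built by __do_user_fault() can be examined with an SA_SIGINFO handler: si_addr carries the faulting address and si_code distinguishes SEGV_MAPERR (VM_FAULT_BADMAP, address not mapped) from SEGV_ACCERR (VM_FAULT_BADACCESS, permission violation). A small demonstration written for this walkthrough, not taken from the original text:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

static void segv_handler(int sig, siginfo_t *si, void *ctx)
{
    /* Async-signal-safety is ignored here for brevity. */
    printf("SIGSEGV at %p, si_code=%s\n", si->si_addr,
           si->si_code == SEGV_ACCERR ? "SEGV_ACCERR" : "SEGV_MAPERR");
    exit(0);
}

int main(void)
{
    struct sigaction sa;
    char *ro;

    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = segv_handler;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);

    /* Writing to a read-only anonymous mapping is a permission fault
     * (VM_FAULT_BADACCESS), delivered as SIGSEGV with SEGV_ACCERR. */
    ro = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    ro[0] = 1;
    return 0;
}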

The __do_kernel_fault() function

/*
 * Oops.  The kernel tried to access some page that wasn't present.
 */
static void
__do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
          struct pt_regs *regs)
{
    /*
     * Are we prepared to handle this kernel fault?
     */
    if (fixup_exception(regs))
        return;

    /*
     * No handler, we'll have to terminate things with extreme prejudice.
     */
    bust_spinlocks(1);
    pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
         (addr < PAGE_SIZE) ? "NULL pointer dereference" :
         "paging request", addr);

    show_pte(mm, addr);
    die("Oops", regs, fsr);
    bust_spinlocks(0);
    do_exit(SIGKILL);
}
Back to do_page_fault().
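
fixup_exception() is what lets helpers such as copy_from_user() fail gracefully instead of oopsing: the whitelisted user-access instruction has an entry in the exception table, and the fixup redirects execution to code that returns an error, which surfaces in user space as -EFAULT. A quick illustration (my own example, not from the original text):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/dev/zero", O_RDONLY);

    /* The kernel faults while copying into the bogus buffer; the
     * exception-table fixup turns that into -EFAULT instead of an Oops. */
    ssize_t n = read(fd, (void *)1, 16);

    printf("read returned %zd, errno=%d (%s)\n", n, errno, strerror(errno));
    close(fd);
    return 0;
}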

The __handle_mm_fault() function:

The core of handle_mm_fault() is __handle_mm_fault(), implemented in mm/memory.c.

[do_page_fault()->__do_page_fault()->handle_mm_fault()->__handle_mm_fault()]

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 unsigned long address, unsigned int flags)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    if (unlikely(is_vm_hugetlb_page(vma)))
        return hugetlb_fault(mm, vma, address, flags);

    /* pgd_offset(mm, address) returns the PGD (page global directory) entry
     * for address in this process's page tables. */
    pgd = pgd_offset(mm, address);
    /* pud_alloc() returns the PUD entry for address, allocating a PUD table
     * if one is missing; a NULL return means the allocation failed, so
     * report VM_FAULT_OOM. */
    pud = pud_alloc(mm, pgd, address);
    if (!pud)
        return VM_FAULT_OOM;
    /* Likewise for the PMD level */
    pmd = pmd_alloc(mm, pud, address);
    if (!pmd)
        return VM_FAULT_OOM;
    if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
        int ret = VM_FAULT_FALLBACK;
        if (!vma->vm_ops)
            ret = do_huge_pmd_anonymous_page(mm, vma, address,
                    pmd, flags);
        if (!(ret & VM_FAULT_FALLBACK))
            return ret;
    } else {
        pmd_t orig_pmd = *pmd;
        int ret;

        barrier();
        if (pmd_trans_huge(orig_pmd)) {
            unsigned int dirty = flags & FAULT_FLAG_WRITE;

            /*
             * If the pmd is splitting, return and retry the
             * the fault.  Alternative: wait until the split
             * is done, and goto retry.
             */
            if (pmd_trans_splitting(orig_pmd))
                return 0;

            if (pmd_protnone(orig_pmd))
                return do_huge_pmd_numa_page(mm, vma, address,
                                 orig_pmd, pmd);

            if (dirty && !pmd_write(orig_pmd)) {
                ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                              orig_pmd);
                if (!(ret & VM_FAULT_FALLBACK))
                    return ret;
            } else {
                huge_pmd_set_accessed(mm, vma, address, pmd,
                              orig_pmd, dirty);
                return 0;
            }
        }
    }

    /*
     * Use __pte_alloc instead of pte_alloc_map, because we can't
     * run pte_offset_map on the pmd, if an huge pmd could
     * materialize from under us from a different thread.
     */
    /* If no PTE page is attached to this PMD yet, __pte_alloc() allocates one
     * (a 4KB page) and pmd_populate() stores its address in the PMD entry. */
    if (unlikely(pmd_none(*pmd)) &&
        unlikely(__pte_alloc(mm, vma, pmd, address)))
        return VM_FAULT_OOM;
    /* if an huge pmd materialized from under us just retry later */
    if (unlikely(pmd_trans_huge(*pmd)))
        return 0;
    /*
     * A regular pmd is established and it can't morph into a huge pmd
     * from under us anymore at this point because we hold the mmap_sem
     * read mode and khugepaged takes it in write mode. So now it's
     * safe to run pte_offset_map().
     */
    /* Get the PTE entry that maps address */
    pte = pte_offset_map(pmd, address);
    
    /* handle_pte_fault() does the real work; it is analysed below */
    return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
Back to handle_mm_fault().
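
Whether a virtual page already has a present PTE can be observed from user space via /proc/self/pagemap: each 64-bit entry has the "present" flag in bit 63 and the "swapped" flag in bit 62. A small sketch (my own example, assuming 4KB pages, not from the original text) showing that a freshly mmap'ed anonymous page only becomes present after the first touch, i.e. after the fault path above has run:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static int page_present(void *addr)
{
    uint64_t entry = 0;
    int fd = open("/proc/self/pagemap", O_RDONLY);
    off_t off = ((uintptr_t)addr / 4096) * sizeof(entry);

    pread(fd, &entry, sizeof(entry), off);
    close(fd);
    return (int)((entry >> 63) & 1);   /* bit 63: page present in RAM */
}

int main(void)
{
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    printf("before touch: present=%d\n", page_present(p));
    p[0] = 1;                          /* first write: the fault path runs do_anonymous_page() */
    printf("after touch:  present=%d\n", page_present(p));
    return 0;
}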

The handle_pte_fault() function:

[do_page_fault()->__do_page_fault()->handle_mm_fault()->__handle_mm_fault()->handle_pte_fault()]

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int handle_pte_fault(struct mm_struct *mm,
             struct vm_area_struct *vma, unsigned long address,
             pte_t *pte, pmd_t *pmd, unsigned int flags)
{
    pte_t entry;
    spinlock_t *ptl;

    /*
     * some architectures can have larger ptes than wordsize,
     * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
     * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
     * The code below just needs a consistent view for the ifs and
     * we later double check anyway with the ptl lock held. So here
     * a barrier will do.
     */
    /* As the comment above notes, some architectures have PTEs wider than the
     * machine word (e.g. ppc44x with CONFIG_PTE_64BIT=y and CONFIG_32BIT=y),
     * so READ_ONCE()/ACCESS_ONCE() cannot guarantee an atomic read. The
     * barrier gives a consistent enough snapshot of the PTE for the checks
     * below; the value is re-checked later under the PTL spinlock. */
    entry = *pte;
    barrier();
    
    /* pte_present() returns 0 when the present bit (L_PTE_PRESENT in the ARM
     * Linux PTE) is clear, i.e. the PTE does not currently map a physical
     * page: this is a "real" page fault. */
    if (!pte_present(entry)) {
        /*
         * (1) If the PTE is empty (pte_none()):
         *     ** for a file-backed VMA, vm_ops->fault is normally set, so
         *        call do_fault();
         *     ** for an anonymous mapping, call do_anonymous_page().
         */
        if (pte_none(entry)) {
            if (vma->vm_ops) {
                if (likely(vma->vm_ops->fault))
                    return do_fault(mm, vma, address, pte,
                            pmd, flags, entry);
            }
            return do_anonymous_page(mm, vma, address,
                         pte, pmd, flags);
        }
        /* (2) If the PTE is not empty but the present bit is clear, the page
         * has been swapped out, so call do_swap_page(). */
        return do_swap_page(mm, vma, address,
                    pte, pmd, flags, entry);
    }

    if (pte_protnone(entry))
        return do_numa_page(mm, vma, address, entry, pte, pmd);
    
    /* From here on the PTE does map a physical page. A typical case is a
     * write to a page whose PTE is read-only, for example memory shared by
     * parent and child after fork(): the write triggers a copy-on-write
     * fault. */
    ptl = pte_lockptr(mm, pmd);
    spin_lock(ptl);
    if (unlikely(!pte_same(*pte, entry)))
        goto unlock;
    /* If the fault was a write and the PTE is read-only, call do_wp_page()
     * to perform copy-on-write and return. */
    if (flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry))
            return do_wp_page(mm, vma, address,
                    pte, pmd, ptl, entry);
        entry = pte_mkdirty(entry);
    }
    /* pte_mkyoung() sets the accessed bit: _PAGE_ACCESSED on x86, or
     * L_PTE_YOUNG in the Linux copy of the PTE on ARM; whether it is written
     * into the hardware page table is decided by set_pte_at(). */
    entry = pte_mkyoung(entry);
    /* If the PTE contents changed, write the new value back and flush the
     * corresponding TLB entry and cache as needed. */
    if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
        update_mmu_cache(vma, address, pte);
    } else {
        /*
         * This is needed only for protection faults but the arch code
         * is not yet telling us if this is a protection fault or not.
         * This still avoids useless tlb flushes for .text page faults
         * with threads.
         */
        if (flags & FAULT_FLAG_WRITE)
            flush_tlb_fix_spurious_fault(vma, address);
    }
unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
}
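
The write branch above leads to do_wp_page(), the copy-on-write path. Its effect is easy to see with a plain fork(): parent and child initially share the same pages read-only, and the first write by either side faults and receives a private copy. A minimal demonstration (my own example, not from the original text):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    /* Private anonymous memory is shared copy-on-write across fork(). */
    int *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    *p = 100;                           /* first touch: do_anonymous_page() */

    if (fork() == 0) {
        *p = 200;                       /* write fault -> do_wp_page() makes a copy */
        printf("child  sees %d\n", *p); /* 200 */
        _exit(0);
    }
    wait(NULL);
    printf("parent sees %d\n", *p);     /* still 100: the child wrote to its own copy */
    return 0;
}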

Source: https://blog.csdn.net/dai_xiangjun/article/details/118863423