diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fb0d3db162c20fba4dbc4b955cfaa086f6379b7a..66a3b6dc96ebb5fb51df5b3f5f6ad17923296386 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -915,7 +915,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * to keep reservation accounting consistent. */ hugetlb_set_vma_policy(&pseudo_vma, inode, index); - folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, false); hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 63d225523ecf1de0619a1d7c0c4b0dd0479dec30..61ccbc81b341954d2f699abd20c8c5829cb21a11 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -798,7 +798,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); + unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, @@ -1118,7 +1118,7 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, - int avoid_reserve) + bool cow_from_owner) { return NULL; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index abe236201e68fdf2e8cc029499962e6684f5ee85..519d1682827e1c16efea73d7327f10072d7a14d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -818,6 +818,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); +int 
mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1413,6 +1415,11 @@ static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, return 0; } +static inline int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) +{ + return 0; +} + static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a80b431a8f72fb7aa199a11b2620b552ff3e3acd..524f6d737d607b6f707ca5530a1e1ff64fd210a2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1254,69 +1254,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) hugetlb_dup_vma_private(vma); } -/* Returns true if the VMA has associated reserve pages */ -static bool vma_has_reserves(struct vm_area_struct *vma, long chg) -{ - if (vma->vm_flags & VM_NORESERVE) { - /* - * This address is already reserved by other process(chg == 0), - * so, we should decrement reserved count. Without decrementing, - * reserve count remains after releasing inode, because this - * allocated page will go into page cache and is regarded as - * coming from reserved pool in releasing step. Currently, we - * don't have any other solution to deal with this situation - * properly, so add work-around here. - */ - if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return true; - else - return false; - } - - /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) { - /* - * We know VM_NORESERVE is not set. Therefore, there SHOULD - * be a region map for all pages. The only situation where - * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reserves to - * use. This situation is indicated if chg != 0. 
- */ - if (chg) - return false; - else - return true; - } - - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Like the shared case above, a hole punch or truncate - * could have been performed on the private mapping. - * Examine the value of chg to determine if reserves - * actually exist or were previously consumed. - * Very Subtle - The value of chg comes from a previous - * call to vma_needs_reserves(). The reserve map for - * private mappings has different (opposite) semantics - * than that of shared mappings. vma_needs_reserves() - * has already taken this difference in semantics into - * account. Therefore, the meaning of chg is the same - * as in the shared case above. Code could easily be - * combined, but keeping it separate draws attention to - * subtle differences. - */ - if (chg) - return false; - else - return true; - } - - return false; -} - void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); @@ -1397,8 +1334,7 @@ static unsigned long available_huge_pages(struct hstate *h) static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve, - long chg) + unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; @@ -1407,15 +1343,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, int nid; /* - * A child process with MAP_PRIVATE mappings created by their parent - * have no page reserves. This check ensures that reservations are - * not "stolen". The child may still get SIGKILLed + * gbl_chg==1 means the allocation requires a new page that was not + * reserved before. Making sure there's at least one free page. 
*/ - if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) - goto err; - - /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && !available_huge_pages(h)) + if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1433,11 +1364,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } - mpol_cond_put(mpol); return folio; @@ -3138,70 +3064,92 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } +typedef enum { + /* + * For either 0/1: we checked the per-vma resv map, and one resv + * count either can be reused (0), or an extra one is needed (1). + */ + MAP_CHG_REUSE = 0, + MAP_CHG_NEEDED = 1, + /* + * The per-vma resv count cannot be used, hence a new resv + * count is enforced. + * + * NOTE: This is mostly identical to MAP_CHG_NEEDED, except + * that currently vma_needs_reservation() has an unwanted side + * effect to either use end() or commit() to complete the + * transaction. Hence it needs to differentiate from NEEDED. + */ + MAP_CHG_ENFORCED = 2, +} map_chg_state; + +/* + * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW + * faults of hugetlb private mappings on top of a non-page-cache folio (in + * which case even if there's a private vma resv map it won't cover such + * allocation). New call sites should (probably) never set it to true!! + * When it's set, the allocation will bypass all vma level reservations. 
+ */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) + unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hugetlbfs_inode_info *info = HUGETLBFS_I(file_inode(vma->vm_file)); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit, nr_pages = pages_per_huge_page(h); - long gbl_chg; - int memcg_charge_ret, ret, idx; + long retval, gbl_chg; + map_chg_state map_chg; + int ret, idx; struct hugetlb_cgroup *h_cg = NULL; - struct mem_cgroup *memcg; - bool deferred_reserve; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; - memcg = get_mem_cgroup_from_current(); - memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); - if (memcg_charge_ret == -ENOMEM) { - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); - } - idx = hstate_index(h); - /* - * Examine the region/reserve map to determine if the process - * has a reservation for the page to be allocated. A return - * code of zero indicates a reservation exists (no change). - */ - map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) { - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); + + /* Whether we need a separate per-vma reservation? */ + if (cow_from_owner) { + /* + * Special case! Since it's a CoW on top of a reserved + * page, the private resv map doesn't count. So it cannot + * consume the per-vma resv map even if it's reserved. + */ + map_chg = MAP_CHG_ENFORCED; + } else { + /* + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. A return + * code of zero indicates a reservation exists (no change). + */ + retval = vma_needs_reservation(h, vma, addr); + if (retval < 0) + return ERR_PTR(-ENOMEM); + map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE; } /* + * Whether we need a separate global reservation? 
+ * * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. - * Allocations for MAP_NORESERVE mappings also need to be - * checked against any subpool limit. + * Or if it can get one from the pool reservation directly. */ - if (map_chg || avoid_reserve) { + if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1, info); if (gbl_chg < 0) goto out_end_reservation; - + } else { /* - * Even though there was no reservation in the region/reserve - * map, there could be reservations associated with the - * subpool that can be used. This would be indicated if the - * return value of hugepage_subpool_get_pages() is zero. - * However, if avoid_reserve is specified we still avoid even - * the subpool reservations. + * If we have the vma reservation ready, no need for extra + * global reservation. */ - if (avoid_reserve) - gbl_chg = 1; + gbl_chg = 0; } - /* If this allocation is not consuming a reservation, charge it now. + /* + * If this allocation is not consuming a per-vma reservation, + * charge the hugetlb cgroup now. */ - deferred_reserve = map_chg || avoid_reserve; - if (deferred_reserve) { + if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) @@ -3215,7 +3163,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (file_in_dynamic_pool(info)) { bool reserved = false; - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) + if (!gbl_chg) reserved = true; folio = dynamic_pool_alloc_hugepage(info, h, reserved); if (!folio) @@ -3231,28 +3179,33 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. 
*/ - folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } + /* + * Either dequeued or buddy-allocated folio needs to add special + * mark to the folio when it consumes a global reservation. + */ + if (!gbl_chg) { + folio_set_hugetlb_restore_reserve(folio); + h->resv_huge_pages--; + } + out: hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ - if (deferred_reserve) { + if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } @@ -3262,49 +3215,60 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (!page_from_dynamic_pool(folio_page(folio, 0))) hugetlb_set_folio_subpool(folio, spool); - map_commit = vma_commit_reservation(h, vma, addr); - if (unlikely(map_chg > map_commit)) { + if (map_chg != MAP_CHG_ENFORCED) { + /* commit() is only needed if the map_chg is not enforced */ + retval = vma_commit_reservation(h, vma, addr); /* + * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND - * in hugetlb_reserve_pages for the same page. Also, + * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. 
*/ - long rsv_adjust; + if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { + long rsv_adjust; - rsv_adjust = hugepage_subpool_put_pages(spool, 1, info); - hugetlb_acct_memory(h, -rsv_adjust, info); - if (deferred_reserve) { - spin_lock_irq(&hugetlb_lock); - hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), - pages_per_huge_page(h), folio); - spin_unlock_irq(&hugetlb_lock); + rsv_adjust = hugepage_subpool_put_pages(spool, 1, info); + hugetlb_acct_memory(h, -rsv_adjust, info); + if (map_chg) { + spin_lock_irq(&hugetlb_lock); + hugetlb_cgroup_uncharge_folio_rsvd( + hstate_index(h), pages_per_huge_page(h), + folio); + spin_unlock_irq(&hugetlb_lock); + } } } - if (!memcg_charge_ret) - mem_cgroup_commit_charge(folio, memcg); - mem_cgroup_put(memcg); + ret = mem_cgroup_charge_hugetlb(folio, gfp); + /* + * NOTE(review): no NR_HUGETLB update is visible in this hunk; confirm + * whether one is intended. If mem_cgroup_charge_hugetlb() failed with + * -ENOMEM, free the folio immediately and return the error. + */ + + if (ret == -ENOMEM) { + free_huge_folio(folio); + return ERR_PTR(-ENOMEM); + } return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: - if (deferred_reserve) + if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg || avoid_reserve) + if (map_chg) hugepage_subpool_put_pages(spool, 1, info); out_end_reservation: - vma_end_reservation(h, vma, addr); - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); + if (map_chg != MAP_CHG_ENFORCED) + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -5359,7 +5323,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { 
folio_put(pte_folio); ret = PTR_ERR(new_folio); @@ -5774,18 +5738,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags, - struct folio *pagecache_folio, spinlock_t *ptl) +static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, + struct vm_fault *vmf) { - const bool unshare = flags & FAULT_FLAG_UNSHARE; - pte_t pte = huge_ptep_get(ptep); + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + pte_t pte = huge_ptep_get(vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; - int outside_reserve = 0; + bool cow_from_owner = 0; vm_fault_t ret = 0; - unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; /* @@ -5808,7 +5772,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* Let's take out MAP_SHARED mappings first. 
*/ if (vma->vm_flags & VM_MAYSHARE) { - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_writable(vma, vmf->address, vmf->pte); return 0; } @@ -5827,7 +5791,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_writable(vma, vmf->address, vmf->pte); delayacct_wpcopy_end(); return 0; @@ -5846,7 +5810,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) - outside_reserve = 1; + cow_from_owner = true; folio_get(old_folio); @@ -5854,8 +5818,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. */ - spin_unlock(ptl); - new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); + spin_unlock(vmf->ptl); + new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* @@ -5865,7 +5829,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ - if (outside_reserve) { + if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; @@ -5880,19 +5844,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * * Reacquire both after unmap operation. 
*/ - idx = vma_hugecache_offset(h, vma, haddr); + idx = vma_hugecache_offset(h, vma, vmf->address); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, &old_folio->page, haddr); + unmap_ref_private(mm, vma, &old_folio->page, + vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, + huge_page_size(h)); + if (likely(vmf->pte && + pte_same(huge_ptep_get(vmf->pte), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table @@ -5910,42 +5876,42 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; + ret = __vmf_anon_prepare(vmf); + if (unlikely(ret)) goto out_release_all; - } - if (copy_user_large_folio(new_folio, old_folio, address, vma)) { + if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { ret = VM_FAULT_HWPOISON_LARGE; goto out_release_all; } __folio_mark_uptodate(new_folio); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, - haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, + vmf->address + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); + if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) { pte_t newpte = 
make_huge_pte(vma, &new_folio->page, !unshare); /* Break COW or unshare */ - huge_ptep_clear_flush(vma, haddr, ptep); + huge_ptep_clear_flush(vma, vmf->address, vmf->pte); hugetlb_remove_rmap(old_folio); - hugetlb_add_new_anon_rmap(new_folio, vma, haddr); + hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); - set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, + huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } - spin_unlock(ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* @@ -5953,12 +5919,12 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * unshare) */ if (new_folio != old_folio) - restore_reserve_on_error(h, vma, haddr, new_folio); + restore_reserve_on_error(h, vma, vmf->address, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); - spin_lock(ptl); /* Caller expects lock to be held */ + spin_lock(vmf->ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; @@ -6010,39 +5976,21 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping return 0; } -static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, +static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, struct address_space *mapping, - pgoff_t idx, - unsigned int flags, - unsigned long haddr, - unsigned long addr, unsigned long reason) { u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = addr, - .flags = flags, - - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. 
Also mmap_lock could be dropped due to handling * userfault, any vma operation should be careful from here. */ - hugetlb_vma_unlock_read(vma); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vmf->vma); + hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, reason); + return handle_userfault(vmf, reason); } /* @@ -6062,22 +6010,19 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, return same; } -static vm_fault_t hugetlb_no_page(struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, - pte_t old_pte, unsigned int flags) +static vm_fault_t hugetlb_no_page(struct address_space *mapping, + struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct folio *folio; pte_t new_pte; - spinlock_t *ptl; - unsigned long haddr = address & huge_page_mask(h); bool new_folio, new_pagecache_folio = false; - u32 hash = hugetlb_fault_mutex_hash(mapping, idx); + u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the @@ -6096,10 +6041,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. */ new_folio = false; - folio = filemap_lock_hugetlb_folio(h, mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) + if (vmf->pgoff >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { @@ -6120,17 +6065,22 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. 
*/ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING); } - folio = alloc_hugetlb_folio(vma, haddr, 0); + if (!(vma->vm_flags & VM_MAYSHARE)) { + ret = __vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; + } + + folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6144,18 +6094,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - if (hugetlb_pte_stable(h, mm, ptep, old_pte)) + if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + clear_huge_page(&folio->page, vmf->real_address, + pages_per_huge_page(h)); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(folio, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, + vmf->pgoff); if (err) { /* * err can't be -EEXIST which implies someone @@ -6164,17 +6116,15 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. 
*/ - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, + folio); folio_put(folio); + ret = VM_FAULT_SIGBUS; goto out; } new_pagecache_folio = true; } else { folio_lock(folio); - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; - goto backout_unlocked; - } anon_rmap = 1; } } else { @@ -6194,12 +6144,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MINOR); } } @@ -6210,23 +6159,23 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * any allocations necessary to record that reservation occur outside * the spinlock. */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if (vma_needs_reservation(h, vma, vmf->address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf->address); } - ptl = huge_pte_lock(h, mm, ptep); + vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ - if (!pte_same(huge_ptep_get(ptep), old_pte)) + if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte)) goto backout; if (anon_rmap) - hugetlb_add_new_anon_rmap(folio, vma, haddr); + hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) @@ -6235,17 +6184,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * If this pte was previously wr-protected, keep it 
wr-protected even * if populated. */ - if (unlikely(pte_marker_uffd_wp(old_pte))) + if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); - set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); + ret = hugetlb_wp(folio, vmf); } - spin_unlock(ptl); + spin_unlock(vmf->ptl); /* * Only set hugetlb_migratable in newly allocated pages. Existing pages @@ -6258,14 +6207,22 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); + + /* + * We must check to release the per-VMA lock. __vmf_anon_prepare() is + * the only way ret can be set to VM_FAULT_RETRY. 
+ */ + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: - spin_unlock(ptl); + spin_unlock(vmf->ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); folio_put(folio); @@ -6299,23 +6256,27 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep, entry; - spinlock_t *ptl; vm_fault_t ret; u32 hash; - pgoff_t idx; struct folio *folio = NULL; struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; - unsigned long haddr = address & huge_page_mask(h); + struct vm_fault vmf = { + .vma = vma, + .address = address & huge_page_mask(h), + .real_address = address, + .flags = flags, + .pgoff = vma_hugecache_offset(h, vma, + address & huge_page_mask(h)), + /* TODO: Track hugetlb faults using vm_fault */ - /* TODO: Handle faults under the VMA lock */ - if (flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + /* + * Some fields may not be initialized, be careful as it may + * be hard to debug if called functions make assumptions + */ + }; /* * Serialize hugepage allocation and instantiation, so that we don't @@ -6323,28 +6284,27 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Acquire vma lock before calling huge_pte_alloc and hold - * until finished with ptep. 
This prevents huge_pmd_unshare from - * being called elsewhere and making the ptep no longer valid. + * until finished with vmf.pte. This prevents huge_pmd_unshare from + * being called elsewhere and making the vmf.pte no longer valid. */ hugetlb_vma_lock_read(vma); - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) { + vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); + if (!vmf.pte) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } - entry = huge_ptep_get(ptep); - if (huge_pte_none_mostly(entry)) { - if (is_pte_marker(entry)) { + vmf.orig_pte = huge_ptep_get(vmf.pte); + if (huge_pte_none_mostly(vmf.orig_pte)) { + if (is_pte_marker(vmf.orig_pte)) { pte_marker marker = - pte_marker_get(pte_to_swp_entry(entry)); + pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { ret = VM_FAULT_HWPOISON_LARGE; @@ -6358,21 +6318,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ - return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, - entry, flags); + return hugetlb_no_page(mapping, &vmf); } ret = 0; /* - * entry could be a migration/hwpoison entry at this point, so this - * check prevents the kernel from going below assuming that we have - * an active hugepage in pagecache. This goto expects the 2nd page - * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will - * properly handle it. + * vmf.orig_pte could be a migration/hwpoison entry at this + * point, so this check prevents the kernel from going below assuming + * that we have an active hugepage in pagecache. This goto expects + * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) + * check will properly handle it. 
*/ - if (!pte_present(entry)) { - if (unlikely(is_hugetlb_entry_migration(entry))) { + if (!pte_present(vmf.orig_pte)) { + if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6381,9 +6340,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); - migration_entry_wait_huge(vma, ptep); + migration_entry_wait_huge(vma, vmf.pte); return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; @@ -6397,36 +6356,30 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * determine if a reservation has been consumed. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { + if (vma_needs_reservation(h, vma, vmf.address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf.address); - pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); + pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, + vmf.pgoff); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } - ptl = huge_pte_lock(h, mm, ptep); + vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ - if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && - (flags & FAULT_FLAG_WRITE) 
&& !huge_pte_write(entry)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; - - spin_unlock(ptl); + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) && + (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); @@ -6437,11 +6390,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * hugetlb_wp() requires page locks of pte_page(entry) and + * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and * pagecache_folio, so here we need take the former one * when folio != pagecache_folio or !pagecache_folio. */ - folio = page_folio(pte_page(entry)); + folio = page_folio(pte_page(vmf.orig_pte)); if (folio != pagecache_folio) if (!folio_trylock(folio)) { need_wait_lock = 1; @@ -6451,24 +6404,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, folio_get(folio); if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { - if (!huge_pte_write(entry)) { - ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_folio, ptl); + if (!huge_pte_write(vmf.orig_pte)) { + ret = hugetlb_wp(pagecache_folio, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { - entry = huge_pte_mkdirty(entry); + vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } } - entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, + vmf.orig_pte = pte_mkyoung(vmf.orig_pte); + if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, haddr, ptep); + update_mmu_cache(vma, vmf.address, vmf.pte); out_put_page: if (folio != pagecache_folio) folio_unlock(folio); folio_put(folio); out_ptl: - spin_unlock(ptl); + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); @@ -6476,6 +6428,14 @@ vm_fault_t hugetlb_fault(struct mm_struct 
*mm, struct vm_area_struct *vma, } out_mutex: hugetlb_vma_unlock_read(vma); + + /* + * We must check to release the per-VMA lock. __vmf_anon_prepare() in + * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY. + */ + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But @@ -6552,7 +6512,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; @@ -6594,7 +6554,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; diff --git a/mm/internal.h b/mm/internal.h index 46c5a8da9d728af9e02ee7287b8db7dd2f4eb671..2c2a057faff445c9f77402c04f5988067ac3bbd9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -325,7 +325,16 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) wake_up(wqh); } -vm_fault_t vmf_anon_prepare(struct vm_fault *vmf); +vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf); +static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + vm_fault_t ret = __vmf_anon_prepare(vmf); + + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vmf->vma); + return ret; +} + vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 023d85aabfe80e74875d1c83182e7249fce5371e..ea46406e41a77cdbab61f68802b0f0ce138a49be 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1641,6 +1641,18 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } +#ifdef CONFIG_HUGETLB_PAGE 
+static bool memcg_accounts_hugetlb(void) +{ + return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; +} +#else /* CONFIG_HUGETLB_PAGE */ +static bool memcg_accounts_hugetlb(void) +{ + return false; +} +#endif /* CONFIG_HUGETLB_PAGE */ + static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { int i; @@ -8452,8 +8464,7 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, * but do not attempt to commit charge later (or cancel on error) either. */ if (mem_cgroup_disabled() || !memcg || - !cgroup_subsys_on_dfl(memory_cgrp_subsys) || - !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + !cgroup_subsys_on_dfl(memory_cgrp_subsys) || !memcg_accounts_hugetlb()) return -EOPNOTSUPP; if (try_charge(memcg, gfp, nr_pages)) @@ -8462,6 +8473,40 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, return 0; } +/** + * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio + * @folio: folio being charged + * @gfp: reclaim mode + * + * This function is called when allocating a huge page folio, after the page has + * already been obtained and charged to the appropriate hugetlb cgroup + * controller (if it is enabled). + * + * Returns -ENOMEM if charge_memcg() fails. + * Returns 0 if either the charge was successful, or if we skip the charging. + */ +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) +{ + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); + int ret = 0; + + /* + * Even if memcg does not account for hugetlb, we still want to update + * system-level stats via lruvec_stat_mod_folio. Return 0, and skip + * charging the memcg. + */ + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + goto out; + + if (charge_memcg(folio, memcg, gfp)) + ret = -ENOMEM; + +out: + mem_cgroup_put(memcg); + return ret; +} + /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
* @folio: folio to charge. diff --git a/mm/memory.c b/mm/memory.c index a6d146d684e8ec18740b669f540abdaa32b5bcc4..df623f354106e9d51731b3dfe9a2b6f93fbffb89 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3245,7 +3245,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) } /** - * vmf_anon_prepare - Prepare to handle an anonymous fault. + * __vmf_anon_prepare - Prepare to handle an anonymous fault. * @vmf: The vm_fault descriptor passed from the fault handler. * * When preparing to insert an anonymous page into a VMA from a @@ -3259,7 +3259,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) * Return: 0 if fault handling can proceed. Any other value should be * returned to the caller. */ -vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; @@ -3267,10 +3267,8 @@ vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - if (!mmap_read_trylock(vma->vm_mm)) { - vma_end_read(vma); + if (!mmap_read_trylock(vma->vm_mm)) return VM_FAULT_RETRY; - } } if (__anon_vma_prepare(vma)) ret = VM_FAULT_OOM;