From 319e4dacbc75d28d498842462a4308ae914478de Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 25 Feb 2025 19:08:44 +0800 Subject: [PATCH 01/19] hugetlb: move vm_fault declaration to the top of hugetlb_fault() mainline inclusion from mainline-v6.8-rc5 commit 0ca22723e3ffe0d539c5d72603dded8fe6924a89 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0ca22723e3ffe0d539c5d72603dded8fe6924a89 -------------------------------- hugetlb_fault() currently defines a vm_fault to pass to the generic handle_userfault() function. We can move this definition to the top of hugetlb_fault() so that it can be used throughout the rest of the hugetlb fault path. This will help cleanup a number of excess variables and function arguments throughout the stack. Also, since vm_fault already has space to store the page offset, use that instead and get rid of idx. Link: https://lkml.kernel.org/r/20240221234732.187629-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Conflicts: mm/hugetlb.c [Context conflicts in mm/hugetlb.c due to miss commit d61ea1cb009532dcbd77a9d44b812704cec60146] Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a80b431a8f72..f9e1451544fd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6303,13 +6303,25 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; vm_fault_t ret; u32 hash; - pgoff_t idx; struct folio *folio = NULL; struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; unsigned long haddr = address & huge_page_mask(h); + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + .pgoff = vma_hugecache_offset(h, vma, haddr), + /* TODO: Track hugetlb faults using vm_fault */ + + /* + * Some fields may not be initialized, be careful as it may + * be hard to debug if called functions make assumptions + */ + }; /* TODO: Handle faults under the VMA lock */ if (flags & FAULT_FLAG_VMA_LOCK) { @@ -6323,8 +6335,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -6358,8 +6369,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. 
*/ - return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, - entry, flags); + + return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address, + ptep, entry, flags); } ret = 0; @@ -6405,7 +6417,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); + pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, + vmf.pgoff); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } @@ -6419,13 +6432,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; - spin_unlock(ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); -- Gitee From 870dcbdd71d1dda632cda53c662ee26d5e50cde3 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 26 Feb 2025 10:06:23 +0800 Subject: [PATCH 02/19] hugetlb: pass struct vm_fault through to hugetlb_handle_userfault() mainline inclusion from mainline-v6.8-rc5 commit 7dac0ec8fa3f4977d04974e94806dfa8bdac7ed2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7dac0ec8fa3f4977d04974e94806dfa8bdac7ed2 -------------------------------- Now that hugetlb_fault() has a struct vm_fault, have hugetlb_handle_userfault() use it instead of creating one of its own. This lets us reduce the number of arguments passed to hugetlb_handle_userfault() from 7 to 3, cleaning up the code and stack. Link: https://lkml.kernel.org/r/20240221234732.187629-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f9e1451544fd..b2c9e0454ce9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6010,39 +6010,21 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping return 0; } -static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, +static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, struct address_space *mapping, - pgoff_t idx, - unsigned int flags, - unsigned long haddr, - unsigned long addr, unsigned long reason) { u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = addr, - .flags = flags, - - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. Also mmap_lock could be dropped due to handling * userfault, any vma operation should be careful from here. 
*/ - hugetlb_vma_unlock_read(vma); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vmf->vma); + hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, reason); + return handle_userfault(vmf, reason); } /* @@ -6066,7 +6048,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address, pte_t *ptep, - pte_t old_pte, unsigned int flags) + pte_t old_pte, unsigned int flags, + struct vm_fault *vmf) { struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; @@ -6125,8 +6108,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING); } @@ -6198,8 +6180,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, ret = 0; goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MINOR); } } @@ -6369,9 +6350,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ - return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address, - ptep, entry, flags); + ptep, entry, flags, &vmf); } ret = 0; -- Gitee From 63b853701c242741d4ac8c1b2b50cd3a2be0c4bf Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 26 Feb 2025 10:23:37 +0800 Subject: [PATCH 03/19] hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare() mainline inclusion from mainline-v6.8-rc5 commit 9acad7ba3e25d11f4c96df1b7312ae89e6faca5c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9acad7ba3e25d11f4c96df1b7312ae89e6faca5c -------------------------------- hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare() hugetlb_no_page() and hugetlb_wp() call anon_vma_prepare(). In preparation for hugetlb to safely handle faults under the VMA lock, use vmf_anon_prepare() here instead. Additionally, passing hugetlb_wp() the vm_fault struct from hugetlb_fault() works toward cleaning up the hugetlb code and function stack. Link: https://lkml.kernel.org/r/20240221234732.187629-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b2c9e0454ce9..df75f1b39558 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5776,7 +5776,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags, - struct folio *pagecache_folio, spinlock_t *ptl) + struct folio *pagecache_folio, spinlock_t *ptl, + struct vm_fault *vmf) { const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte = huge_ptep_get(ptep); @@ -5910,10 +5911,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * When the original hugepage is shared one, it does not have * anon_vma prepared. 
*/ - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) goto out_release_all; - } if (copy_user_large_folio(new_folio, old_folio, address, vma)) { ret = VM_FAULT_HWPOISON_LARGE; @@ -6153,10 +6153,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, new_pagecache_folio = true; } else { folio_lock(folio); - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; + + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) goto backout_unlocked; - } anon_rmap = 1; } } else { @@ -6223,7 +6223,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf); } spin_unlock(ptl); @@ -6439,7 +6439,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_folio, ptl); + pagecache_folio, ptl, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { entry = huge_pte_mkdirty(entry); -- Gitee From 95075019ad4285239609a5c1bd034a006433c19f Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 26 Feb 2025 10:29:17 +0800 Subject: [PATCH 04/19] hugetlb: allow faults to be handled under the VMA lock mainline inclusion from mainline-v6.8-rc5 commit 7c43a553792a1701affeef20959dfb2ccb26dcee category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7c43a553792a1701affeef20959dfb2ccb26dcee -------------------------------- Hugetlb can now safely handle faults under the VMA lock, so allow it to do so. This patch may cause ltp hugemmap10 to "fail". Hugemmap10 tests hugetlb counters, and expects the counters to remain unchanged on failure to handle a fault. In hugetlb_no_page(), vmf_anon_prepare() may bailout with no anon_vma under the VMA lock after allocating a folio for the hugepage. In free_huge_folio(), this folio is completely freed on bailout iff there is a surplus of hugetlb pages. This will remove a folio off the freelist and decrement the number of hugepages while ltp expects these counters to remain unchanged on failure. Originally this could only happen due to OOM failures, but now it may also occur after we allocate a hugetlb folio without a suitable anon_vma under the VMA lock. This should only happen for the first freshly allocated hugepage in this vma. 
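For reference, the per-VMA-lock bail-out deleted by this patch is shown here in condensed form (it is the same hunk that appears in the diff below); returning VM_FAULT_RETRY at this point forced the fault to be retried under mmap_lock instead:

	/* TODO: Handle faults under the VMA lock */
	if (flags & FAULT_FLAG_VMA_LOCK) {
		vma_end_read(vma);
		return VM_FAULT_RETRY;	/* retry the fault under mmap_lock */
	}

With this early return gone, hugetlb faults run to completion under the per-VMA lock, which is what exposes the hugemmap10 counter behaviour described above.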
Link: https://lkml.kernel.org/r/20240221234732.187629-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index df75f1b39558..aff0fd3c9989 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6304,12 +6304,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ }; - /* TODO: Handle faults under the VMA lock */ - if (flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate -- Gitee From f12cf7f6c4a28ed07b2c94190cd57bcc52482882 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 31 Mar 2025 11:18:32 +0800 Subject: [PATCH 05/19] hugetlb: check for anon_vma prior to folio allocation mainline inclusion from mainline-v6.9-rc6 commit 37641efaa3faa4b8292aba4bbd7d71c0b703a239 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=37641efaa3faa4b8292aba4bbd7d71c0b703a239 -------------------------------- Commit 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare()") may bailout after allocating a folio if we do not hold the mmap lock. When this occurs, vmf_anon_prepare() will release the vma lock. Hugetlb then attempts to call restore_reserve_on_error(), which depends on the vma lock being held. We can move vmf_anon_prepare() prior to the folio allocation in order to avoid calling restore_reserve_on_error() without the vma lock. 
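The resulting ordering in hugetlb_no_page() can be summarised as follows; this is a condensed sketch of the hunks in the diff below, not a standalone excerpt:

	if (!(vma->vm_flags & VM_MAYSHARE)) {
		/* the only step that may drop the vma lock (VM_FAULT_RETRY) */
		ret = vmf_anon_prepare(vmf);
		if (unlikely(ret))
			goto out;	/* no folio allocated yet, nothing to restore */
	}

	folio = alloc_hugetlb_folio(vma, haddr, 0);
	/*
	 * Every failure path from here on that reaches
	 * restore_reserve_on_error() still holds the vma lock.
	 */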
Link: https://lkml.kernel.org/r/ZiFqSrSRLhIV91og@fedora Fixes: 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare()") Reported-by: syzbot+ad1b592fc4483655438b@syzkaller.appspotmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aff0fd3c9989..57ba55f3f6f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6112,6 +6112,12 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, VM_UFFD_MISSING); } + if (!(vma->vm_flags & VM_MAYSHARE)) { + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; + } + folio = alloc_hugetlb_folio(vma, haddr, 0); if (IS_ERR(folio)) { /* @@ -6148,15 +6154,12 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, */ restore_reserve_on_error(h, vma, haddr, folio); folio_put(folio); + ret = VM_FAULT_SIGBUS; goto out; } new_pagecache_folio = true; } else { folio_lock(folio); - - ret = vmf_anon_prepare(vmf); - if (unlikely(ret)) - goto backout_unlocked; anon_rmap = 1; } } else { -- Gitee From d1a3115a7998854f4a1140e4ffcc2a7f4d1e6e94 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 26 Feb 2025 14:08:25 +0800 Subject: [PATCH 06/19] hugetlb: convert hugetlb_fault() to use struct vm_fault mainline inclusion from mainline-v6.9-rc6 commit 9b42fa16195fb471bf3b7849803c0a7d3e7620f2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9b42fa16195fb471bf3b7849803c0a7d3e7620f2 -------------------------------- Patch series "Hugetlb fault path to use struct vm_fault", v2. This patchset converts the hugetlb fault path to use struct vm_fault. This helps make the code more readable, and alleviates the stack by allowing us to consolidate many fault-related variables into an individual pointer. This patch (of 3): Now that hugetlb_fault() has a vm_fault available for fault tracking, use it throughout. This cleans up the code by removing 2 variables, and prepares hugetlb_fault() to take in a struct vm_fault argument. 
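The conversion amounts to replacing the remaining locals with vm_fault fields; condensed from the diff below:

	struct vm_fault vmf = {
		.vma          = vma,
		.address      = address & huge_page_mask(h),	/* was haddr */
		.real_address = address,
		.flags        = flags,
		.pgoff        = vma_hugecache_offset(h, vma,
					address & huge_page_mask(h)),	/* was idx */
	};

	/* filled in as the fault proceeds, replacing further locals: */
	vmf.pte      = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));	/* was ptep  */
	vmf.orig_pte = huge_ptep_get(vmf.pte);					/* was entry */
	vmf.ptl      = huge_pte_lock(h, mm, vmf.pte);				/* was ptl   */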
Link: https://lkml.kernel.org/r/20240401202651.31440-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20240401202651.31440-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Cc: Matthew Wilcox (Oracle) Conflicts: mm/hugetlb.c [Context conflicts in mm/hugetlb.c due to miss commit d61ea1cb009532dcbd77a9d44b812704cec60146] Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 80 +++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 57ba55f3f6f5..d99b4937fbc0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6283,8 +6283,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep, entry; - spinlock_t *ptl; vm_fault_t ret; u32 hash; struct folio *folio = NULL; @@ -6292,13 +6290,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; - unsigned long haddr = address & huge_page_mask(h); struct vm_fault vmf = { .vma = vma, - .address = haddr, + .address = address & huge_page_mask(h), .real_address = address, .flags = flags, - .pgoff = vma_hugecache_offset(h, vma, haddr), + .pgoff = vma_hugecache_offset(h, vma, + address & huge_page_mask(h)), /* TODO: Track hugetlb faults using vm_fault */ /* @@ -6318,22 +6316,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * Acquire vma lock before calling huge_pte_alloc and hold - * until finished with ptep. This prevents huge_pmd_unshare from - * being called elsewhere and making the ptep no longer valid. + * until finished with vmf.pte. This prevents huge_pmd_unshare from + * being called elsewhere and making the vmf.pte no longer valid. */ hugetlb_vma_lock_read(vma); - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) { + vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); + if (!vmf.pte) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } - entry = huge_ptep_get(ptep); - if (huge_pte_none_mostly(entry)) { - if (is_pte_marker(entry)) { + vmf.orig_pte = huge_ptep_get(vmf.pte); + if (huge_pte_none_mostly(vmf.orig_pte)) { + if (is_pte_marker(vmf.orig_pte)) { pte_marker marker = - pte_marker_get(pte_to_swp_entry(entry)); + pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { ret = VM_FAULT_HWPOISON_LARGE; @@ -6348,20 +6346,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * mutex internally, which make us return immediately. */ return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address, - ptep, entry, flags, &vmf); + vmf.pte, vmf.orig_pte, flags, &vmf); } ret = 0; /* - * entry could be a migration/hwpoison entry at this point, so this - * check prevents the kernel from going below assuming that we have - * an active hugepage in pagecache. This goto expects the 2nd page - * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will - * properly handle it. + * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this + * point, so this check prevents the kernel from going below assuming + * that we have an active hugepage in pagecache. 
This goto expects + * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) + * check will properly handle it. */ - if (!pte_present(entry)) { - if (unlikely(is_hugetlb_entry_migration(entry))) { + if (!pte_present(vmf.orig_pte)) { + if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6370,9 +6368,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); - migration_entry_wait_huge(vma, ptep); + migration_entry_wait_huge(vma, vmf.pte); return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; @@ -6386,13 +6384,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * determine if a reservation has been consumed. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { + if (vma_needs_reservation(h, vma, vmf.address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf.address); pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, vmf.pgoff); @@ -6400,16 +6398,16 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pagecache_folio = NULL; } - ptl = huge_pte_lock(h, mm, ptep); + vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ - if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && - (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - spin_unlock(ptl); + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) && + (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); @@ -6420,11 +6418,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * hugetlb_wp() requires page locks of pte_page(entry) and + * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and * pagecache_folio, so here we need take the former one * when folio != pagecache_folio or !pagecache_folio. 
*/ - folio = page_folio(pte_page(entry)); + folio = page_folio(pte_page(vmf.orig_pte)); if (folio != pagecache_folio) if (!folio_trylock(folio)) { need_wait_lock = 1; @@ -6434,24 +6432,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, folio_get(folio); if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { - if (!huge_pte_write(entry)) { - ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_folio, ptl, &vmf); + if (!huge_pte_write(vmf.orig_pte)) { + ret = hugetlb_wp(mm, vma, address, vmf.pte, flags, + pagecache_folio, vmf.ptl, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { - entry = huge_pte_mkdirty(entry); + vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } } - entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, + vmf.orig_pte = pte_mkyoung(vmf.orig_pte); + if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, haddr, ptep); + update_mmu_cache(vma, vmf.address, vmf.pte); out_put_page: if (folio != pagecache_folio) folio_unlock(folio); folio_put(folio); out_ptl: - spin_unlock(ptl); + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); -- Gitee From 9c82f25a5e4ae7156171e499a656107f502d3cbd Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 26 Feb 2025 14:30:45 +0800 Subject: [PATCH 07/19] hugetlb: convert hugetlb_no_page() to use struct vm_fault mainline inclusion from mainline-v6.9-rc6 commit 7b6ec181de372a243e3ef285ae1a48f32d5c71ba category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7b6ec181de372a243e3ef285ae1a48f32d5c71ba -------------------------------- hugetlb_no_page() can use the struct vm_fault passed in from hugetlb_fault(). This alleviates the stack by consolidating 7 variables into a single struct. 
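The net effect on the function's interface, taken from the diff below:

	/* before */
	static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
			struct vm_area_struct *vma,
			struct address_space *mapping, pgoff_t idx,
			unsigned long address, pte_t *ptep,
			pte_t old_pte, unsigned int flags,
			struct vm_fault *vmf);

	/* after: everything except the mapping now lives in *vmf */
	static vm_fault_t hugetlb_no_page(struct address_space *mapping,
			struct vm_fault *vmf);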
[vishal.moola@gmail.com: simplify hugetlb_no_page() arguments] Link: https://lkml.kernel.org/r/ZhQtN8y5zud8iI1u@fedora Link: https://lkml.kernel.org/r/20240401202651.31440-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Oscar Salvador Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 63 ++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d99b4937fbc0..560ce8b6b731 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6044,23 +6044,19 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, return same; } -static vm_fault_t hugetlb_no_page(struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, - pte_t old_pte, unsigned int flags, +static vm_fault_t hugetlb_no_page(struct address_space *mapping, struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct folio *folio; pte_t new_pte; - spinlock_t *ptl; - unsigned long haddr = address & huge_page_mask(h); bool new_folio, new_pagecache_folio = false; - u32 hash = hugetlb_fault_mutex_hash(mapping, idx); + u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the @@ -6079,10 +6075,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. */ new_folio = false; - folio = filemap_lock_hugetlb_folio(h, mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) + if (vmf->pgoff >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { @@ -6103,7 +6099,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } @@ -6118,7 +6114,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto out; } - folio = alloc_hugetlb_folio(vma, haddr, 0); + folio = alloc_hugetlb_folio(vma, vmf->address, 0); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6132,18 +6128,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - if (hugetlb_pte_stable(h, mm, ptep, old_pte)) + if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + clear_huge_page(&folio->page, vmf->real_address, + pages_per_huge_page(h)); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(folio, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, + vmf->pgoff); if (err) { /* * err can't be -EEXIST which implies someone @@ -6152,7 +6150,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. 
*/ - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, + folio); folio_put(folio); ret = VM_FAULT_SIGBUS; goto out; @@ -6179,7 +6178,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } @@ -6194,23 +6193,23 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * any allocations necessary to record that reservation occur outside * the spinlock. */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if (vma_needs_reservation(h, vma, vmf->address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf->address); } - ptl = huge_pte_lock(h, mm, ptep); + vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ - if (!pte_same(huge_ptep_get(ptep), old_pte)) + if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte)) goto backout; if (anon_rmap) - hugetlb_add_new_anon_rmap(folio, vma, haddr); + hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) @@ -6219,17 +6218,18 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ - if (unlikely(pte_marker_uffd_wp(old_pte))) + if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); - set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf); + ret = hugetlb_wp(mm, vma, vmf->real_address, vmf->pte, + vmf->flags, folio, vmf->ptl, vmf); } - spin_unlock(ptl); + spin_unlock(vmf->ptl); /* * Only set hugetlb_migratable in newly allocated pages. Existing pages @@ -6246,10 +6246,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, return ret; backout: - spin_unlock(ptl); + spin_unlock(vmf->ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); folio_put(folio); @@ -6345,8 +6345,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. 
*/ - return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address, - vmf.pte, vmf.orig_pte, flags, &vmf); + return hugetlb_no_page(mapping, &vmf); } ret = 0; -- Gitee From 32148293512b7b1d0d2288819fe0fdf93f3adc21 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 26 Feb 2025 14:53:35 +0800 Subject: [PATCH 08/19] hugetlb: convert hugetlb_wp() to use struct vm_fault mainline inclusion from mainline-v6.9-rc5 commit bd722058e34de4857bc554e786b7f41c747ad894 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bd722058e34de4857bc554e786b7f41c747ad894 -------------------------------- hugetlb_wp() can use the struct vm_fault passed in from hugetlb_fault(). This alleviates the stack by consolidating 5 variables into a single struct. [vishal.moola@gmail.com: simplify hugetlb_wp() arguments] Link: https://lkml.kernel.org/r/ZhQtoFNZBNwBCeXn@fedora Link: https://lkml.kernel.org/r/20240401202651.31440-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Oscar Salvador Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 64 ++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 560ce8b6b731..9bec8325be28 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5774,19 +5774,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags, - struct folio *pagecache_folio, spinlock_t *ptl, +static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct vm_fault *vmf) { - const bool unshare = flags & FAULT_FLAG_UNSHARE; - pte_t pte = huge_ptep_get(ptep); + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + pte_t pte = huge_ptep_get(vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; - unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; /* @@ -5809,7 +5808,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_writable(vma, vmf->address, vmf->pte); return 0; } @@ -5828,7 +5827,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_writable(vma, vmf->address, vmf->pte); delayacct_wpcopy_end(); return 0; @@ -5855,8 +5854,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. 
*/ - spin_unlock(ptl); - new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); + spin_unlock(vmf->ptl); + new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve); if (IS_ERR(new_folio)) { /* @@ -5881,19 +5880,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * * Reacquire both after unmap operation. */ - idx = vma_hugecache_offset(h, vma, haddr); + idx = vma_hugecache_offset(h, vma, vmf->address); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, &old_folio->page, haddr); + unmap_ref_private(mm, vma, &old_folio->page, + vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, + huge_page_size(h)); + if (likely(vmf->pte && + pte_same(huge_ptep_get(vmf->pte), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table @@ -5915,37 +5916,38 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(ret)) goto out_release_all; - if (copy_user_large_folio(new_folio, old_folio, address, vma)) { + if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { ret = VM_FAULT_HWPOISON_LARGE; goto out_release_all; } __folio_mark_uptodate(new_folio); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, - haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, + vmf->address + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); + if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) { pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); /* Break COW or unshare */ - huge_ptep_clear_flush(vma, haddr, ptep); + huge_ptep_clear_flush(vma, vmf->address, vmf->pte); hugetlb_remove_rmap(old_folio); - hugetlb_add_new_anon_rmap(new_folio, vma, haddr); + hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); - set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, + huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } - spin_unlock(ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* @@ -5953,12 +5955,12 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * unshare) */ if (new_folio != old_folio) - restore_reserve_on_error(h, vma, haddr, new_folio); + restore_reserve_on_error(h, vma, vmf->address, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); - spin_lock(ptl); /* Caller expects lock to be held */ + spin_lock(vmf->ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; @@ -6225,8 +6227,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_count_add(pages_per_huge_page(h), mm); if ((vmf->flags & FAULT_FLAG_WRITE) && 
!(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, vmf->real_address, vmf->pte, - vmf->flags, folio, vmf->ptl, vmf); + ret = hugetlb_wp(folio, vmf); } spin_unlock(vmf->ptl); @@ -6432,8 +6433,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(vmf.orig_pte)) { - ret = hugetlb_wp(mm, vma, address, vmf.pte, flags, - pagecache_folio, vmf.ptl, &vmf); + ret = hugetlb_wp(pagecache_folio, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); -- Gitee From c986069a4a572a44f9a0538b8bbac38aefdd3027 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Apr 2025 14:01:55 +0800 Subject: [PATCH 09/19] mm: change vmf_anon_prepare() to __vmf_anon_prepare() mainline inclusion from mainline-v6.12-rc1 commit 2a058ab3286d6475b2082b90c2d2182d2fea4b39 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2a058ab3286d6475b2082b90c2d2182d2fea4b39 -------------------------------- Some callers of vmf_anon_prepare() may not want us to release the per-VMA lock ourselves. Rename vmf_anon_prepare() to __vmf_anon_prepare() and let the callers drop the lock when desired. Also, make vmf_anon_prepare() a wrapper that releases the per-VMA lock itself for any callers that don't care. This is in preparation to fix this bug reported by syzbot: https://lore.kernel.org/linux-mm/00000000000067c20b06219fbc26@google.com/ Link: https://lkml.kernel.org/r/20240914194243.245-1-vishal.moola@gmail.com Fixes: 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare()") Reported-by: syzbot+2dab93857ee95f2eeb08@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/00000000000067c20b06219fbc26@google.com/ Signed-off-by: Vishal Moola (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/internal.h | 11 ++++++++++- mm/memory.c | 8 +++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 46c5a8da9d72..2c2a057faff4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -325,7 +325,16 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) wake_up(wqh); } -vm_fault_t vmf_anon_prepare(struct vm_fault *vmf); +vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf); +static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + vm_fault_t ret = __vmf_anon_prepare(vmf); + + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vmf->vma); + return ret; +} + vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); diff --git a/mm/memory.c b/mm/memory.c index a6d146d684e8..df623f354106 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3245,7 +3245,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) } /** - * vmf_anon_prepare - Prepare to handle an anonymous fault. + * __vmf_anon_prepare - Prepare to handle an anonymous fault. * @vmf: The vm_fault descriptor passed from the fault handler. * * When preparing to insert an anonymous page into a VMA from a @@ -3259,7 +3259,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) * Return: 0 if fault handling can proceed. Any other value should be * returned to the caller. 
*/ -vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; @@ -3267,10 +3267,8 @@ vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - if (!mmap_read_trylock(vma->vm_mm)) { - vma_end_read(vma); + if (!mmap_read_trylock(vma->vm_mm)) return VM_FAULT_RETRY; - } } if (__anon_vma_prepare(vma)) ret = VM_FAULT_OOM; -- Gitee From 52cc178017ab4cbb65231431c1d72ffae61eb88a Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Apr 2025 14:26:26 +0800 Subject: [PATCH 10/19] mm/hugetlb.c: fix UAF of vma in hugetlb fault pathway mainline inclusion from mainline-v6.12-rc1 commit 98b74bb4d7e96b4da5ef3126511febe55b76b807 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=98b74bb4d7e96b4da5ef3126511febe55b76b807 -------------------------------- Syzbot reports a UAF in hugetlb_fault(). This happens because vmf_anon_prepare() could drop the per-VMA lock and allow the current VMA to be freed before hugetlb_vma_unlock_read() is called. We can fix this by using a modified version of vmf_anon_prepare() that doesn't release the VMA lock on failure, and then release it ourselves after hugetlb_vma_unlock_read(). Link: https://lkml.kernel.org/r/20240914194243.245-2-vishal.moola@gmail.com Fixes: 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare()") Reported-by: syzbot+2dab93857ee95f2eeb08@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/00000000000067c20b06219fbc26@google.com/ Signed-off-by: Vishal Moola (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9bec8325be28..973872327d14 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5912,7 +5912,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - ret = vmf_anon_prepare(vmf); + ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out_release_all; @@ -6111,7 +6111,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, } if (!(vma->vm_flags & VM_MAYSHARE)) { - ret = vmf_anon_prepare(vmf); + ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; } @@ -6243,6 +6243,14 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); + + /* + * We must check to release the per-VMA lock. __vmf_anon_prepare() is + * the only way ret can be set to VM_FAULT_RETRY. + */ + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; @@ -6456,6 +6464,14 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } out_mutex: hugetlb_vma_unlock_read(vma); + + /* + * We must check to release the per-VMA lock. __vmf_anon_prepare() in + * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY. + */ + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. 
But -- Gitee From 0fae4a02d974b79841536cea0c8678c7233825ed Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Thu, 20 Mar 2025 18:13:01 +0800 Subject: [PATCH 11/19] memcg/hugetlb: introduce memcg_accounts_hugetlb mainline inclusion from mainline-v6.12 commit 4e97d64c492e1f65b4f7d14803ed580b279aaf6f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4e97d64c492e1f65b4f7d14803ed580b279aaf6f -------------------------------- Patch series "memcg/hugetlb: Rework memcg hugetlb charging", v3. This series cleans up memcg's hugetlb charging logic by deprecating the current memcg hugetlb try-charge + {commit, cancel} logic present in alloc_hugetlb_folio. A single function mem_cgroup_charge_hugetlb takes its place instead. This makes the code more maintainable by simplifying the error path and reduces memcg's footprint in hugetlb logic. This patch introduces a few changes in the hugetlb folio allocation error path: (a) Instead of having multiple return points, we consolidate them to two: one for reaching the memcg limit or running out of memory (-ENOMEM) and one for hugetlb allocation fails / limit being reached (-ENOSPC). (b) Previously, the memcg limit was checked before the folio is acquired, meaning the hugeTLB folio isn't acquired if the limit is reached. This patch performs the charging after the folio is reached, meaning if memcg's limit is reached, the acquired folio is freed right away. This patch builds on two earlier patch series: [2] which adds memcg hugeTLB counters, and [3] which deprecates charge moving and removes the last references to mem_cgroup_cancel_charge. The request for this cleanup can be found in [2]. [1] https://lore.kernel.org/all/20231006184629.155543-1-nphamcs@gmail.com/ [2] https://lore.kernel.org/all/20241101204402.1885383-1-joshua.hahnjy@gmail.com/ [3] https://lore.kernel.org/linux-mm/20241025012304.2473312-1-shakeel.butt@linux.dev/ This patch (of 3): This patch isolates the check for whether memcg accounts hugetlb. This condition can only be true if the memcg mount option memory_hugetlb_accounting is on, which includes hugetlb usage in memory.current. 
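Condensed from the diff below, the check becomes a single helper plus one call site; the helper is only true when cgroup2 is mounted with the memory_hugetlb_accounting option:

#ifdef CONFIG_HUGETLB_PAGE
static bool memcg_accounts_hugetlb(void)
{
	return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
}
#else /* CONFIG_HUGETLB_PAGE */
static bool memcg_accounts_hugetlb(void)
{
	return false;
}
#endif /* CONFIG_HUGETLB_PAGE */

	/* call site in mem_cgroup_hugetlb_try_charge(): */
	if (mem_cgroup_disabled() || !memcg ||
	    !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
	    !memcg_accounts_hugetlb())
		return -EOPNOTSUPP;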
Link: https://lkml.kernel.org/r/20241211203951.764733-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20241211203951.764733-2-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: Shakeel Butt Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Muchun Song Signed-off-by: Andrew Morton Conflicts: mm/memcontrol.c [Context conflicts in mm/memcontrol.c due to miss commit 05d4532b60e3e6e2a094ec56a88d1def50bd2430] Signed-off-by: Jiaming Sun --- mm/memcontrol.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 023d85aabfe8..0c824ca812d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1641,6 +1641,18 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } +#ifdef CONFIG_HUGETLB_PAGE +static bool memcg_accounts_hugetlb(void) +{ + return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; +} +#else /* CONFIG_HUGETLB_PAGE */ +static bool memcg_accounts_hugetlb(void) +{ + return false; +} +#endif /* CONFIG_HUGETLB_PAGE */ + static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { int i; @@ -8452,8 +8464,7 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, * but do not attempt to commit charge later (or cancel on error) either. */ if (mem_cgroup_disabled() || !memcg || - !cgroup_subsys_on_dfl(memory_cgrp_subsys) || - !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + !cgroup_subsys_on_dfl(memory_cgrp_subsys) || !memcg_accounts_hugetlb()) return -EOPNOTSUPP; if (try_charge(memcg, gfp, nr_pages)) -- Gitee From ca70e144d3f309c3383e60e4bd622700f147d86d Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Thu, 13 Mar 2025 15:13:10 +0800 Subject: [PATCH 12/19] memcg/hugetlb: introduce mem_cgroup_charge_hugetlb mainline inclusion from mainline-v6.12 commit 991135774c0e05a4734e6d32aa03b00355e4cac9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=991135774c0e05a4734e6d32aa03b00355e4cac9 -------------------------------- This patch introduces mem_cgroup_charge_hugetlb which combines the logic of mem_cgroup_hugetlb_try_charge / mem_cgroup_hugetlb_commit_charge and removes the need for mem_cgroup_hugetlb_cancel_charge. It also reduces the footprint of memcg in hugetlb code and consolidates all memcg related error paths into one. 
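On the hugetlb side this collapses the memcg handling in alloc_hugetlb_folio() to a single post-allocation check; condensed from the diff below:

	ret = mem_cgroup_charge_hugetlb(folio, gfp);
	if (ret == -ENOMEM) {
		/* memcg limit hit after the folio was obtained: give it back */
		free_huge_folio(folio);
		return ERR_PTR(-ENOMEM);
	}

	return folio;

A return value of 0 also covers the cases where charging is skipped (memcg disabled, hugetlb accounting off), matching the behaviour documented on mem_cgroup_charge_hugetlb().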
Link: https://lkml.kernel.org/r/20241211203951.764733-3-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: Shakeel Butt Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton Conflicts: mm/hugetlb.c [Context conflicts in mm/hugetlb.c due to miss commit 05d4532b60e3e6e2a094ec56a88d1def50bd2430] Signed-off-by: Jiaming Sun --- include/linux/memcontrol.h | 7 +++++++ mm/hugetlb.c | 35 ++++++++++++++--------------------- mm/memcontrol.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index abe236201e68..519d1682827e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -818,6 +818,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1413,6 +1415,11 @@ static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, return 0; } +static inline int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) +{ + return 0; +} + static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 973872327d14..0b80ca285ce1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3145,21 +3145,13 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugetlbfs_inode_info *info = HUGETLBFS_I(file_inode(vma->vm_file)); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit, nr_pages = pages_per_huge_page(h); + long map_chg, map_commit; long gbl_chg; - int memcg_charge_ret, ret, idx; + int ret, idx; struct hugetlb_cgroup *h_cg = NULL; - struct mem_cgroup *memcg; bool deferred_reserve; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; - memcg = get_mem_cgroup_from_current(); - memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); - if (memcg_charge_ret == -ENOMEM) { - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); - } - idx = hstate_index(h); /* * Examine the region/reserve map to determine if the process @@ -3167,12 +3159,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * code of zero indicates a reservation exists (no change). */ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) { - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); + if (map_chg < 0) return ERR_PTR(-ENOMEM); - } /* * Processes that did not create the mapping will have no @@ -3285,9 +3273,17 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, } } - if (!memcg_charge_ret) - mem_cgroup_commit_charge(folio, memcg); - mem_cgroup_put(memcg); + ret = mem_cgroup_charge_hugetlb(folio, gfp); + /* + * Unconditionally increment NR_HUGETLB here. If it turns out that + * mem_cgroup_charge_hugetlb failed, then immediately free the page and + * decrement NR_HUGETLB. 
+ */ + + if (ret == -ENOMEM) { + free_huge_folio(folio); + return ERR_PTR(-ENOMEM); + } return folio; @@ -3302,9 +3298,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugepage_subpool_put_pages(spool, 1, info); out_end_reservation: vma_end_reservation(h, vma, addr); - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); return ERR_PTR(-ENOSPC); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0c824ca812d3..ea46406e41a7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -8473,6 +8473,40 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, return 0; } +/** + * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio + * @folio: folio being charged + * @gfp: reclaim mode + * + * This function is called when allocating a huge page folio, after the page has + * already been obtained and charged to the appropriate hugetlb cgroup + * controller (if it is enabled). + * + * Returns ENOMEM if the memcg is already full. + * Returns 0 if either the charge was successful, or if we skip the charging. + */ +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) +{ + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); + int ret = 0; + + /* + * Even memcg does not account for hugetlb, we still want to update + * system-level stats via lruvec_stat_mod_folio. Return 0, and skip + * charging the memcg. + */ + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + goto out; + + if (charge_memcg(folio, memcg, gfp)) + ret = -ENOMEM; + +out: + mem_cgroup_put(memcg); + return ret; +} + /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. * @folio: folio to charge. -- Gitee From b4499553fbbe96a2c99b49b3b5a02e1244d67f92 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 13 Mar 2025 16:14:29 +0800 Subject: [PATCH 13/19] mm/hugetlb: fix avoid_reserve to allow taking folio from subpool mainline inclusion from mainline-v6.14-rc1 commit 58db7c5fbe7daa42098d4965133a864f98ba90ba category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=58db7c5fbe7daa42098d4965133a864f98ba90ba -------------------------------- Patch series "mm/hugetlb: Refactor hugetlb allocation resv accounting", v2. This is a follow up on Ackerley's series here as replacement: https://lore.kernel.org/r/cover.1728684491.git.ackerleytng@google.com The goal of this series is to cleanup hugetlb resv accounting, especially during folio allocation, to decouple a few things: - Hugetlb folios v.s. Hugetlbfs: IOW, the hope is in the future hugetlb folios can be allocated completely without hugetlbfs. - Decouple VMA v.s. hugetlb folio allocations: allocating a hugetlb folio should not always require a hugetlbfs VMA. For example, either it got allocated from the inode level (see hugetlbfs_fallocate() where it used a pesudo VMA for allocation), or it can be allocated by other kernel subsystems. It paves way for other users to allocate hugetlb folios out of either system reservations, or subpools (instead of hugetlbfs, as a file system). For longer term, this prepares hugetlb as a separate concept versus hugetlbfs, so that hugetlb folios can be allocated by not only hugetlbfs and other things. Tests I've done: - I had a reproducer in patch 1 for the bug I found, this will start to work after patch 1 or the whole set applied. 
- Hugetlb regression tests (on x86_64 2MBs), includes: - All vmtests on hugetlbfs - libhugetlbfs test suite (which may fail some tests, but no new failures will be introduced by this series, so all such failures happen before this series so shouldn't be relevant). This patch (of 7): Since commit 04f2cbe35699 ("hugetlb: guarantee that COW faults for a process that called mmap(MAP_PRIVATE) on hugetlbfs will succeed"), avoid_reserve was introduced for a special case of CoW on hugetlb private mappings, and only if the owner VMA is trying to allocate yet another hugetlb folio that is not reserved within the private vma reserved map. Later on, in commit d85f69b0b533 ("mm/hugetlb: alloc_huge_page handle areas hole punched by fallocate"), alloc_huge_page() enforced to not consume any global reservation as long as avoid_reserve=true. This operation doesn't look correct, because even if it will enforce the allocation to not use global reservation at all, it will still try to take one reservation from the spool (if the subpool existed). Then since the spool reserved pages take from global reservation, it'll also take one reservation globally. Logically it can cause global reservation to go wrong. I wrote a reproducer below, trigger this special path, and every run of such program will cause global reservation count to increment by one, until it hits the number of free pages: #define _GNU_SOURCE /* See feature_test_macros(7) */ #include #include #include #include #include #include #define MSIZE (2UL << 20) int main(int argc, char *argv[]) { const char *path; int *buf; int fd, ret; pid_t child; if (argc < 2) { printf("usage: %s \n", argv[0]); return -1; } path = argv[1]; fd = open(path, O_RDWR | O_CREAT, 0666); if (fd < 0) { perror("open failed"); return -1; } ret = fallocate(fd, 0, 0, MSIZE); if (ret != 0) { perror("fallocate"); return -1; } buf = mmap(NULL, MSIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); if (buf == MAP_FAILED) { perror("mmap() failed"); return -1; } /* Allocate a page */ *buf = 1; child = fork(); if (child == 0) { /* child doesn't need to do anything */ exit(0); } /* Trigger CoW from owner */ *buf = 2; munmap(buf, MSIZE); close(fd); unlink(path); return 0; } It can only reproduce with a sub-mount when there're reserved pages on the spool, like: # sysctl vm.nr_hugepages=128 # mkdir ./hugetlb-pool # mount -t hugetlbfs -o min_size=8M,pagesize=2M none ./hugetlb-pool Then run the reproducer on the mountpoint: # ./reproducer ./hugetlb-pool/test Fix it by taking the reservation from spool if available. In general, avoid_reserve is IMHO more about "avoid vma resv map", not spool's. I copied stable, however I have no intention for backporting if it's not a clean cherry-pick, because private hugetlb mapping, and then fork() on top is too rare to hit. 
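Condensed from the diff below, the subpool path in alloc_hugetlb_folio() now honours the subpool's answer even for avoid_reserve callers:

	/* when the vma holds no reservation for this address: */
	gbl_chg = hugepage_subpool_get_pages(spool, 1, info);
	if (gbl_chg < 0)
		goto out_end_reservation;
	/*
	 * gbl_chg == 0 means the subpool already holds a global
	 * reservation that may be consumed.  The old
	 * "if (avoid_reserve) gbl_chg = 1;" override is gone, so
	 * avoid_reserve now only means "do not consume a reservation
	 * from the private vma resv map".
	 */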
Link: https://lkml.kernel.org/r/20250107204002.2683356-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20250107204002.2683356-2-peterx@redhat.com Fixes: d85f69b0b533 ("mm/hugetlb: alloc_huge_page handle areas hole punched by fallocate") Signed-off-by: Peter Xu Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: Oscar Salvador Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0b80ca285ce1..fdfd68880e5d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1397,8 +1397,7 @@ static unsigned long available_huge_pages(struct hstate *h) static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve, - long chg) + unsigned long address, long chg) { struct folio *folio = NULL; struct mempolicy *mpol; @@ -1414,10 +1413,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) goto err; - /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && !available_huge_pages(h)) - goto err; - gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); @@ -1433,7 +1428,7 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { + if (folio && vma_has_reserves(vma, chg)) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } @@ -3173,17 +3168,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, gbl_chg = hugepage_subpool_get_pages(spool, 1, info); if (gbl_chg < 0) goto out_end_reservation; - - /* - * Even though there was no reservation in the region/reserve - * map, there could be reservations associated with the - * subpool that can be used. This would be indicated if the - * return value of hugepage_subpool_get_pages() is zero. - * However, if avoid_reserve is specified we still avoid even - * the subpool reservations. - */ - if (avoid_reserve) - gbl_chg = 1; } /* If this allocation is not consuming a reservation, charge it now. @@ -3219,7 +3203,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); -- Gitee From 13099e64e303746b9a0b00b29fafff312d22e124 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 13 Mar 2025 16:22:29 +0800 Subject: [PATCH 14/19] mm/hugetlb: stop using avoid_reserve flag in fork() mainline inclusion from mainline-v6.14-rc1 commit be8d7314b18ede1c6cdebbb982c9659020e2f1f2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be8d7314b18ede1c6cdebbb982c9659020e2f1f2 -------------------------------- When fork() and stumble on top of a dma-pinned hugetlb private page, CoW must happen during fork() to guarantee dma coherency. 
In this specific path, hugetlb pages need to be allocated for the child process. Stop using the avoid_reserve=1 flag here: it's not required, as dest_vma (which is destined to be a MAP_PRIVATE hugetlb vma) will have no private vma resv map, and that already makes sure it won't be able to use a vma reservation later. No functional change is intended with this change.

That said, it's still worth doing, so as to reduce the usage of avoid_reserve to its only remaining user, which is also why this flag was introduced initially in commit 04f2cbe35699 ("hugetlb: guarantee that COW faults for a process that called mmap(MAP_PRIVATE) on hugetlbfs will succeed"). I don't see who else should ever set it. A further patch will clean up resv accounting based on this.

Link: https://lkml.kernel.org/r/20250107204002.2683356-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Oscar Salvador Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fdfd68880e5d..d6609e03431f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5336,7 +5336,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + new_folio = alloc_hugetlb_folio(dst_vma, addr, 0); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); -- Gitee From d8f627e0f28b8d526f1c0482398c81b11c8f4367 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 13 Mar 2025 16:49:54 +0800 Subject: [PATCH 15/19] mm/hugetlb: rename avoid_reserve to cow_from_owner mainline inclusion from mainline-v6.14-rc1 commit 30cef82bc6e8975a360ec05b707f7fb194c875ed category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=30cef82bc6e8975a360ec05b707f7fb194c875ed --------------------------------

The old name "avoid_reserve" can be too generic and can easily be used wrongly in new call sites that want to allocate a hugetlb folio. It's confusing in two ways: (1) whether one can opt in to avoid the global reservation, and (2) whether it should take more than one count. In reality, this flag is only used in one extremely hacky way, in the hugetlb CoW path only, and always with the value 1, meaning "skip the global reservation". Rename the flag to avoid future abuse, making it a boolean to reflect that it is not a counter. To make it even harder to abuse, add a comment above the function to explain it.
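For reference, condensed from the hunks below, call sites after the rename look like this; hugetlb_wp() is the only place that can pass true:

  /* regular allocation paths: never skip the vma resv map */
  folio = alloc_hugetlb_folio(vma, vmf->address, false);

  /* hugetlb_wp() CoW-from-owner special case */
  folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner);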
Link: https://lkml.kernel.org/r/20250107204002.2683356-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Oscar Salvador Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton Conflicts: fs/hugetlbfs/inode.c include/linux/hugetlb.h mm/hugetlb.c [Context conflicts in fs/hugetlbfs/inode.c due to miss commit 10969b5571387047cd461a9c701b8b9cd007f6c7] [Context conflicts in include/linux/hugetlb.h due to miss commit 04f13d241b8b146b23038bffd907cb8278391d07] [Context conflicts in mm/hugetlb.c due to miss commit 04f13d241b8b146b23038bffd907cb8278391d07] [Context conflicts in mm/hugetlb.c due to hulk commit d5ea6f5f46c3831780a3bc815566a4238f13bb01] [Context conflicts in mm/hugetlb.c due to hulk commit 8ce9d44df8ec1c75e96b296f3c6dcbcd47d99188] [Context conflicts in include/linux/hugetlb.h due to miss commit 42d0c3fbb5811fbfb663d8ede1d7ffba02e7ae18] Signed-off-by: Jiaming Sun --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 35 +++++++++++++++++++++-------------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fb0d3db162c2..66a3b6dc96eb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -915,7 +915,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * to keep reservation accounting consistent. */ hugetlb_set_vma_policy(&pseudo_vma, inode, index); - folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, false); hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 63d225523ecf..61ccbc81b341 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -798,7 +798,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); + unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, @@ -1118,7 +1118,7 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, - int avoid_reserve) + bool cow_from_owner) { return NULL; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d6609e03431f..92d785e7c6b1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3133,8 +3133,15 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } +/* + * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW + * faults of hugetlb private mappings on top of a non-page-cache folio (in + * which case even if there's a private vma resv map it won't cover such + * allocation). New call sites should (probably) never set it to true!! + * When it's set, the allocation will bypass all vma level reservations. 
+ */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) + unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hugetlbfs_inode_info *info = HUGETLBFS_I(file_inode(vma->vm_file)); @@ -3164,7 +3171,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * Allocations for MAP_NORESERVE mappings also need to be * checked against any subpool limit. */ - if (map_chg || avoid_reserve) { + if (map_chg || cow_from_owner) { gbl_chg = hugepage_subpool_get_pages(spool, 1, info); if (gbl_chg < 0) goto out_end_reservation; @@ -3172,7 +3179,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, /* If this allocation is not consuming a reservation, charge it now. */ - deferred_reserve = map_chg || avoid_reserve; + deferred_reserve = map_chg || cow_from_owner; if (deferred_reserve) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); @@ -3187,7 +3194,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (file_in_dynamic_pool(info)) { bool reserved = false; - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) + if (!cow_from_owner && vma_has_reserves(vma, gbl_chg)) reserved = true; folio = dynamic_pool_alloc_hugepage(info, h, reserved); if (!folio) @@ -3210,7 +3217,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { + if (!cow_from_owner && vma_has_reserves(vma, gbl_chg)) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } @@ -3278,7 +3285,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg || avoid_reserve) + if (map_chg || cow_from_owner) hugepage_subpool_put_pages(spool, 1, info); out_end_reservation: vma_end_reservation(h, vma, addr); @@ -5336,7 +5343,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new_folio = alloc_hugetlb_folio(dst_vma, addr, 0); + new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); @@ -5761,7 +5768,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; - int outside_reserve = 0; + bool cow_from_owner = 0; vm_fault_t ret = 0; struct mmu_notifier_range range; @@ -5823,7 +5830,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) - outside_reserve = 1; + cow_from_owner = true; folio_get(old_folio); @@ -5832,7 +5839,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); - new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* @@ -5842,7 +5849,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. 
*/ - if (outside_reserve) { + if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; @@ -6093,7 +6100,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, goto out; } - folio = alloc_hugetlb_folio(vma, vmf->address, 0); + folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6525,7 +6532,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; @@ -6567,7 +6574,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; -- Gitee From 0465768ed148164087a98e33da5bdc3330a46b31 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 17 Mar 2025 10:40:12 +0800 Subject: [PATCH 16/19] mm/hugetlb: clean up map/global resv accounting when allocate mainline inclusion from mainline-v6.14-rc1 commit 923682a0dd57065aef21bc297bfa6a9101c5da83 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=923682a0dd57065aef21bc297bfa6a9101c5da83 --------------------------------

alloc_hugetlb_folio() isn't an easy function to read, especially its reservation accounting, either per-VMA or global (mostly, the spool). The first complexity lies in the special private CoW path, aka the cow_from_owner=true case. The second complexity is the confusing updates of gbl_chg after it is set once, which make it look like it can change anytime on the fly.

Logically, cow_from_owner is only about the vma reservation. We can decouple the flag and consolidate it into the map charge flag very early, so we don't need to keep checking the CoW special flag every time.

This patch does it by making map_chg a tri-state flag. Needing a tri-state is unfortunate; it's because vma_needs_reservation() currently has an internal side effect, so it must be followed by either an end() or a commit().

We keep the same semantic as before on one thing: "if (map_chg)" means we need a separate per-vma resv count. Most of the old code stays untouched with the new enum.

After this patch, we take these steps to decide these variables, hopefully slightly easier to follow:

- First, decide map_chg. This takes cow_from_owner into account, once and for all. It's about whether we could take a resv count from the vma, no matter whether it's shared, private, etc.

- Then, decide gbl_chg. The only difference here from map_chg is the spool.

Now each flag is updated only once and for all, instead of flipping around, which can be very hard to follow.

With cow_from_owner merged into map_chg, we can remove quite a few such checks all over. A side benefit is that we can get rid of one more confusing flag, deferred_reserve.

Clean up the comments a bit too. E.g., MAP_NORESERVE may not need to check against the spool limit, AFAIU, if it's on a shared mapping and the page cache folio has its inode's resv map available (in which case map_chg would have been set to zero, hence the code should be correct, not the comment).
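As a rough sketch (not lifted verbatim from the patch; cgroup charging, locking and error unwinding omitted), the decision order described above boils down to:

  if (cow_from_owner) {
          /* never consume the private vma resv map for this CoW */
          map_chg = MAP_CHG_ENFORCED;
  } else {
          retval = vma_needs_reservation(h, vma, addr);
          if (retval < 0)
                  return ERR_PTR(-ENOMEM);
          map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
  }

  /* gbl_chg only differs from map_chg by consulting the subpool */
  if (map_chg)
          gbl_chg = hugepage_subpool_get_pages(spool, 1, info);
  else
          gbl_chg = 0;

A positive gbl_chg then means the allocation still needs a page from the global pool.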
There's one trivial detail that needs attention that this patch touched, which is this check right after vma_commit_reservation(): if (map_chg > map_commit) It changes to: if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) It should behave the same like before, because previously the only way to make "map_chg > map_commit" happen is map_chg=1 && map_commit=0. That's exactly the rewritten line. Meanwhile, either commit() or end() will need to be skipped if ENFORCE, to keep the old behavior. Even though it looks a lot changed, but no functional change expected. Link: https://lkml.kernel.org/r/20250107204002.2683356-5-peterx@redhat.com Signed-off-by: Peter Xu Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton Conflicts: mm/hugetlb.c [Context conflicts in mm/hugetlb.c due to hulk commit d5ea6f5f46c3831780a3bc815566a4238f13bb01] [Context conflicts in mm/hugetlb.c due to hulk commit 8110c42f7fc1b39345ad0f1760da75f2bb1028c5] [Context conflicts in mm/hugetlb.c due to hulk commit 8ce9d44df8ec1c75e96b296f3c6dcbcd47d99188] [Context conflicts in mm/hugetlb.c due to miss commit 04f13d241b8b146b23038bffd907cb8278391d07] Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 112 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 34 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92d785e7c6b1..e292073f7823 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3133,6 +3133,25 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } +typedef enum { + /* + * For either 0/1: we checked the per-vma resv map, and one resv + * count either can be reused (0), or an extra needed (1). + */ + MAP_CHG_REUSE = 0, + MAP_CHG_NEEDED = 1, + /* + * Cannot use per-vma resv count can be used, hence a new resv + * count is enforced. + * + * NOTE: This is mostly identical to MAP_CHG_NEEDED, except + * that currently vma_needs_reservation() has an unwanted side + * effect to either use end() or commit() to complete the + * transaction. Hence it needs to differenciate from NEEDED. + */ + MAP_CHG_ENFORCED = 2, +} map_chg_state; + /* * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW * faults of hugetlb private mappings on top of a non-page-cache folio (in @@ -3147,40 +3166,59 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugetlbfs_inode_info *info = HUGETLBFS_I(file_inode(vma->vm_file)); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit; - long gbl_chg; + long retval, gbl_chg; + map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; - bool deferred_reserve; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; idx = hstate_index(h); - /* - * Examine the region/reserve map to determine if the process - * has a reservation for the page to be allocated. A return - * code of zero indicates a reservation exists (no change). - */ - map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) - return ERR_PTR(-ENOMEM); + + /* Whether we need a separate per-vma reservation? */ + if (cow_from_owner) { + /* + * Special case! Since it's a CoW on top of a reserved + * page, the private resv map doesn't count. So it cannot + * consume the per-vma resv map even if it's reserved. + */ + map_chg = MAP_CHG_ENFORCED; + } else { + /* + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. 
A return + * code of zero indicates a reservation exists (no change). + */ + retval = vma_needs_reservation(h, vma, addr); + if (retval < 0) + return ERR_PTR(-ENOMEM); + map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE; + } /* + * Whether we need a separate global reservation? + * * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. - * Allocations for MAP_NORESERVE mappings also need to be - * checked against any subpool limit. + * Or if it can get one from the pool reservation directly. */ - if (map_chg || cow_from_owner) { + if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1, info); if (gbl_chg < 0) goto out_end_reservation; + } else { + /* + * If we have the vma reservation ready, no need for extra + * global reservation. + */ + gbl_chg = 0; } - /* If this allocation is not consuming a reservation, charge it now. + /* + * If this allocation is not consuming a per-vma reservation, + * charge the hugetlb cgroup now. */ - deferred_reserve = map_chg || cow_from_owner; - if (deferred_reserve) { + if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) @@ -3194,7 +3232,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (file_in_dynamic_pool(info)) { bool reserved = false; - if (!cow_from_owner && vma_has_reserves(vma, gbl_chg)) + if (vma_has_reserves(vma, gbl_chg)) reserved = true; folio = dynamic_pool_alloc_hugepage(info, h, reserved); if (!folio) @@ -3217,7 +3255,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!cow_from_owner && vma_has_reserves(vma, gbl_chg)) { + if (vma_has_reserves(vma, gbl_chg)) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } @@ -3231,7 +3269,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ - if (deferred_reserve) { + if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } @@ -3241,26 +3279,31 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (!page_from_dynamic_pool(folio_page(folio, 0))) hugetlb_set_folio_subpool(folio, spool); - map_commit = vma_commit_reservation(h, vma, addr); - if (unlikely(map_chg > map_commit)) { + if (map_chg != MAP_CHG_ENFORCED) { + /* commit() is only needed if the map_chg is not enforced */ + retval = vma_commit_reservation(h, vma, addr); /* + * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND - * in hugetlb_reserve_pages for the same page. Also, + * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. 
*/ - long rsv_adjust; + if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { + long rsv_adjust; - rsv_adjust = hugepage_subpool_put_pages(spool, 1, info); - hugetlb_acct_memory(h, -rsv_adjust, info); - if (deferred_reserve) { - spin_lock_irq(&hugetlb_lock); - hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), - pages_per_huge_page(h), folio); - spin_unlock_irq(&hugetlb_lock); + rsv_adjust = hugepage_subpool_put_pages(spool, 1, info); + hugetlb_acct_memory(h, -rsv_adjust, info); + if (map_chg) { + spin_lock_irq(&hugetlb_lock); + hugetlb_cgroup_uncharge_folio_rsvd( + hstate_index(h), pages_per_huge_page(h), + folio); + spin_unlock_irq(&hugetlb_lock); + } } } @@ -3281,14 +3324,15 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: - if (deferred_reserve) + if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg || cow_from_owner) + if (map_chg) hugepage_subpool_put_pages(spool, 1, info); out_end_reservation: - vma_end_reservation(h, vma, addr); + if (map_chg != MAP_CHG_ENFORCED) + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } -- Gitee From 9bbe4542563f65ab0d33a64e40def96238083e22 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 17 Mar 2025 11:37:42 +0800 Subject: [PATCH 17/19] mm/hugetlb: simplify vma_has_reserves() mainline inclusion from mainline-v6.14-rc1 commit 51e1de00acdb75585c46654147c7c7eb0689a068 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=51e1de00acdb75585c46654147c7c7eb0689a068 -------------------------------- vma_has_reserves() is a helper "trying" to know whether the vma should consume one reservation when allocating the hugetlb folio. However it's not clear on why we need such complexity, as such information is already represented in the "chg" variable. From alloc_hugetlb_folio() context, "chg" (or in the function's context, "gbl_chg") is defined as: - If gbl_chg=1, the allocation cannot reuse an existing reservation - If gbl_chg=0, the allocation should reuse an existing reservation Firstly, map_chg is defined as following, to cover all cases of hugetlb reservation scenarios (mostly, via vma_needs_reservation(), but cow_from_owner is an outlier): CONDITION HAS RESERVATION? ========= ================ - SHARED: always check against per-inode resv_map (ignore NONRESERVE) - If resv exists ==> YES [1] - If not ==> NO [2] - PRIVATE: complicated... - Request came from a CoW from owner resv map ==> NO [3] (when cow_from_owner==true) - If does not own a resv_map at all.. ==> NO [4] (examples: VM_NORESERVE, private fork()) - If owns a resv_map, but resv donsn't exists ==> NO [5] - If owns a resv_map, and resv exists ==> YES [6] Further on, gbl_chg considered spool setup, so that is a decision based on all the context. 
If we look at vma_has_reserves(), it almost does check that has already been processed by map_chg accounting (I marked each return value to the case above): static bool vma_has_reserves(struct vm_area_struct *vma, long chg) { if (vma->vm_flags & VM_NORESERVE) { if (vma->vm_flags & VM_MAYSHARE && chg == 0) return true; ==> [1] else return false; ==> [2] or [4] } if (vma->vm_flags & VM_MAYSHARE) { if (chg) return false; ==> [2] else return true; ==> [1] } if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { if (chg) return false; ==> [5] else return true; ==> [6] } return false; ==> [4] } It didn't check [3], but [3] case was actually already covered now by the "chg" / "gbl_chg" / "map_chg" calculations. In short, vma_has_reserves() doesn't provide anything more than return "!chg".. so just simplify all the things. There're a lot of comments describing truncation races, IIUC there should have no race as long as map_chg is properly done. Link: https://lkml.kernel.org/r/20250107204002.2683356-6-peterx@redhat.com Signed-off-by: Peter Xu Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton Conflicts: mm/hugetlb.c [Context conflicts in mm/hugetlb.c due to hulk commit 4bd64dfd060365b770e343e63a184c9db2ea3c88] [Context conflicts in mm/hugetlb.c due to hulk commit 8ce9d44df8ec1c75e96b296f3c6dcbcd47d99188] Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 69 ++++++---------------------------------------------- 1 file changed, 8 insertions(+), 61 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e292073f7823..903887e835d9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1255,66 +1255,13 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static bool vma_has_reserves(struct vm_area_struct *vma, long chg) +static bool vma_has_reserves(long chg) { - if (vma->vm_flags & VM_NORESERVE) { - /* - * This address is already reserved by other process(chg == 0), - * so, we should decrement reserved count. Without decrementing, - * reserve count remains after releasing inode, because this - * allocated page will go into page cache and is regarded as - * coming from reserved pool in releasing step. Currently, we - * don't have any other solution to deal with this situation - * properly, so add work-around here. - */ - if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return true; - else - return false; - } - - /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) { - /* - * We know VM_NORESERVE is not set. Therefore, there SHOULD - * be a region map for all pages. The only situation where - * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reserves to - * use. This situation is indicated if chg != 0. - */ - if (chg) - return false; - else - return true; - } - /* - * Only the process that called mmap() has reserves for - * private mappings. + * Now "chg" has all the conditions considered for whether we + * should use an existing reservation. */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Like the shared case above, a hole punch or truncate - * could have been performed on the private mapping. - * Examine the value of chg to determine if reserves - * actually exist or were previously consumed. - * Very Subtle - The value of chg comes from a previous - * call to vma_needs_reserves(). 
The reserve map for - * private mappings has different (opposite) semantics - * than that of shared mappings. vma_needs_reserves() - * has already taken this difference in semantics into - * account. Therefore, the meaning of chg is the same - * as in the shared case above. Code could easily be - * combined, but keeping it separate draws attention to - * subtle differences. - */ - if (chg) - return false; - else - return true; - } - - return false; + return chg == 0; } void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) @@ -1410,7 +1357,7 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". The child may still get SIGKILLed */ - if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) + if (!vma_has_reserves(chg) && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1428,7 +1375,7 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && vma_has_reserves(vma, chg)) { + if (folio && vma_has_reserves(chg)) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } @@ -3232,7 +3179,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (file_in_dynamic_pool(info)) { bool reserved = false; - if (vma_has_reserves(vma, gbl_chg)) + if (vma_has_reserves(gbl_chg)) reserved = true; folio = dynamic_pool_alloc_hugepage(info, h, reserved); if (!folio) @@ -3255,7 +3202,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (vma_has_reserves(vma, gbl_chg)) { + if (vma_has_reserves(gbl_chg)) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } -- Gitee From 198736e74462155b592742fb69592048a4d1cadc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 17 Mar 2025 13:40:06 +0800 Subject: [PATCH 18/19] mm/hugetlb: drop vma_has_reserves() mainline inclusion from mainline-v6.14-rc1 commit 72d8f72631d278c7003defbe3fe88fe5332822f8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=72d8f72631d278c7003defbe3fe88fe5332822f8 -------------------------------- After the previous cleanup, vma_has_reserves() is mostly an empty helper except that it says "use reserve count" is inverted meaning from "needs a global reserve count", which is still true. To avoid confusions on having two inverted ways to ask the same question, always use the gbl_chg everywhere, and drop the function. When at it, rename "chg" to "gbl_chg" in dequeue_hugetlb_folio_vma(). It might be helpful for readers to see that the "chg" here is the global reserve count, not the vma resv count. 
Link: https://lkml.kernel.org/r/20250107204002.2683356-7-peterx@redhat.com Signed-off-by: Peter Xu Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton Conflicts: mm/hugetlb.c [Context conflicts in mm/hugetlb.c due to hulk commit 4bd64dfd060365b770e343e63a184c9db2ea3c88] [Context conflicts in mm/hugetlb.c due to hulk commit 8ce9d44df8ec1c75e96b296f3c6dcbcd47d99188] Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 903887e835d9..5cf3860d52c7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1254,16 +1254,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) hugetlb_dup_vma_private(vma); } -/* Returns true if the VMA has associated reserve pages */ -static bool vma_has_reserves(long chg) -{ - /* - * Now "chg" has all the conditions considered for whether we - * should use an existing reservation. - */ - return chg == 0; -} - void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); @@ -1344,7 +1334,7 @@ static unsigned long available_huge_pages(struct hstate *h) static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, long chg) + unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; @@ -1353,11 +1343,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, int nid; /* - * A child process with MAP_PRIVATE mappings created by their parent - * have no page reserves. This check ensures that reservations are - * not "stolen". The child may still get SIGKILLed + * gbl_chg==1 means the allocation requires a new page that was not + * reserved before. Making sure there's at least one free page. 
*/ - if (!vma_has_reserves(chg) && !available_huge_pages(h)) + if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1375,7 +1364,7 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && vma_has_reserves(chg)) { + if (folio && !gbl_chg) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } @@ -3179,7 +3168,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (file_in_dynamic_pool(info)) { bool reserved = false; - if (vma_has_reserves(gbl_chg)) + if (!gbl_chg) reserved = true; folio = dynamic_pool_alloc_hugepage(info, h, reserved); if (!folio) @@ -3202,7 +3191,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (vma_has_reserves(gbl_chg)) { + if (!gbl_chg) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } -- Gitee From 4f828c75a466a0a0fc4b2c093ab52413efe231bf Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 17 Mar 2025 13:53:13 +0800 Subject: [PATCH 19/19] mm/hugetlb: unify restore reserve accounting for new allocations mainline inclusion from mainline-v6.14-rc1 commit f931af2e41ab406e7fc3063ba5a73b536779006e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBMW50 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f931af2e41ab406e7fc3063ba5a73b536779006e -------------------------------- Either hugetlb pages dequeued from hstate, or newly allocated from buddy, would require restore-reserve accounting to be managed properly. Merge the two paths on it. Add a small comment to make it slightly nicer. Link: https://lkml.kernel.org/r/20250107204002.2683356-8-peterx@redhat.com Signed-off-by: Peter Xu Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton Conflicts: mm/hugetlb.c [Context conflicts in mm/hugetlb.c due to hulk commit 8ce9d44df8ec1c75e96b296f3c6dcbcd47d99188] Signed-off-by: Jiaming Sun --- mm/hugetlb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5cf3860d52c7..524f6d737d60 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1364,11 +1364,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && !gbl_chg) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } - mpol_cond_put(mpol); return folio; @@ -3191,15 +3186,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!gbl_chg) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } + /* + * Either dequeued or buddy-allocated folio needs to add special + * mark to the folio when it consumes a global reservation. + */ + if (!gbl_chg) { + folio_set_hugetlb_restore_reserve(folio); + h->resv_huge_pages--; + } + out: hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the -- Gitee