From e48a24d2f2f10a4b680fbb407f1b26ab41844de0 Mon Sep 17 00:00:00 2001
From: Dev Jain
Date: Thu, 14 Aug 2025 13:14:45 +0800
Subject: [PATCH 1/3] mm: call pointers to ptes as ptep

mainline inclusion
from mainline-v6.17-rc1
commit 94dab12d86cf77ff0b8f667dc98af6d997422cb4
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC8PQW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=94dab12d86cf77ff0b8f667dc98af6d997422cb4

--------------------------------

Patch series "Optimize mremap() for large folios", v4.

Currently move_ptes() iterates through ptes one by one.  If the
underlying folio mapped by the ptes is large, we can process those ptes
in a batch using folio_pte_batch(), thus clearing and setting the PTEs
in one go.  For arm64 specifically, this results in a 16x reduction in
the number of ptep_get() calls (since, on a contig block, ptep_get() on
arm64 will iterate through all 16 entries to collect a/d bits), and we
also elide extra TLBIs by using get_and_clear_full_ptes() in place of
ptep_get_and_clear().

Mapping 1M of memory with 64K folios, memsetting it, remapping it to
src + 1M, and munmapping it 10,000 times, the average execution time
reduces from 1.9 to 1.2 seconds, giving a 37% performance improvement,
on Apple M3 (arm64).  No regression is observed for small folios.

Test program for reference:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define SIZE (1UL << 20)	/* 1M */

int main(void)
{
	void *new_addr, *addr;

	for (int i = 0; i < 10000; ++i) {
		addr = mmap((void *)(1UL << 30), SIZE, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		memset(addr, 0xAA, SIZE);

		new_addr = mremap(addr, SIZE, SIZE,
				  MREMAP_MAYMOVE | MREMAP_FIXED, addr + SIZE);
		if (new_addr != (addr + SIZE)) {
			perror("mremap");
			return 1;
		}
		munmap(new_addr, SIZE);
	}
	return 0;
}

This patch (of 2):

Avoid confusion between pte_t* and pte_t data types by suffixing pointer
type variables with p.  No functional change.

Link: https://lkml.kernel.org/r/20250610035043.75448-1-dev.jain@arm.com
Link: https://lkml.kernel.org/r/20250610035043.75448-2-dev.jain@arm.com
Signed-off-by: Dev Jain
Reviewed-by: Barry Song
Reviewed-by: Anshuman Khandual
Reviewed-by: Lorenzo Stoakes
Acked-by: David Hildenbrand
Reviewed-by: Pedro Falcato
Cc: Bang Li
Cc: Baolin Wang
Cc: bibo mao
Cc: Hugh Dickins
Cc: Ingo Molnar
Cc: Jann Horn
Cc: Lance Yang
Cc: Liam Howlett
Cc: Matthew Wilcox (Oracle)
Cc: Peter Xu
Cc: Qi Zheng
Cc: Ryan Roberts
Cc: Vlastimil Babka
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
Conflicts:
	mm/mremap.c
[Context conflicts in mremap.c due to missing commits 838d02354464,
b36b701bbcd9 and 0cef0bb836e3]
Signed-off-by: Wang Lian
---
 mm/mremap.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index e990bb8c8918..60626189199f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -141,7 +141,8 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long new_addr, bool need_rmap_locks)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	pte_t *old_pte, *new_pte, pte;
+	pte_t *old_ptep, *new_ptep;
+	pte_t pte;
 	spinlock_t *old_ptl, *new_ptl;
 	bool force_flush = false;
 	unsigned long len = old_end - old_addr;
@@ -172,14 +173,14 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	 * We don't have to worry about the ordering of src and dst
 	 * pte locks because exclusive mmap_lock prevents deadlock.
 	 */
-	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
-	if (!old_pte) {
+	old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+	if (!old_ptep) {
 		err = -EAGAIN;
 		goto out;
 	}
-	new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
-	if (!new_pte) {
-		pte_unmap_unlock(old_pte, old_ptl);
+	new_ptep = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
+	if (!new_ptep) {
+		pte_unmap_unlock(old_ptep, old_ptl);
 		err = -EAGAIN;
 		goto out;
 	}
@@ -188,12 +189,12 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 
-	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
-				   new_pte++, new_addr += PAGE_SIZE) {
-		if (pte_none(ptep_get(old_pte)))
+	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
+				   new_ptep++, new_addr += PAGE_SIZE) {
+		if (pte_none(ptep_get(old_ptep)))
 			continue;
 
-		pte = ptep_get_and_clear(mm, old_addr, old_pte);
+		pte = ptep_get_and_clear(mm, old_addr, old_ptep);
 		/*
 		 * If we are remapping a valid PTE, make sure
 		 * to flush TLB before we drop the PTL for the
@@ -209,7 +210,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 			force_flush = true;
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		pte = move_soft_dirty_pte(pte);
-		set_pte_at(mm, new_addr, new_pte, pte);
+		set_pte_at(mm, new_addr, new_ptep, pte);
 	}
 
 	arch_leave_lazy_mmu_mode();
@@ -217,8 +218,8 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		flush_tlb_range(vma, old_end - len, old_end);
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
-	pte_unmap(new_pte - 1);
-	pte_unmap_unlock(old_pte - 1, old_ptl);
+	pte_unmap(new_ptep - 1);
+	pte_unmap_unlock(old_ptep - 1, old_ptl);
 out:
 	if (need_rmap_locks)
 		drop_rmap_locks(vma);
--
Gitee

From e10dfc94ed9ff89fa604c822b377ca0a416ff7eb Mon Sep 17 00:00:00 2001
From: Dev Jain
Date: Thu, 14 Aug 2025 14:12:19 +0800
Subject: [PATCH 2/3] mm: optimize mremap() by PTE batching

mainline inclusion
from mainline-v6.17-rc1
commit f822a9a81a31311d67f260aea96005540b18ab07
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC8PQW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f822a9a81a31311d67f260aea96005540b18ab07

--------------------------------

Use folio_pte_batch() to optimize move_ptes().  On arm64, if the ptes
are painted with the contig bit, then ptep_get() will iterate through
all 16 entries to collect a/d bits.  Hence this optimization will result
in a 16x reduction in the number of ptep_get() calls.  Next,
ptep_get_and_clear() will eventually call contpte_try_unfold() on every
contig block, thus flushing the TLB for the complete large folio range.
Instead, use get_and_clear_full_ptes() so as to elide TLBIs on each
contig block, and only do them on the starting and ending contig block.

For split folios, there will be no pte batching; nr_ptes will be 1.  For
pagetable splitting, the ptes will still point to the same large folio;
for arm64, this results in the optimization described above, and for
other arches (including the general case), a minor improvement is
expected due to a reduction in the number of function calls.
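For illustration only (an editor's sketch, not part of the original
submission), the batched loop structure described above can be modelled
in plain userspace C; the array contents, folio ids and helper name below
are invented for the example:

/*
 * Editor's illustration - not kernel code.  PTEs are modelled as an array
 * of folio ids; consecutive entries that map the same folio are handled
 * with one "batch" operation, mirroring how move_ptes() now advances by
 * nr_ptes per iteration instead of one PTE at a time.
 */
#include <stdio.h>

#define NR_PTES 16

/* Toy stand-in for folio_pte_batch(): length of the run starting at idx. */
static int pte_batch(const int *ptes, int idx, int max_nr)
{
	int nr = 1;

	while (nr < max_nr && ptes[idx + nr] == ptes[idx])
		nr++;
	return nr;
}

int main(void)
{
	/* Four 4-entry runs, e.g. 16K folios on a 4K page size. */
	int ptes[NR_PTES] = { 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4 };
	int nr = 1;

	for (int i = 0; i < NR_PTES; i += nr) {
		nr = pte_batch(ptes, i, NR_PTES - i);
		/* One batched clear/set instead of nr individual calls. */
		printf("move ptes [%d, %d) of folio %d in one batch\n",
		       i, i + nr, ptes[i]);
	}
	return 0;
}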
Link: https://lkml.kernel.org/r/20250610035043.75448-3-dev.jain@arm.com
Signed-off-by: Dev Jain
Reviewed-by: Barry Song
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Pedro Falcato
Cc: Anshuman Khandual
Cc: Bang Li
Cc: Baolin Wang
Cc: bibo mao
Cc: David Hildenbrand
Cc: Hugh Dickins
Cc: Ingo Molnar
Cc: Jann Horn
Cc: Lance Yang
Cc: Liam Howlett
Cc: Matthew Wilcox (Oracle)
Cc: Peter Xu
Cc: Qi Zheng
Cc: Ryan Roberts
Cc: Vlastimil Babka
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
Signed-off-by: Wang Lian
---
 mm/mremap.c | 39 ++++++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 60626189199f..d959bab9ca83 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -135,6 +135,23 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 	return pte;
 }
 
+static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
+		pte_t *ptep, pte_t pte, int max_nr)
+{
+	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	struct folio *folio;
+
+	if (max_nr == 1)
+		return 1;
+
+	folio = vm_normal_folio(vma, addr, pte);
+	if (!folio || !folio_test_large(folio))
+		return 1;
+
+	return folio_pte_batch(folio, addr, ptep, pte, max_nr, flags, NULL,
+			       NULL, NULL);
+}
+
 static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long old_addr, unsigned long old_end,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
@@ -142,10 +159,12 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_ptep, *new_ptep;
-	pte_t pte;
+	pte_t old_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
 	bool force_flush = false;
 	unsigned long len = old_end - old_addr;
+	int max_nr_ptes;
+	int nr_ptes;
 	int err = 0;
 
 	/*
@@ -189,12 +208,14 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 
-	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
-				   new_ptep++, new_addr += PAGE_SIZE) {
-		if (pte_none(ptep_get(old_ptep)))
+	for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
+			new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
+		nr_ptes = 1;
+		max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT;
+		old_pte = ptep_get(old_ptep);
+		if (pte_none(old_pte))
 			continue;
 
-		pte = ptep_get_and_clear(mm, old_addr, old_ptep);
 		/*
 		 * If we are remapping a valid PTE, make sure
 		 * to flush TLB before we drop the PTL for the
@@ -206,11 +227,15 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		 * the TLB entry for the old mapping has been
 		 * flushed.
 		 */
-		if (pte_present(pte))
+		if (pte_present(old_pte)) {
+			nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep,
+							 old_pte, max_nr_ptes);
 			force_flush = true;
+		}
+		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0);
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		pte = move_soft_dirty_pte(pte);
-		set_pte_at(mm, new_addr, new_ptep, pte);
+		set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
 	}
 
 	arch_leave_lazy_mmu_mode();
--
Gitee

From 4347e8a8360883438794eb58f297086bc69a0396 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes
Date: Fri, 15 Aug 2025 08:58:11 +0800
Subject: [PATCH 3/3] mm/mremap: avoid expensive folio lookup on mremap folio pte batch

mainline inclusion
from mainline-v6.17-rc1
commit 0b5be138ce00f421bd7cc5a226061bd62c4ab850
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IC8PQW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0b5be138ce00f421bd7cc5a226061bd62c4ab850

--------------------------------

It was discovered in the attached report that commit f822a9a81a31 ("mm:
optimize mremap() by PTE batching") introduced a significant performance
regression on a number of metrics on x86-64, most notably
stress-ng.bigheap.realloc_calls_per_sec - indicating a 37.3% regression
in the number of mremap() calls per second.

I was able to reproduce this locally on an Intel x86-64 Raptor Lake
system, noting an average of 143,857 realloc calls/sec (with a stddev of
4,531 or 3.1%) prior to this patch being applied, and 81,503 afterwards
(stddev of 2,131 or 2.6%) - a 43.3% regression.

During testing I was able to determine that neither optimising the
folio_pte_batch() operation nor checking folio_test_large() first made
any meaningful difference.  This is within expectation, as a regression
this large is likely to indicate we are accessing memory that is not yet
in a cache line (and perhaps may even cause a main memory fetch).

The expectation by those discussing this from the start was that
vm_normal_folio() (invoked by mremap_folio_pte_batch()) would likely be
the culprit, due to having to retrieve memory from the vmemmap (which
mremap() page table moves do not otherwise touch, meaning this is
inevitably cold memory).  I was able to definitively determine that this
theory is indeed correct and is the cause of the issue.

The solution is to restore part of an approach previously discarded on
review, that is to invoke pte_batch_hint(), which explicitly determines,
through reference to the PTE alone (thus no vmemmap lookup), what the
PTE batch size may be.  On platforms other than arm64 this is currently
hardcoded to return 1, so this naturally resolves the issue for x86-64,
and for arm64 it introduces little to no overhead as the pte cache line
will be hot.

With this patch applied, we move from 81,503 realloc calls/sec to
138,701 (stddev of 496.1 or 0.4%), which is a 3.6% regression; however,
accounting for the variance in the original result, this broadly
restores performance to its prior state.
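As a rough reproduction aid (an editor's sketch, not the lkp/stress-ng
methodology; the buffer size, growth step and call count below are
invented assumptions), a userspace loop that keeps growing a large heap
buffer with realloc() - which glibc will typically service via mremap()
for mmap-backed chunks - can approximate the realloc-calls-per-second
figure discussed above:

/*
 * Editor's sketch - a rough approximation of the regressed metric, not
 * the stress-ng/lkp methodology.  All sizes and counts are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define START_SIZE	(16UL << 20)	/* 16M: large enough to be mmap-backed */
#define GROW_BY		4096UL		/* grow by one page per call */
#define CALLS		50000UL

int main(void)
{
	struct timespec start, end;
	size_t size = START_SIZE;
	char *buf = malloc(size);
	double secs;

	if (!buf) {
		perror("malloc");
		return 1;
	}
	memset(buf, 0xAA, size);

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (unsigned long i = 0; i < CALLS; ++i) {
		size += GROW_BY;
		buf = realloc(buf, size);
		if (!buf) {
			perror("realloc");
			return 1;
		}
		/* Touch the newly added tail so the pages are populated. */
		memset(buf + size - GROW_BY, 0xAA, GROW_BY);
	}
	clock_gettime(CLOCK_MONOTONIC, &end);

	secs = (end.tv_sec - start.tv_sec) +
	       (end.tv_nsec - start.tv_nsec) / 1e9;
	printf("%.0f realloc calls/sec\n", CALLS / secs);
	free(buf);
	return 0;
}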
Link: https://lkml.kernel.org/r/20250807185819.199865-1-lorenzo.stoakes@oracle.com
Fixes: f822a9a81a31 ("mm: optimize mremap() by PTE batching")
Signed-off-by: Lorenzo Stoakes
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-lkp/202508071609.4e743d7c-lkp@intel.com
Acked-by: David Hildenbrand
Acked-by: Pedro Falcato
Reviewed-by: Barry Song
Acked-by: Vlastimil Babka
Reviewed-by: Dev Jain
Cc: Ryan Roberts
Cc: Barry Song
Cc: Jann Horn
Cc: Liam Howlett
Signed-off-by: Andrew Morton
Signed-off-by: Wang Lian
---
 mm/mremap.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/mremap.c b/mm/mremap.c
index d959bab9ca83..41434aeee392 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -144,6 +144,10 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr
 	if (max_nr == 1)
 		return 1;
 
+	/* Avoid expensive folio lookup if we stand no chance of benefit. */
+	if (pte_batch_hint(ptep, pte) == 1)
+		return 1;
+
 	folio = vm_normal_folio(vma, addr, pte);
 	if (!folio || !folio_test_large(folio))
 		return 1;
--
Gitee