From 5d25b415da2f495913c6bfa319935cfa484fb31a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Jun 2025 10:08:18 +0800 Subject: [PATCH 01/12] mm/madvise: don't perform madvise VMA walk for MADV_POPULATE_(READ|WRITE) mainline inclusion from mainline-v6.9-rc1 commit fa9fcd8bb6e4901e5ed0c7972547403d4c8dfec8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fa9fcd8bb6e4901e5ed0c7972547403d4c8dfec8 -------------------------------- We changed faultin_page_range() to no longer consume a VMA, because faultin_page_range() might internally release the mm lock to lookup the VMA again -- required to cleanly handle VM_FAULT_RETRY. But independent of that, __get_user_pages() will always lookup the VMA itself. Now that we let __get_user_pages() just handle VMA checks in a way that is suitable for MADV_POPULATE_(READ|WRITE), the VMA walk in madvise() is just overhead. So let's just call madvise_populate() on the full range instead. There is one change in behavior: madvise_walk_vmas() would skip any VMA holes, and if everything succeeded, it would return -ENOMEM after processing all VMAs. However, for MADV_POPULATE_(READ|WRITE) it's unlikely for the caller to notice any difference: -ENOMEM might either indicate that there were VMA holes or that populating page tables failed because there was not enough memory. So it's unlikely that user space will notice the difference, and that special handling likely only makes sense for some other madvise() actions. Further, we'd already fail with -ENOMEM early in the past if looking up the VMA after dropping the MM lock failed because of concurrent VMA modifications. So let's just keep it simple and avoid the madvise VMA walk, and consistently fail early if we find a VMA hole. Link: https://lkml.kernel.org/r/20240314161300.382526-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Darrick J. Wong Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Wang Lian --- mm/madvise.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 0ad321350cfe..4f28a88fb466 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -982,26 +982,19 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, return -EINVAL; } -static long madvise_populate(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - int behavior) +static long madvise_populate(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior) { const bool write = behavior == MADV_POPULATE_WRITE; - struct mm_struct *mm = vma->vm_mm; int locked = 1; long pages; - *prev = vma; - while (start < end) { /* Populate (prefault) page tables readable/writable. 
*/ pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; - *prev = NULL; - vma = NULL; } if (pages < 0) { switch (pages) { @@ -1102,9 +1095,6 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, behavior); - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - return madvise_populate(vma, prev, start, end, behavior); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1522,8 +1512,16 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh end = start + len; blk_start_plug(&plug); - error = madvise_walk_vmas(mm, start, end, behavior, - madvise_vma_behavior); + switch (behavior) { + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + error = madvise_populate(mm, start, end, behavior); + break; + default: + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); + break; + } blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); -- Gitee From 04141cb13dead63574151b90b75414fb884465da Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 09:21:30 +0800 Subject: [PATCH 02/12] mm/madvise: unrestrict process_madvise() for current process mainline inclusion from mainline-v6.10-rc1 commit 021781b01275c07cd5b7d3e4e8afc2bdf2429a84 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=021781b01275c07cd5b7d3e4e8afc2bdf2429a84 -------------------------------- The process_madvise() call was introduced in commit ecb8ac8b1f14 ("mm/madvise: introduce process_madvise() syscall: an external memory hinting API") as a means of performing madvise() operations on another process. However, as it provides the means by which to perform multiple madvise() operations in a batch via an iovec, it is useful to utilise the same interface for performing operations on the current process rather than a remote one. Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check if same mm") removed the need for a caller invoking process_madvise() on its own pidfd to possess the CAP_SYS_NICE capability, however this leaves the restrictions on operation in place. Resolve this by only applying the restriction on operations when accessing a remote process. Moving forward we plan to implement a simpler means of specifying this condition other than needing to establish a self pidfd, perhaps in the form of a sentinel pidfd. Also take the opportunity to refactor the system call implementation abstracting the vectorised operation. Link: https://lkml.kernel.org/r/20240926151019.82902-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Arnd Bergmann Cc: Christian Brauner Cc: "Liam R. Howlett" Cc: Minchan Kim Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton Signed-off-by: Wang Lian --- mm/madvise.c | 55 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 4f28a88fb466..5d4c194a5e0c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1265,7 +1265,8 @@ madvise_behavior_valid(int behavior) } } -static bool process_madvise_behavior_valid(int behavior) +/* Can we invoke process_madvise() on a remote mm for the specified behavior? 
*/ +static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: @@ -1536,6 +1537,28 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) return do_madvise(current->mm, start, len_in, behavior); } +/* Perform an madvise operation over a vector of addresses and lengths. */ +static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, + int behavior) +{ + ssize_t ret = 0; + size_t total_len; + + total_len = iov_iter_count(iter); + + while (iov_iter_count(iter)) { + ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), + iter_iov_len(iter), behavior); + if (ret < 0) + break; + iov_iter_advance(iter, iter_iov_len(iter)); + } + + ret = (total_len - iov_iter_count(iter)) ? : ret; + + return ret; +} + SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { @@ -1545,7 +1568,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; - size_t total_len; unsigned int f_flags; if (flags != 0) { @@ -1563,11 +1585,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto free_iov; } - if (!process_madvise_behavior_valid(behavior)) { - ret = -EINVAL; - goto release_task; - } - /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) { @@ -1575,26 +1592,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } + /* + * We need only perform this check if we are attempting to manipulate a + * remote process's address space. + */ + if (mm != current->mm && !process_madvise_remote_valid(behavior)) { + ret = -EINVAL; + goto release_mm; + } + /* * Require CAP_SYS_NICE for influencing process performance. Note that - * only non-destructive hints are currently supported. + * only non-destructive hints are currently supported for remote + * processes. */ if (!capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } - total_len = iov_iter_count(&iter); - - while (iov_iter_count(&iter)) { - ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), - iter_iov_len(&iter), behavior); - if (ret < 0) - break; - iov_iter_advance(&iter, iter_iov_len(&iter)); - } - - ret = (total_len - iov_iter_count(&iter)) ? : ret; + ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); -- Gitee From a1752923bdafdb4affbb1e7f9b604f6e0aace3ec Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 16 Jun 2025 10:18:54 +0800 Subject: [PATCH 03/12] mm/madvise: split out mmap locking operations for madvise() mainline inclusion from mainline-v6.15-rc1 commit 4cc39f91ef6c6f876651eb231974a59ffbcb3a21 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4cc39f91ef6c6f876651eb231974a59ffbcb3a21 -------------------------------- Patch series "mm/madvise: remove redundant mmap_lock operations from process_madvise()". process_madvise() calls do_madvise() for each address range. Then, each do_madvise() invocation holds and releases same mmap_lock. Optimize the redundant lock operations by splitting do_madvise() internal logic including the mmap_lock operations, and calling the small logic directly from process_madvise() in a sequence that removes the redundant locking. 
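(For reference, the batched call pattern in question, as a minimal userspace
sketch; error handling is elided, and the raw syscalls are used on the
assumption that SYS_pidfd_open/SYS_process_madvise are exposed by the libc
headers.  Note that applying the destructive MADV_DONTNEED hint to one's own
mm via process_madvise() also depends on the previous patch in this series
lifting the remote-only behavior restriction.)

	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <sys/uio.h>
	#include <unistd.h>

	#define NR_RANGES	16
	#define RANGE_SZ	4096UL

	int main(void)
	{
		struct iovec vec[NR_RANGES];
		char *buf;
		int pidfd, i;
		ssize_t ret;

		buf = mmap(NULL, NR_RANGES * RANGE_SZ, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		pidfd = syscall(SYS_pidfd_open, getpid(), 0);

		for (i = 0; i < NR_RANGES; i++) {
			vec[i].iov_base = buf + i * RANGE_SZ;
			vec[i].iov_len = RANGE_SZ;
		}

		/*
		 * One syscall for all ranges; with this series, the repeated
		 * per-range mmap_lock hold/release also goes away.
		 */
		ret = syscall(SYS_process_madvise, pidfd, vec, NR_RANGES,
			      MADV_DONTNEED, 0);
		return ret < 0 ? 1 : 0;
	}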
As a result of this change, process_madvise() becomes more efficient and
less racy in terms of its results and latency.

Note that the potential downside of this series is that other mmap_lock
holders may take more time due to the increased length of the mmap_lock
critical section for process_madvise() calls.  But there is a maximum
limit in the kernel (IOV_MAX), and userspace can control the critical
section length by setting the request size.  Hence, the downside would be
limited and controllable.

Evaluation
==========

I measured the time to apply MADV_DONTNEED advice to 256 MiB memory using
multiple madvise() calls, 4 KiB per call.  I also did the same with
process_madvise(), but with varying batch sizes (vlen) from 1 to 1024.
The source code for the measurement is available at GitHub[1].  Because
the microbenchmark result is not that stable, I ran each configuration
five times and used the average.

The measurement results are as below.  The 'sz_batches' column shows the
batch size of process_madvise() calls.  Batch size '0' is the plain
madvise() case.  The 'before' and 'after' columns are the measured times
to apply MADV_DONTNEED to the 256 MiB memory buffer in nanoseconds, on
kernels built without and with the last patch of this series,
respectively.  Lower values mean better efficiency.  The 'after/before'
column is the ratio of 'after' to 'before'.

    sz_batches  before       after        after/before
    0           146294215.2  121280536.2  0.829017989769427
    1           165851018.8  136305598.2  0.821855658085351
    2           129469321.2  103740383.6  0.801273866569094
    4           110369232.4  87835896.2   0.795836795182785
    8           102906232.4  77420920.2   0.752344327397609
    16          97551017.4   74959714.4   0.768415506038587
    32          94809848.2   71200848.4   0.750985786305689
    64          96087575.6   72593180     0.755489765942227
    128         96154163.8   68517055.4   0.712575022154163
    256         92901257.6   69054216.6   0.743307662177439
    512         93646170.8   67053296.2   0.716028168874151
    1024        92663219.2   70168196.8   0.75723892830177

Despite the unstable nature of the test program, the trend is as we
expect.  The measurement shows this patchset reduces process_madvise()
latency in proportion to the batch size.  The latency gain was about 20%
with batch size 2, and it increased to about 28% with batch size 512,
since more mmap lock operations are elided with larger batch sizes.

Note that the standard deviation of the measurements for each sz_batches
configuration ranged from 1.9% to 7.2%.  That is, this result is not very
stable.  The averages of the standard deviations across batch sizes were
4.62% and 4.70% for the 'before' and 'after' kernel measurements.

Also note that this patch somehow decreased the latencies of madvise()
and single-batch process_madvise().  It seems this code path is small
enough to be significantly affected by compiler optimizations, including
inlining of the split-out functions.  Please focus only on how the
improvement scales with the batch size.

[1] https://github.com/sjp38/eval_proc_madvise

This patch (of 4):

Split out the madvise behavior-dependent mmap_lock operations from
do_madvise(), for easier reuse of the logic in an upcoming change.
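The reuse this split enables is realized by the last patch of this series;
condensed, the eventual process_madvise() flow looks roughly as follows (a
sketch of the vector_madvise() hunks added later, not verbatim kernel
code):

	/* lock once, operate on every iovec range, unlock once */
	ret = madvise_lock(mm, behavior);
	if (ret)
		return ret;
	while (iov_iter_count(iter)) {
		ret = madvise_do_behavior(mm,
				(unsigned long)iter_iov_addr(iter),
				iter_iov_len(iter), behavior);
		if (ret < 0)
			break;
		iov_iter_advance(iter, iter_iov_len(iter));
	}
	madvise_unlock(mm, behavior);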
[lorenzo.stoakes@oracle.com: fix madvise_[un]lock() issue] Link: https://lkml.kernel.org/r/2f448f7b-1da7-4099-aa9e-0179d47fde40@lucifer.local [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20250206061517.2958-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250206061517.2958-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Davidlohr Bueso Reviewed-by: Liam R. Howlett Cc: David Hildenbrand Cc: SeongJae Park Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Wang Lian --- mm/madvise.c | 63 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 5d4c194a5e0c..da7e91b97428 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1395,6 +1395,50 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ + +#ifdef CONFIG_MEMORY_FAILURE +static bool is_memory_failure(int behavior) +{ + switch (behavior) { + case MADV_HWPOISON: + case MADV_SOFT_OFFLINE: + return true; + default: + return false; + } +} +#else +static bool is_memory_failure(int behavior) +{ + return false; +} +#endif + +static int madvise_lock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return 0; + + if (madvise_need_mmap_write(behavior)) { + if (mmap_write_lock_killable(mm)) + return -EINTR; + } else { + mmap_read_lock(mm); + } + return 0; +} + +static void madvise_unlock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return; + + if (madvise_need_mmap_write(behavior)) + mmap_write_unlock(mm); + else + mmap_read_unlock(mm); +} + /* * The madvise(2) system call. * @@ -1474,7 +1518,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh { unsigned long end; int error; - int write; size_t len; struct blk_plug plug; @@ -1496,19 +1539,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh if (end == start) return 0; + error = madvise_lock(mm, behavior); + if (error) + return error; + #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) return madvise_inject_error(behavior, start, start + len_in); #endif - write = madvise_need_mmap_write(behavior); - if (write) { - if (mmap_write_lock_killable(mm)) - return -EINTR; - } else { - mmap_read_lock(mm); - } - start = untagged_addr_remote(mm, start); end = start + len; @@ -1524,10 +1563,8 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh break; } blk_finish_plug(&plug); - if (write) - mmap_write_unlock(mm); - else - mmap_read_unlock(mm); + + madvise_unlock(mm, behavior); return error; } -- Gitee From c59d66e081ed92cd750dff81cc862955d50c44c2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 16 Jun 2025 10:30:45 +0800 Subject: [PATCH 04/12] mm/madvise: split out madvise input validity check mainline inclusion from mainline-v6.15-rc1 commit dbb0020bbc2c9f563d68564b36d6e8d32f82008b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dbb0020bbc2c9f563d68564b36d6e8d32f82008b -------------------------------- Split out the madvise parameters validation logic from do_madvise(), for easy reuse of the logic from a future change. 
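One subtlety worth calling out is the "rounded up from small -ve to zero"
check: PAGE_ALIGN() wraps to zero for any length within a page of
SIZE_MAX.  A standalone userspace illustration, assuming 4 KiB pages (the
macros below mirror the kernel's definitions but are redefined here):

	#include <assert.h>
	#include <stddef.h>

	#define PAGE_SIZE	4096UL
	#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

	int main(void)
	{
		size_t ok  = PAGE_ALIGN((size_t)4095); /* normal round-up */
		size_t bad = PAGE_ALIGN((size_t)-1);   /* wraps around */

		assert(ok == 4096);
		/* len_in != 0 but len == 0: a small negative was passed */
		assert(bad == 0);
		return 0;
	}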
Link: https://lkml.kernel.org/r/20250206061517.2958-3-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Davidlohr Bueso Reviewed-by: Liam R. Howlett Cc: David Hildenbrand Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Wang Lian --- mm/madvise.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index da7e91b97428..2b5718ba3e31 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1439,6 +1439,27 @@ static void madvise_unlock(struct mm_struct *mm, int behavior) mmap_read_unlock(mm); } +static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) +{ + size_t len; + + if (!madvise_behavior_valid(behavior)) + return false; + + if (!PAGE_ALIGNED(start)) + return false; + len = PAGE_ALIGN(len_in); + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return false; + + if (start + len < start) + return false; + + return true; +} + /* * The madvise(2) system call. * @@ -1521,20 +1542,11 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh size_t len; struct blk_plug plug; - if (!madvise_behavior_valid(behavior)) + if (!is_valid_madvise(start, len_in, behavior)) return -EINVAL; - if (!PAGE_ALIGNED(start)) - return -EINVAL; len = PAGE_ALIGN(len_in); - - /* Check to see whether len was rounded up from small -ve to zero */ - if (len_in && !len) - return -EINVAL; - end = start + len; - if (end < start) - return -EINVAL; if (end == start) return 0; -- Gitee From 4fed7b9390f510de3f4f4db54f2ec950575107b5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 16 Jun 2025 10:36:13 +0800 Subject: [PATCH 05/12] mm/madvise: split out madvise() behavior execution mainline inclusion from mainline-v6.15-rc1 commit 457753da6462024ad821bcb4df2d828cf2ef18be category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=457753da6462024ad821bcb4df2d828cf2ef18be -------------------------------- Split out the madvise behavior applying logic from do_madvise() to make it easier to reuse from the following change. Link: https://lkml.kernel.org/r/20250206061517.2958-4-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Wang Lian --- mm/madvise.c | 53 +++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 2b5718ba3e31..cfc9849328fd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1460,6 +1460,35 @@ static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) return true; } +static int madvise_do_behavior(struct mm_struct *mm, + unsigned long start, size_t len_in, size_t len, int behavior) +{ + struct blk_plug plug; + unsigned long end; + int error; + +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) + return madvise_inject_error(behavior, start, start + len_in); +#endif + start = untagged_addr_remote(mm, start); + end = start + len; + + blk_start_plug(&plug); + switch (behavior) { + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + error = madvise_populate(mm, start, end, behavior); + break; + default: + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); + break; + } + blk_finish_plug(&plug); + return error; +} + /* * The madvise(2) system call. * @@ -1540,7 +1569,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh unsigned long end; int error; size_t len; - struct blk_plug plug; if (!is_valid_madvise(start, len_in, behavior)) return -EINVAL; @@ -1554,28 +1582,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh error = madvise_lock(mm, behavior); if (error) return error; - -#ifdef CONFIG_MEMORY_FAILURE - if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) - return madvise_inject_error(behavior, start, start + len_in); -#endif - - start = untagged_addr_remote(mm, start); - end = start + len; - - blk_start_plug(&plug); - switch (behavior) { - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - error = madvise_populate(mm, start, end, behavior); - break; - default: - error = madvise_walk_vmas(mm, start, end, behavior, - madvise_vma_behavior); - break; - } - blk_finish_plug(&plug); - + error = madvise_do_behavior(mm, start, len_in, len, behavior); madvise_unlock(mm, behavior); return error; -- Gitee From 946d7025b68c79aacbf536d9c3ea6265dbd05976 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 17 Jun 2025 09:55:43 +0800 Subject: [PATCH 06/12] mm/madvise: remove redundant mmap_lock operations from process_madvise() mainline inclusion from mainline-v6.15-rc1 commit 4000e3d0a367c5ff2035a0394b01b93974be6cb1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4000e3d0a367c5ff2035a0394b01b93974be6cb1 -------------------------------- Optimize redundant mmap lock operations from process_madvise() by directly doing the mmap locking first, and then the remaining works for all ranges in the loop. [akpm@linux-foundation.org: update comment, per Lorenzo] Link: https://lkml.kernel.org/r/20250206061517.2958-5-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Liam R. 
Howlett
Reviewed-by: Lorenzo Stoakes
Cc: David Hildenbrand
Cc: Davidlohr Bueso
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Conflicts:
	mm/madvise.c
[Context conflicts in madvise.c due to missing commit 662df3e5c37, but we
still need these lock operations]
Signed-off-by: Wang Lian
---
 mm/madvise.c | 43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index cfc9849328fd..c7bb35820458 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1602,13 +1602,52 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,

 	total_len = iov_iter_count(iter);

+	ret = madvise_lock(mm, behavior);
+	if (ret)
+		return ret;
+
 	while (iov_iter_count(iter)) {
-		ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
-				 iter_iov_len(iter), behavior);
+		unsigned long start = (unsigned long)iter_iov_addr(iter);
+		size_t len_in = iter_iov_len(iter);
+		size_t len;
+
+		if (!is_valid_madvise(start, len_in, behavior)) {
+			ret = -EINVAL;
+			break;
+		}
+
+		len = PAGE_ALIGN(len_in);
+		if (start + len == start)
+			ret = 0;
+		else
+			ret = madvise_do_behavior(mm, start, len_in, len,
+					behavior);
+		/*
+		 * An madvise operation is attempting to restart the syscall,
+		 * but we cannot proceed as it would not be correct to repeat
+		 * the operation in aggregate, and would be surprising to the
+		 * user.
+		 *
+		 * We drop and reacquire locks so it is safe to just loop and
+		 * try again. We check for fatal signals in case we need exit
+		 * early anyway.
+		 */
+		if (ret == -ERESTARTNOINTR) {
+			if (fatal_signal_pending(current)) {
+				ret = -EINTR;
+				break;
+			}
+
+			/* Drop and reacquire lock to unwind race. */
+			madvise_unlock(mm, behavior);
+			madvise_lock(mm, behavior);
+			continue;
+		}
 		if (ret < 0)
 			break;
 		iov_iter_advance(iter, iter_iov_len(iter));
 	}
+	madvise_unlock(mm, behavior);

 	ret = (total_len - iov_iter_count(iter)) ? : ret;
--
Gitee

From 817d783c5f37bd29f03cec5c461cd3dc68cd5bcb Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 16 Jun 2025 10:56:29 +0800
Subject: [PATCH 07/12] mm/madvise: remove len parameter of madvise_do_behavior()

mainline inclusion
from mainline-v6.15-rc1
commit be9258a6bf26d2272856214406721762a21aab1b
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be9258a6bf26d2272856214406721762a21aab1b

--------------------------------

Because the madvise_should_skip() logic is factored out, making
madvise_do_behavior() calculate 'len' on its own rather than receiving it
as a parameter makes the code simpler.  Remove the parameter.

Link: https://lkml.kernel.org/r/20250312164750.59215-5-sj@kernel.org
Signed-off-by: SeongJae Park
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Shakeel Butt
Cc: David Hildenbrand
Cc: Liam R.
Howlett
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Conflicts:
	mm/madvise.c
[Context conflicts in madvise.c due to missing commit 0a6ffacb3b42]
Signed-off-by: Wang Lian
---
 mm/madvise.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index c7bb35820458..7934a0818ca6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1461,7 +1461,7 @@ static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
 }

 static int madvise_do_behavior(struct mm_struct *mm,
-		unsigned long start, size_t len_in, size_t len, int behavior)
+		unsigned long start, size_t len_in, int behavior)
 {
 	struct blk_plug plug;
 	unsigned long end;
@@ -1472,7 +1472,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
 		return madvise_inject_error(behavior, start, start + len_in);
 #endif
 	start = untagged_addr_remote(mm, start);
-	end = start + len;
+	end = start + PAGE_ALIGN(len_in);

 	blk_start_plug(&plug);
 	switch (behavior) {
@@ -1582,7 +1582,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	error = madvise_lock(mm, behavior);
 	if (error)
 		return error;
-	error = madvise_do_behavior(mm, start, len_in, len, behavior);
+	error = madvise_do_behavior(mm, start, len_in, behavior);
 	madvise_unlock(mm, behavior);

 	return error;
@@ -1620,8 +1620,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 		if (start + len == start)
 			ret = 0;
 		else
-			ret = madvise_do_behavior(mm, start, len_in, len,
-					behavior);
+			ret = madvise_do_behavior(mm, start, len_in, behavior);
--
Gitee

From 1d3de8d15f569ce0c08ed3e2c5ae98595dd25817 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 16 Jun 2025 11:40:00 +0800
Subject: [PATCH 08/12] mm/madvise: define and use madvise_behavior struct for madvise_do_behavior()

mainline inclusion
from mainline-v6.15-rc1
commit 066c770437835d2bd2072bd2c88a71fcbbd5ccb3
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=066c770437835d2bd2072bd2c88a71fcbbd5ccb3

--------------------------------

Patch series "mm/madvise: batch tlb flushes for MADV_DONTNEED and
MADV_FREE", v3.

When process_madvise() is called to do MADV_DONTNEED[_LOCKED] or
MADV_FREE with multiple address ranges, tlb flushes happen for each of
the given address ranges.  Because such tlb flushes are for the same
process, doing those in a batch is more efficient while still being
safe.  Modify the process_madvise() entry level code path to do such
batched tlb flushes, while the internal unmap logic only gathers the tlb
entries to flush.

In more detail, modify the entry functions to initialize an mmu_gather
object and pass it to the internal logic.  And make the internal logic
only gather the tlb entries to flush into the received mmu_gather
object.  After all internal function calls are done, the entry functions
flush the gathered tlb entries at once.

Because process_madvise() and madvise() share the internal unmap logic,
make the same change to the madvise() entry code as well, to keep the
code consistent and cleaner.  It is only for keeping the code clean, and
shouldn't degrade madvise().  It could even provide a potential
tlb-flush reduction benefit when there are multiple vmas in the given
address range.
It is only a side effect from an effort to keep the code clean, so we
don't measure it separately.

Similar optimizations might be applicable to other madvise behaviors
such as MADV_COLD and MADV_PAGEOUT.  Those are simply out of the scope
of this patch series, though.

Patches Sequence
================

The first patch defines a new data structure for managing the
information that is required for batched tlb flushes (mmu_gather and
behavior), and updates the code paths for MADV_DONTNEED[_LOCKED] and
MADV_FREE handling internal logic to receive it.

The second patch batches tlb flushes for MADV_FREE handling for both
madvise() and process_madvise().

The remaining two patches are for MADV_DONTNEED[_LOCKED] tlb flush
batching.  The third patch splits zap_page_range_single() for batching
of MADV_DONTNEED[_LOCKED] handling.  The fourth patch batches tlb
flushes for the hint using the sub-logic that the third patch split out,
and the helpers for batched tlb flushes that were introduced for the
MADV_FREE case by the second patch.

Test Results
============

I measured the latency to apply MADV_DONTNEED advice to 256 MiB memory
using multiple process_madvise() calls.  I applied the advice at 4 KiB
region granularity, but with varying batch size per process_madvise()
call (vlen) from 1 to 1024.  The source code for the measurement is
available at GitHub[1].  To reduce measurement errors, I repeated the
measurement five times.

The measurement results are as below.  The 'sz_batch' column shows the
batch size of process_madvise() calls.  The 'Before' and 'After' columns
show the averages of the latencies, in nanoseconds, measured five times
on kernels built without and with the tlb flush batching of this series
(patches 3 and 4), respectively.  For the baseline, the mm-new tree of
2025-04-09[2] has been used, after reverting the second version of this
patch series and adding a temporary fix for a !CONFIG_DEBUG_VM build
failure[3].  The 'B-stdev' and 'A-stdev' columns show the ratio of the
latency measurements' standard deviation to the average, in percent, for
'Before' and 'After', respectively.  'Latency_reduction' shows the
reduction of the latency that 'After' has achieved compared to 'Before',
in percent.  Higher 'Latency_reduction' values mean more efficiency
improvements.

    sz_batch  Before       B-stdev  After        A-stdev  Latency_reduction
    1         146386348    2.78     111327360.6  3.13     23.95
    2         108222130    1.54     72131173.6   2.39     33.35
    4         93617846.8   2.76     51859294.4   2.50     44.61
    8         80555150.4   2.38     44328790     1.58     44.97
    16        77272777     1.62     37489433.2   1.16     51.48
    32        76478465.2   2.75     33570506     3.48     56.10
    64        75810266.6   1.15     27037652.6   1.61     64.34
    128       73222748     3.86     25517629.4   3.30     65.15
    256       72534970.8   2.31     25002180.4   0.94     65.53
    512       71809392     5.12     24152285.4   2.41     66.37
    1024      73281170.2   4.53     24183615     2.09     67.00

Unexpectedly, the latency was reduced (improved) even with batch size
one.  I think some compiler optimizations have affected that, as was
also observed with the first version of this patch series.  So, please
focus on the proportion between the improvement and the batch size.  As
expected, tlb flush batching provides a latency reduction proportional
to the batch size.  The efficiency gain ranges from about 33 percent
with batch size 2 up to 67 percent with batch size 1,024.

Please note that this is a very simple microbenchmark, so the real
efficiency gain on real workloads could be very different.

This patch (of 4):

To implement batched tlb flushes for MADV_DONTNEED[_LOCKED] and
MADV_FREE, an mmu_gather object needs to be passed to the internal
logic, in addition to the behavior integer.
Using a struct can make it easy without increasing the number of parameters of all code paths towards the internal logic. Define a struct for the purpose and use it on the code path that starts from madvise_do_behavior() and ends on madvise_dontneed_free(). Note that this changes madvise_walk_vmas() visitor type signature, too. Specifically, it changes its 'arg' type from 'unsigned long' to the new struct pointer. Link: https://lkml.kernel.org/r/20250410000022.1901-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250410000022.1901-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Rik van Riel Cc: SeongJae Park Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Wang Lian --- mm/madvise.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 7934a0818ca6..8ca35a24b128 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -42,6 +42,11 @@ struct madvise_walk_private { bool pageout; }; +struct madvise_behavior { + int behavior; + struct mmu_gather *tlb; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need @@ -922,8 +927,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - int behavior) + struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; struct mm_struct *mm = vma->vm_mm; *prev = vma; @@ -1076,8 +1082,10 @@ static long madvise_remove(struct vm_area_struct *vma, static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long behavior) + void *behavior_arg) { + struct madvise_behavior *arg = behavior_arg; + int behavior = arg->behavior; int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; @@ -1094,7 +1102,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - return madvise_dontneed_free(vma, prev, start, end, behavior); + return madvise_dontneed_free(vma, prev, start, end, arg); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1289,10 +1297,10 @@ static bool process_madvise_remote_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long arg, + unsigned long end, void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long arg)) + unsigned long end, void *arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; @@ -1350,7 +1358,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long anon_name) + void *anon_name) { int error; @@ -1359,7 +1367,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (struct anon_vma_name *)anon_name); + anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1391,7 +1399,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) 
 		return 0;

-	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+	return madvise_walk_vmas(mm, start, end, anon_name,
 			madvise_vma_anon_name);
 }
 #endif /* CONFIG_ANON_VMA_NAME */
@@ -1461,8 +1469,10 @@ static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
 }

 static int madvise_do_behavior(struct mm_struct *mm,
-		unsigned long start, size_t len_in, int behavior)
+		unsigned long start, size_t len_in,
+		struct madvise_behavior *madv_behavior)
 {
+	int behavior = madv_behavior->behavior;
 	struct blk_plug plug;
 	unsigned long end;
 	int error;
@@ -1481,7 +1491,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
 		error = madvise_populate(mm, start, end, behavior);
 		break;
 	default:
-		error = madvise_walk_vmas(mm, start, end, behavior,
+		error = madvise_walk_vmas(mm, start, end, madv_behavior,
 				madvise_vma_behavior);
 		break;
 	}
@@ -1568,6 +1578,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 {
 	unsigned long end;
 	int error;
+	struct madvise_behavior madv_behavior = {.behavior = behavior};
 	size_t len;
@@ -1582,7 +1593,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	error = madvise_lock(mm, behavior);
 	if (error)
 		return error;
-	error = madvise_do_behavior(mm, start, len_in, behavior);
+	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
 	madvise_unlock(mm, behavior);

 	return error;
@@ -1599,6 +1610,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 {
 	ssize_t ret = 0;
 	size_t total_len;
+	struct madvise_behavior madv_behavior = {.behavior = behavior};

 	total_len = iov_iter_count(iter);
@@ -1620,7 +1632,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 		if (start + len == start)
 			ret = 0;
 		else
-			ret = madvise_do_behavior(mm, start, len_in, behavior);
+			ret = madvise_do_behavior(mm, start, len_in,
+					&madv_behavior);
--
Gitee

From 5be2e47959fd1c5be2dc5aa22bf858e58ecae461 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 16 Jun 2025 12:13:07 +0800
Subject: [PATCH 09/12] mm/madvise: batch tlb flushes for MADV_FREE

mainline inclusion
from mainline-v6.15-rc1
commit 01bef02bf9301ea6c255b0daa38356e07719dd69
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=01bef02bf9301ea6c255b0daa38356e07719dd69

--------------------------------

MADV_FREE handling for [process_]madvise() flushes the tlb for each vma
of each address range.  Update the logic to do the tlb flushes in a
batched way.  Initialize an mmu_gather object from do_madvise() and
vector_madvise(), which are the entry level functions for
[process_]madvise(), respectively.  And pass those objects to the
function for per-vma work, via the madvise_behavior struct.  Make the
per-vma logic not flush the tlb on its own but just save the tlb entries
to the received mmu_gather object.  Finally, the entry level functions
flush the tlb entries that were gathered for the entire user request, at
once.
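Condensed, the entry-level batching pattern this adds looks roughly as
follows (a sketch based on the hunks below, not verbatim kernel code):

	struct mmu_gather tlb;
	struct madvise_behavior madv_behavior = {
		.behavior = behavior,
		.tlb = &tlb,
	};

	ret = madvise_lock(mm, behavior);
	if (ret)
		return ret;
	madvise_init_tlb(&madv_behavior, mm); /* tlb_gather_mmu() iff batching */
	/* ... per-vma work only gathers entries into madv_behavior.tlb ... */
	madvise_finish_tlb(&madv_behavior);   /* one tlb_finish_mmu() flush */
	madvise_unlock(mm, behavior);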
Link: https://lkml.kernel.org/r/20250410000022.1901-3-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton Signed-off-by: Wang Lian --- mm/madvise.c | 57 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 8ca35a24b128..42690d60ece9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -833,12 +833,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; -static int madvise_free_single_vma(struct vm_area_struct *vma, +static int madvise_free_single_vma(struct madvise_behavior *madv_behavior, + struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; - struct mmu_gather tlb; + struct mmu_gather *tlb = madv_behavior->tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) @@ -854,17 +855,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); - tlb_start_vma(&tlb, vma); + tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, - &madvise_free_walk_ops, &tlb); - tlb_end_vma(&tlb, vma); + &madvise_free_walk_ops, tlb); + tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); - return 0; } @@ -983,7 +981,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) return madvise_dontneed_single_vma(vma, start, end); else if (behavior == MADV_FREE) - return madvise_free_single_vma(vma, start, end); + return madvise_free_single_vma(madv_behavior, vma, start, end); else return -EINVAL; } @@ -1447,6 +1445,29 @@ static void madvise_unlock(struct mm_struct *mm, int behavior) mmap_read_unlock(mm); } +static bool madvise_batch_tlb_flush(int behavior) +{ + switch (behavior) { + case MADV_FREE: + return true; + default: + return false; + } +} + +static void madvise_init_tlb(struct madvise_behavior *madv_behavior, + struct mm_struct *mm) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_gather_mmu(madv_behavior->tlb, mm); +} + +static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_finish_mmu(madv_behavior->tlb); +} + static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) { size_t len; @@ -1578,7 +1599,11 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh { unsigned long end; int error; - struct madvise_behavior madv_behavior = {.behavior = behavior}; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .behavior = behavior, + .tlb = &tlb, + }; size_t len; if (!is_valid_madvise(start, len_in, behavior)) @@ -1593,7 +1618,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh error = madvise_lock(mm, behavior); if (error) return error; + madvise_init_tlb(&madv_behavior, mm); error = madvise_do_behavior(mm, start, len_in, &madv_behavior); + madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); return error; @@ -1610,13 +1637,18 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, { ssize_t ret = 0; size_t total_len; - struct madvise_behavior madv_behavior = {.behavior = behavior}; + struct mmu_gather tlb; 
+	struct madvise_behavior madv_behavior = {
+		.behavior = behavior,
+		.tlb = &tlb,
+	};

 	total_len = iov_iter_count(iter);

 	ret = madvise_lock(mm, behavior);
 	if (ret)
 		return ret;
+	madvise_init_tlb(&madv_behavior, mm);

 	while (iov_iter_count(iter)) {
 		unsigned long start = (unsigned long)iter_iov_addr(iter);
@@ -1651,14 +1683,17 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 			}

 			/* Drop and reacquire lock to unwind race. */
+			madvise_finish_tlb(&madv_behavior);
 			madvise_unlock(mm, behavior);
 			madvise_lock(mm, behavior);
+			madvise_init_tlb(&madv_behavior, mm);
 			continue;
 		}
 		if (ret < 0)
 			break;
 		iov_iter_advance(iter, iter_iov_len(iter));
 	}
+	madvise_finish_tlb(&madv_behavior);
 	madvise_unlock(mm, behavior);

 	ret = (total_len - iov_iter_count(iter)) ? : ret;
--
Gitee

From 9b376e0a0aecc222a68d1694e007e9762845a3dd Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 16 Jun 2025 12:24:15 +0800
Subject: [PATCH 10/12] mm/memory: split non-tlb flushing part from zap_page_range_single()

mainline inclusion
from mainline-v6.15-rc1
commit de8efdf8cd273619121cbc2f0f70dddc74bc586e
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de8efdf8cd273619121cbc2f0f70dddc74bc586e

--------------------------------

Some of the zap_page_range_single() callers, such as [process_]madvise()
with MADV_DONTNEED[_LOCKED], cannot batch tlb flushes because
zap_page_range_single() flushes the tlb for each invocation.  Split out
the body of zap_page_range_single(), except for the mmu_gather object
initialization and the flushing of the gathered tlb entries, for such
batched tlb flushing usage.

To avoid hugetlb page allocation failures from concurrent page faults,
the tlb flush should be done before hugetlb faults are unblocked, though.
Do the flush and the unblocking inside the split-out function, in that
order, for the hugetlb vma case.  Refer to commit 2820b0f09be9
("hugetlbfs: close race between MADV_DONTNEED and page fault") for more
details about the concurrent faults' page allocation failure problem.

Link: https://lkml.kernel.org/r/20250410000022.1901-4-sj@kernel.org
Signed-off-by: SeongJae Park
Reviewed-by: Lorenzo Stoakes
Signed-off-by: Wang Lian
---
 mm/memory.c | 49 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index abc7f1645831..3acd3e5aaf59 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1913,36 +1913,65 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 	mmu_notifier_invalidate_range_end(&range);
 }

-/**
- * zap_page_range_single - remove user pages in a given range
+/*
+ * zap_page_range_single_batched - remove user pages in a given range
+ * @tlb: pointer to the caller's struct mmu_gather
  * @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
- * @size: number of bytes to zap
+ * @address: starting address of pages to remove
+ * @size: number of bytes to remove
  * @details: details of shared cache invalidation
  *
- * The range must fit into one VMA.
+ * @tlb shouldn't be NULL.  The range must fit into one VMA.  If @vma is for
+ * hugetlb, @tlb is flushed and re-initialized by this function.
 */
-void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+static void zap_page_range_single_batched(struct mmu_gather *tlb,
+		struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *details)
 {
 	const unsigned long end = address + size;
 	struct mmu_notifier_range range;
-	struct mmu_gather tlb;
+
+	VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm);

 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
 				address, end);
 	hugetlb_zap_begin(vma, &range.start, &range.end);
-	tlb_gather_mmu(&tlb, vma->vm_mm);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
 	/*
 	 * unmap 'address-end' not 'range.start-range.end' as range
 	 * could have been expanded for hugetlb pmd sharing.
 	 */
-	unmap_single_vma(&tlb, vma, address, end, details, false);
+	unmap_single_vma(tlb, vma, address, end, details, false);
 	mmu_notifier_invalidate_range_end(&range);
+	if (is_vm_hugetlb_page(vma)) {
+		/*
+		 * flush tlb and free resources before hugetlb_zap_end(), to
+		 * avoid concurrent page faults' allocation failure.
+		 */
+		tlb_finish_mmu(tlb);
+		hugetlb_zap_end(vma, details);
+		tlb_gather_mmu(tlb, vma->vm_mm);
+	}
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of shared cache invalidation
+ *
+ * The range must fit into one VMA.
+ */
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size, struct zap_details *details)
+{
+	struct mmu_gather tlb;
+
+	tlb_gather_mmu(&tlb, vma->vm_mm);
+	zap_page_range_single_batched(&tlb, vma, address, size, details);
 	tlb_finish_mmu(&tlb);
-	hugetlb_zap_end(vma, details);
 }

 /**
--
Gitee

From 7d86b49feea2646e976a4c8264664586f4134eaf Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 16 Jun 2025 12:34:05 +0800
Subject: [PATCH 11/12] mm/madvise: batch tlb flushes for MADV_DONTNEED[_LOCKED]

mainline inclusion
from mainline-v6.15-rc1
commit 43c4cfde7e37460482e41dbb6837d4bff290d2cc
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=43c4cfde7e37460482e41dbb6837d4bff290d2cc

--------------------------------

MADV_DONTNEED[_LOCKED] handling for [process_]madvise() flushes the tlb
for each vma of each address range.  Update the logic to do the tlb
flushes in a batched way.  Initialize an mmu_gather object from
do_madvise() and vector_madvise(), which are the entry level functions
for [process_]madvise(), respectively.  And pass those objects to the
function for per-vma work, via the madvise_behavior struct.  Make the
per-vma logic not flush the tlb on its own but just save the tlb entries
to the received mmu_gather object.

For this internal logic change, make zap_page_range_single_batched()
non-static and use it directly from madvise_dontneed_single_vma().

Finally, the entry level functions flush the tlb entries that were
gathered for the entire user request, at once.

Link: https://lkml.kernel.org/r/20250410000022.1901-5-sj@kernel.org
Signed-off-by: SeongJae Park
Reviewed-by: Lorenzo Stoakes
Signed-off-by: Andrew Morton
Conflicts:
	mm/madvise.c
[Context conflicts in madvise.c due to missing commit 6375e95f381e]
Signed-off-by: Wang Lian
---
 mm/internal.h |  3 +++
 mm/madvise.c  | 11 ++++++++---
 mm/memory.c   |  4 ++--
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 46c5a8da9d72..ff96a9f53f01 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -342,6 +342,9 @@ void unmap_page_range(struct mmu_gather *tlb,
 			     struct vm_area_struct *vma,
 			     unsigned long addr, unsigned long end,
 			     struct zap_details *details);
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+		struct vm_area_struct *vma, unsigned long addr,
+		unsigned long size, struct zap_details *details);
 void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
 		unsigned int order);

diff --git a/mm/madvise.c b/mm/madvise.c
index 42690d60ece9..5572c894595a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -885,10 +885,12 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
  * An interface that causes the system to free clean pages and flush
  * dirty pages is already available as msync(MS_INVALIDATE).
  */
-static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior,
+		struct vm_area_struct *vma,
 		unsigned long start, unsigned long end)
 {
-	zap_page_range_single(vma, start, end - start, NULL);
+	zap_page_range_single_batched(
+			madv_behavior->tlb, vma, start, end - start, NULL);
 	return 0;
 }

@@ -979,7 +981,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 	}

 	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
-		return madvise_dontneed_single_vma(vma, start, end);
+		return madvise_dontneed_single_vma(
+				madv_behavior, vma, start, end);
 	else if (behavior == MADV_FREE)
 		return madvise_free_single_vma(madv_behavior, vma, start, end);
 	else
@@ -1448,6 +1451,8 @@ static void madvise_unlock(struct mm_struct *mm, int behavior)
 static bool madvise_batch_tlb_flush(int behavior)
 {
 	switch (behavior) {
+	case MADV_DONTNEED:
+	case MADV_DONTNEED_LOCKED:
 	case MADV_FREE:
 		return true;
 	default:

diff --git a/mm/memory.c b/mm/memory.c
index 3acd3e5aaf59..91ea3d2d4f87 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1913,7 +1913,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 	mmu_notifier_invalidate_range_end(&range);
 }

-/*
+/**
 * zap_page_range_single_batched - remove user pages in a given range
 * @tlb: pointer to the caller's struct mmu_gather
 * @vma: vm_area_struct holding the applicable pages
@@ -1924,7 +1924,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 * @tlb shouldn't be NULL.  The range must fit into one VMA.  If @vma is for
 * hugetlb, @tlb is flushed and re-initialized by this function.
 */
-static void zap_page_range_single_batched(struct mmu_gather *tlb,
+void zap_page_range_single_batched(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *details)
 {
--
Gitee

From 9ff82f272757ee77ee29075d5c681a74d336634e Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Tue, 17 Jun 2025 11:15:47 +0800
Subject: [PATCH 12/12] mm/madvise: handle madvise_lock() failure during race
 unwinding

mainline inclusion
from mainline-v6.16-rc1
commit 9c49e5d09f076001e05537734d7df002162eb2b5
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICEC2V
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9c49e5d09f076001e05537734d7df002162eb2b5

--------------------------------

When unwinding the race on -ERESTARTNOINTR handling in process_madvise(),
a madvise_lock() failure is ignored.  Check for the failure and abort the
remaining work in that case.

Link: https://lkml.kernel.org/r/20250602174926.1074-1-sj@kernel.org
Fixes: 4000e3d0a367 ("mm/madvise: remove redundant mmap_lock operations from process_madvise()")
Signed-off-by: SeongJae Park
Reported-by: Barry Song <21cnbao@gmail.com>
Closes: https://lore.kernel.org/CAGsJ_4xJXXO0G+4BizhohSZ4yDteziPw43_uF8nPXPWxUVChzw@mail.gmail.com
Reviewed-by: Jann Horn
Reviewed-by: Lorenzo Stoakes
Acked-by: David Hildenbrand
Reviewed-by: Shakeel Butt
Reviewed-by: Barry Song
Cc: Liam Howlett
Cc: Vlastimil Babka
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Wang Lian
---
 mm/madvise.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 5572c894595a..bfbb48519259 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1690,7 +1690,9 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 			/* Drop and reacquire lock to unwind race. */
 			madvise_finish_tlb(&madv_behavior);
 			madvise_unlock(mm, behavior);
-			madvise_lock(mm, behavior);
+			ret = madvise_lock(mm, behavior);
+			if (ret)
+				goto out;
 			madvise_init_tlb(&madv_behavior, mm);
 			continue;
 		}
@@ -1701,6 +1703,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 	madvise_finish_tlb(&madv_behavior);
 	madvise_unlock(mm, behavior);

+out:
 	ret = (total_len - iov_iter_count(iter)) ? : ret;

 	return ret;
--
Gitee
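For reference, the -ERESTARTNOINTR unwinding sequence in vector_madvise()
as it stands after this series, condensed (a sketch combining the hunks
above, not verbatim kernel code):

	if (ret == -ERESTARTNOINTR) {
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		/* Drop and reacquire lock to unwind race. */
		madvise_finish_tlb(&madv_behavior);
		madvise_unlock(mm, behavior);
		ret = madvise_lock(mm, behavior);
		if (ret)
			goto out;	/* lock failure is no longer ignored */
		madvise_init_tlb(&madv_behavior, mm);
		continue;
	}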