From 5a78590b0b4a0a1d42f82e11d43230e1d38eb0c2 Mon Sep 17 00:00:00 2001
From: Qiao Ma
Date: Tue, 15 Oct 2024 14:01:48 +0800
Subject: [PATCH 1/5] uprobe: avoid out-of-bounds memory access of fetching args

mainline inclusion
from mainline-v6.12-rc5
commit 373b9338c9722a368925d83bc622c596896b328e
category: bugfix
issue: #IB2ZTL
CVE: CVE-2024-50067

Signed-off-by: zhangshuqi
---------------------------------------

Uprobe needs to fetch args into a percpu buffer, and then copy them to the
ring buffer, to avoid non-atomic context problems. Sometimes user-space
strings or arrays can be very large, but the size of the percpu buffer is
only one page. store_trace_args() does not check whether the data exceeds
a single page, which causes out-of-bounds memory access.

It can be reproduced by the following steps:
1. build the kernel with CONFIG_KASAN enabled
2. save the following program as test.c (headers reconstructed; the
   original angle-bracketed include names were lost in copying)

```
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

// If the string length is larger than MAX_STRING_SIZE, fetch_store_strlen()
// will return 0, causing __get_data_size() to return a shorter size, and
// store_trace_args() will not trigger the out-of-bounds access.
// So make the string length less than 4096.
#define STRLEN 4093

void generate_string(char *str, int n)
{
	int i;

	for (i = 0; i < n; ++i) {
		char c = i % 26 + 'a';
		str[i] = c;
	}
	str[n-1] = '\0';
}

void print_string(char *str)
{
	printf("%s\n", str);
}

int main()
{
	char tmp[STRLEN];

	generate_string(tmp, STRLEN);
	print_string(tmp);

	return 0;
}
```
3. compile the program:
`gcc -o test test.c`
4. get the offset of `print_string()`:
```
objdump -t test | grep -w print_string
0000000000401199 g     F .text  000000000000001b  print_string
```
5. configure the uprobe with offset 0x1199:
```
off=0x1199
cd /sys/kernel/debug/tracing/
echo "p /root/test:${off} arg1=+0(%di):ustring arg2=\$comm arg3=+0(%di):ustring" > uprobe_events
echo 1 > events/uprobes/enable
echo 1 > tracing_on
```
6. run `test`, and KASAN will report an error:

==================================================================
BUG: KASAN: use-after-free in strncpy_from_user+0x1d6/0x1f0
Write of size 8 at addr ffff88812311c004 by task test/499
CPU: 0 UID: 0 PID: 499 Comm: test Not tainted 6.12.0-rc3+ #18
Hardware name: Red Hat KVM, BIOS 1.16.0-4.al8 04/01/2014
Call Trace:
 dump_stack_lvl+0x55/0x70
 print_address_description.constprop.0+0x27/0x310
 kasan_report+0x10f/0x120
 ? strncpy_from_user+0x1d6/0x1f0
 strncpy_from_user+0x1d6/0x1f0
 ? rmqueue.constprop.0+0x70d/0x2ad0
 process_fetch_insn+0xb26/0x1470
 ? __pfx_process_fetch_insn+0x10/0x10
 ? _raw_spin_lock+0x85/0xe0
 ? __pfx__raw_spin_lock+0x10/0x10
 ? __pte_offset_map+0x1f/0x2d0
 ? unwind_next_frame+0xc5f/0x1f80
 ? arch_stack_walk+0x68/0xf0
 ? is_bpf_text_address+0x23/0x30
 ? kernel_text_address.part.0+0xbb/0xd0
 ? __kernel_text_address+0x66/0xb0
 ? unwind_get_return_address+0x5e/0xa0
 ? __pfx_stack_trace_consume_entry+0x10/0x10
 ? arch_stack_walk+0xa2/0xf0
 ? _raw_spin_lock_irqsave+0x8b/0xf0
 ? __pfx__raw_spin_lock_irqsave+0x10/0x10
 ? depot_alloc_stack+0x4c/0x1f0
 ? _raw_spin_unlock_irqrestore+0xe/0x30
 ? stack_depot_save_flags+0x35d/0x4f0
 ? kasan_save_stack+0x34/0x50
 ? kasan_save_stack+0x24/0x50
 ? mutex_lock+0x91/0xe0
 ? __pfx_mutex_lock+0x10/0x10
 prepare_uprobe_buffer.part.0+0x2cd/0x500
 uprobe_dispatcher+0x2c3/0x6a0
 ? __pfx_uprobe_dispatcher+0x10/0x10
 ? __kasan_slab_alloc+0x4d/0x90
 handler_chain+0xdd/0x3e0
 handle_swbp+0x26e/0x3d0
 ? __pfx_handle_swbp+0x10/0x10
 ? uprobe_pre_sstep_notifier+0x151/0x1b0
 irqentry_exit_to_user_mode+0xe2/0x1b0
 asm_exc_int3+0x39/0x40
RIP: 0033:0x401199
Code: 01 c2 0f b6 45 fb 88 02 83 45 fc 01 8b 45 fc 3b 45 e4 7c b7 8b 45 e4 48 98 48 8d 50 ff 48 8b 45 e8 48 01 d0 ce
RSP: 002b:00007ffdf00576a8 EFLAGS: 00000206
RAX: 00007ffdf00576b0 RBX: 0000000000000000 RCX: 0000000000000ff2
RDX: 0000000000000ffc RSI: 0000000000000ffd RDI: 00007ffdf00576b0
RBP: 00007ffdf00586b0 R08: 00007feb2f9c0d20 R09: 00007feb2f9c0d20
R10: 0000000000000001 R11: 0000000000000202 R12: 0000000000401040
R13: 00007ffdf0058780 R14: 0000000000000000 R15: 0000000000000000

This commit enforces that the buffer's maxlen is less than a page size, to
avoid the out-of-bounds memory access in store_trace_args().

Link: https://lore.kernel.org/all/20241015060148.1108331-1-mqaio@linux.alibaba.com/
Fixes: dcad1a204f72 ("tracing/uprobes: Fetch args before reserving a ring buffer")
Signed-off-by: Qiao Ma
Signed-off-by: Masami Hiramatsu (Google)
Signed-off-by: zhangshuqi
---
 kernel/trace/trace_uprobe.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 60ff36f5d7f9..36471bbe5938 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -857,6 +857,7 @@ struct uprobe_cpu_buffer {
 };
 static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
 static int uprobe_buffer_refcnt;
+#define MAX_UCB_BUFFER_SIZE PAGE_SIZE
 
 static int uprobe_buffer_init(void)
 {
@@ -957,9 +958,6 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 
 	WARN_ON(call != trace_file->event_call);
 
-	if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
-		return;
-
 	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
@@ -1502,6 +1500,10 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
 
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 	ucb = uprobe_buffer_get();
+
+	if (WARN_ON_ONCE(tu->tp.size + dsize > MAX_UCB_BUFFER_SIZE))
+		dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size;
+
 	store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
 
 	if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
@@ -1537,6 +1539,10 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
 
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 	ucb = uprobe_buffer_get();
+
+	if (WARN_ON_ONCE(tu->tp.size + dsize > MAX_UCB_BUFFER_SIZE))
+		dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size;
+
 	store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
 
 	if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
-- 
Gitee

From a61c33a44f6e0f4df06c81bf5741a0f0090f8d07 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Sat, 10 Aug 2024 21:04:35 -0400
Subject: [PATCH 2/5] lib/generic-radix-tree.c: Fix rare race in __genradix_ptr_alloc()

mainline inclusion
from mainline-5.10.226
commit 99418ec776a39609f50934720419e0b464ca2283
category: bugfix
issue: #IB2ZTL
CVE: CVE-2024-47668

Signed-off-by: zhangshuqi
---------------------------------------

[ Upstream commit b2f11c6f3e1fc60742673b8675c95b78447f3dae ]

If we need to increase the tree depth, allocate a new node, and then race
with another thread that increased the tree depth before us, we'll still
have a preallocated node that might be used later.

If we then use that node for a new non-root node, it'll still have a
pointer to the old root instead of being zeroed - fix this by zeroing it
in the cmpxchg failure path.
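[Editor's illustration, not part of the upstream patch or this backport:
the user-space sketch below models the root-growing race described above.
The toy struct node, grow_root() and the C11 atomics are assumptions made
purely for the sketch; only the children[0] pointer and the handling of
the cmpxchg failure path mirror the real code in lib/generic-radix-tree.c.]

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *children[4];
};

static _Atomic(struct node *) root;

static struct node *grow_root(void)
{
	struct node *r = atomic_load(&root);
	struct node *new_node = calloc(1, sizeof(*new_node));

	/* the candidate new root points at the old tree */
	new_node->children[0] = r;

	if (!atomic_compare_exchange_strong(&root, &r, new_node)) {
		/*
		 * Lost the race: another thread grew the tree first. In the
		 * kernel code the preallocated node is kept and may later be
		 * installed as a non-root node, so the stale pointer to the
		 * old root must be cleared - this is what the fix adds.
		 * (The sketch simply leaks the node.)
		 */
		new_node->children[0] = NULL;
	}

	return atomic_load(&root);
}

int main(void)
{
	grow_root();
	grow_root();
	printf("root=%p\n", (void *)atomic_load(&root));
	return 0;
}
```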
Signed-off-by: Kent Overstreet
Signed-off-by: Sasha Levin
Signed-off-by: zhangshuqi
---
 lib/generic-radix-tree.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
index 7dfa88282b00..78f081d695d0 100644
--- a/lib/generic-radix-tree.c
+++ b/lib/generic-radix-tree.c
@@ -131,6 +131,8 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
 		if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
 			v = new_root;
 			new_node = NULL;
+		} else {
+			new_node->children[0] = NULL;
 		}
 	}
 
-- 
Gitee

From 04e667c0e2e2f9b31ef7f5975135ecfdcd20d6a5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein
Date: Sun, 12 May 2024 13:30:07 +0200
Subject: [PATCH 3/5] fsnotify: clear PARENT_WATCHED flags lazily

mainline inclusion
from mainline-5.10.226
commit 3f3ef1d9f66b93913ce2171120d9226b55acd41d
category: bugfix
issue: #IB2ZTL
CVE: CVE-2024-47660

Signed-off-by: zhangshuqi
---------------------------------------

[ Upstream commit 172e422ffea20a89bfdc672741c1aad6fbb5044e ]

In some setups directories can have many (usually negative) dentries.
Hence the __fsnotify_update_child_dentry_flags() function can take a
significant amount of time. Since the bulk of this function happens under
inode->i_lock, this causes significant contention on the lock when we
remove the watch from the directory, as the
__fsnotify_update_child_dentry_flags() call from fsnotify_recalc_mask()
races with __fsnotify_update_child_dentry_flags() calls from
__fsnotify_parent() happening on children. This can lead to softlockup
reports from users.

Fix the problem by calling fsnotify_update_children_dentry_flags() to set
PARENT_WATCHED flags only when the parent starts watching children. When
the parent stops watching children, clear the false positive
PARENT_WATCHED flags lazily in __fsnotify_parent() for each accessed
child.

Suggested-by: Jan Kara
Signed-off-by: Amir Goldstein
Signed-off-by: Stephen Brennan
Signed-off-by: Jan Kara
Signed-off-by: Sasha Levin
Signed-off-by: zhangshuqi
---
 fs/notify/fsnotify.c             | 31 +++++++++++++++++++++----------
 fs/notify/fsnotify.h             |  2 +-
 fs/notify/mark.c                 | 32 +++++++++++++++++++++++++++++---
 include/linux/fsnotify_backend.h |  8 +++++---
 4 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 30d422b8c0fc..87f733193dc7 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -105,17 +105,13 @@ void fsnotify_sb_delete(struct super_block *sb)
  * parent cares.  Thus when an event happens on a child it can quickly tell if
  * if there is a need to find a parent and send the event to the parent.
  */
-void __fsnotify_update_child_dentry_flags(struct inode *inode)
+void fsnotify_set_children_dentry_flags(struct inode *inode)
 {
 	struct dentry *alias;
-	int watched;
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
-	/* determine if the children should tell inode about their events */
-	watched = fsnotify_inode_watches_children(inode);
-
 	spin_lock(&inode->i_lock);
 	/* run all of the dentries associated with this inode.  Since this is a
 	 * directory, there damn well better only be one item on this list */
@@ -131,10 +127,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 				continue;
 
 			spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
-			if (watched)
-				child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
-			else
-				child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+			child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
 			spin_unlock(&child->d_lock);
 		}
 		spin_unlock(&alias->d_lock);
@@ -142,6 +135,24 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 }
 
+/*
+ * Lazily clear false positive PARENT_WATCHED flag for child whose parent had
+ * stopped watching children.
+ */
+static void fsnotify_clear_child_dentry_flag(struct inode *pinode,
+					     struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	/*
+	 * d_lock is a sufficient barrier to prevent observing a non-watched
+	 * parent state from before the fsnotify_set_children_dentry_flags()
+	 * or fsnotify_update_flags() call that had set PARENT_WATCHED.
+	 */
+	if (!fsnotify_inode_watches_children(pinode))
+		dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+	spin_unlock(&dentry->d_lock);
+}
+
 /* Are inode/sb/mount interested in parent and name info with this event? */
 static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
 					__u32 mask)
@@ -210,7 +221,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
 	p_inode = parent->d_inode;
 	p_mask = fsnotify_inode_watches_children(p_inode);
 	if (unlikely(parent_watched && !p_mask))
-		__fsnotify_update_child_dentry_flags(p_inode);
+		fsnotify_clear_child_dentry_flag(p_inode, dentry);
 
 	/*
 	 * Include parent/name in notification either if some notification
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index ff2063ec6b0f..f7da9964909c 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -59,7 +59,7 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
  * about events that happen to its children.
  */
-extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
+extern void fsnotify_set_children_dentry_flags(struct inode *inode);
 
 /* allocate and destroy and event holder to attach events to notification/access queues */
 extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 5b44be5f93dd..65dae417cabb 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -132,6 +132,24 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
 	*fsnotify_conn_mask_p(conn) = new_mask;
 }
 
+static bool fsnotify_conn_watches_children(
+					struct fsnotify_mark_connector *conn)
+{
+	if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
+		return false;
+
+	return fsnotify_inode_watches_children(fsnotify_conn_inode(conn));
+}
+
+static void fsnotify_conn_set_children_dentry_flags(
+					struct fsnotify_mark_connector *conn)
+{
+	if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
+		return;
+
+	fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
+}
+
 /*
  * Calculate mask of events for a list of marks. The caller must make sure
  * connector and connector->obj cannot disappear under us.  Callers achieve
@@ -140,15 +158,23 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
  */
 void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
 {
+	bool update_children;
+
 	if (!conn)
 		return;
 
 	spin_lock(&conn->lock);
+	update_children = !fsnotify_conn_watches_children(conn);
 	__fsnotify_recalc_mask(conn);
+	update_children &= fsnotify_conn_watches_children(conn);
 	spin_unlock(&conn->lock);
-	if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
-		__fsnotify_update_child_dentry_flags(
-					fsnotify_conn_inode(conn));
+	/*
+	 * Set children's PARENT_WATCHED flags only if parent started watching.
+	 * When parent stops watching, we clear false positive PARENT_WATCHED
+	 * flags lazily in __fsnotify_parent().
+	 */
+	if (update_children)
+		fsnotify_conn_set_children_dentry_flags(conn);
 }
 
 /* Free all connectors queued for freeing once SRCU period ends */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index a2e42d3cd87c..ffb1bafb044e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -437,12 +437,14 @@ static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
 
 static inline int fsnotify_inode_watches_children(struct inode *inode)
 {
+	__u32 parent_mask = READ_ONCE(inode->i_fsnotify_mask);
+
 	/* FS_EVENT_ON_CHILD is set if the inode may care */
-	if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD))
+	if (!(parent_mask & FS_EVENT_ON_CHILD))
 		return 0;
 	/* this inode might care about child events, does it care about the
 	 * specific set of events that can happen on a child? */
-	return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD;
+	return parent_mask & FS_EVENTS_POSS_ON_CHILD;
 }
 
 /*
@@ -456,7 +458,7 @@ static inline void fsnotify_update_flags(struct dentry *dentry)
 	/*
 	 * Serialisation of setting PARENT_WATCHED on the dentries is provided
 	 * by d_lock. If inotify_inode_watched changes after we have taken
-	 * d_lock, the following __fsnotify_update_child_dentry_flags call will
+	 * d_lock, the following fsnotify_set_children_dentry_flags call will
 	 * find our entry, so it will spin until we complete here, and update
 	 * us with the new state.
 	 */
-- 
Gitee

From f28bba33e8c82dc11dfd41d1a22765f9684f7543 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 29 Apr 2021 22:57:29 -0700
Subject: [PATCH 4/5] mm: add remap_pfn_range_notrack

mainline inclusion
from mainline-5.13
commit 74ffa5a3e68504dd289135b1cf0422c19ffb3f2e
category: bugfix
issue: #IB2ZTL
CVE: CVE-2024-47674

Signed-off-by: zhangshuqi
---------------------------------------

Patch series "add remap_pfn_range_notrack instead of reinventing it in
i915", v2.

i915 has some reason to want to avoid the track_pfn_remap overhead in
remap_pfn_range.  Add a function to the core VM to do just that rather
than reinventing the functionality poorly in the driver.

Note that the remap_io_sg path does get exercised when using Xorg on my
Thinkpad X1, so this should be considered lightly tested; I've not
managed to hit the remap_io_mapping path at all.

This patch (of 4):

Add a version of remap_pfn_range that does not call track_pfn_range.
This will be used to fix horrible abuses of VM internals in the i915
driver.
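[Editor's aside, not part of Christoph's series: a minimal sketch of how a
driver mmap handler typically calls remap_pfn_range(), the interface this
patch splits. mydev_mmap and mydev_bar_pfn are hypothetical names invented
for the sketch; only the remap_pfn_range()/remap_pfn_range_notrack()
signatures come from the patch itself.]

```c
#include <linux/fs.h>
#include <linux/mm.h>

/* hypothetical device state: first PFN of a contiguous BAR, set at probe */
static unsigned long mydev_bar_pfn;

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/*
	 * The tracked interface: remap_pfn_range() calls track_pfn_remap()
	 * internally to validate/record the caching attributes.
	 *
	 * A caller that has already pre-validated vma->vm_page_prot (the
	 * i915 case this series targets) could instead call
	 * remap_pfn_range_notrack() with the same arguments.
	 */
	return remap_pfn_range(vma, vma->vm_start,
			       mydev_bar_pfn + vma->vm_pgoff, size,
			       vma->vm_page_prot);
}
```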
Link: https://lkml.kernel.org/r/20210326055505.1424432-1-hch@lst.de
Link: https://lkml.kernel.org/r/20210326055505.1424432-2-hch@lst.de
Signed-off-by: Christoph Hellwig
Acked-by: Daniel Vetter
Cc: Jani Nikula
Cc: Joonas Lahtinen
Cc: Rodrigo Vivi
Cc: Chris Wilson
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: zhangshuqi
---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 51 ++++++++++++++++++++++++++++------------------
 2 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9ed1be47c8cb..181830659484 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2792,6 +2792,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot);
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
 int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
 			struct page **pages, unsigned long *num);
diff --git a/mm/memory.c b/mm/memory.c
index ecfeda3db9ff..756907e5ddf2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2295,26 +2295,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 	return 0;
 }
 
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
+ * must have pre-validated the caching bits of the pgprot_t.
  */
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
-		    unsigned long pfn, unsigned long size, pgprot_t prot)
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
 	pgd_t *pgd;
 	unsigned long next;
 	unsigned long end = addr + PAGE_ALIGN(size);
 	struct mm_struct *mm = vma->vm_mm;
-	unsigned long remap_pfn = pfn;
 	int err;
 
 	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
@@ -2344,10 +2335,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		vma->vm_pgoff = pfn;
 	}
 
-	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
-	if (err)
-		return -EINVAL;
-
 	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 
 	BUG_ON(addr >= end);
@@ -2359,12 +2346,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		err = remap_p4d_range(mm, pgd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
-			break;
+			return err;
 	} while (pgd++, addr = next, addr != end);
 
+	return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		    unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	int err;
+
+	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
 	if (err)
-		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
+		return -EINVAL;
 
+	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+	if (err)
+		untrack_pfn(vma, pfn, PAGE_ALIGN(size));
 	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
-- 
Gitee

From 1bfbcbd2e26248f05c51e6f2f4070e23b8a4bc55 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 11 Sep 2024 17:11:23 -0700
Subject: [PATCH 5/5] mm: avoid leaving partial pfn mappings around in error case

mainline inclusion
from mainline-6.11
commit 79a61cc3fc0466ad2b7b89618a6157785f0293b3
category: bugfix
issue: #IB2ZTL
CVE: CVE-2024-47674

Signed-off-by: zhangshuqi
---------------------------------------

As Jann points out, PFN mappings are special, because unlike normal
memory mappings, there is no lifetime information associated with the
mapping - it is just a raw mapping of PFNs with no reference counting of
a 'struct page'.

That's all very much intentional, but it does mean that it's easy to mess
up the cleanup in case of errors.  Yes, a failed mmap() will always
eventually clean up any partial mappings, but without any explicit
lifetime in the page table mapping itself, it's very easy to do the error
handling in the wrong order.

In particular, it's easy to mistakenly free the physical backing store
before the page tables are actually cleaned up and (temporarily) have
stale dangling PTE entries.

To make this situation less error-prone, just make sure that any partial
pfn mapping is torn down early, before any other error handling.

Reported-and-tested-by: Jann Horn
Cc: Andrew Morton
Cc: Jason Gunthorpe
Cc: Simona Vetter
Signed-off-by: Linus Torvalds
Signed-off-by: zhangshuqi
---
 mm/memory.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 756907e5ddf2..5bf3bb445fd5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2295,11 +2295,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 	return 0;
 }
 
-/*
- * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
- * must have pre-validated the caching bits of the pgprot_t.
- */
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
 		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
 	pgd_t *pgd;
@@ -2352,6 +2348,27 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
 	return 0;
 }
 
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
+ * must have pre-validated the caching bits of the pgprot_t.
+ */
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
+
+	if (!error)
+		return 0;
+
+	/*
+	 * A partial pfn range mapping is dangerous: it does not
+	 * maintain page reference counts, and callers may free
+	 * pages due to the error. So zap it early.
+	 */
+	zap_page_range_single(vma, addr, size, NULL);
+	return error;
+}
+
 /**
  * remap_pfn_range - remap kernel memory to userspace
  * @vma: user vma to map to
-- 
Gitee
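[Editor's closing note, not part of the series: the sketch below shows the
caller-side error pattern that PATCH 5/5 hardens against. mydev_map_buffer
and its arguments are hypothetical; only remap_pfn_range() and the early
zap_page_range_single() behaviour come from the patches above.]

```c
#include <linux/gfp.h>
#include <linux/mm.h>

/* hypothetical caller: map npages of driver-owned pages into a user vma */
static int mydev_map_buffer(struct vm_area_struct *vma, struct page *pages,
			    unsigned long npages)
{
	unsigned long size = npages << PAGE_SHIFT;
	int err;

	err = remap_pfn_range(vma, vma->vm_start, page_to_pfn(pages), size,
			      vma->vm_page_prot);
	if (err) {
		/*
		 * The caller frees its backing store on error. Before this
		 * patch, a partially populated mapping could still hold PTEs
		 * pointing at these pages for a short window; with the early
		 * zap_page_range_single() in remap_pfn_range_notrack(), the
		 * partial mapping is already gone by the time we get here.
		 */
		__free_pages(pages, get_order(size));
		return err;
	}

	return 0;
}
```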