From 3f319e7e26cbd93293e6a8b09589ac27411cba32 Mon Sep 17 00:00:00 2001 From: Daniel Stodden Date: Sun, 22 Dec 2024 19:39:08 -0800 Subject: [PATCH 01/13] PCI/ASPM: Fix link state exit during switch upstream function removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.236 commit 62db339ecc3d58d8fd83a9e4d80061cd943bb6e2 category: bugfix issue: #ICK5MI CVE: CVE-2024-58093 Signed-off-by: Tengda Wu --------------------------------------- [ Upstream commit cbf937dcadfd571a434f8074d057b32cd14fbea5 ] Before 456d8aa37d0f ("PCI/ASPM: Disable ASPM on MFD function removal to avoid use-after-free"), we would free the ASPM link only after the last function on the bus pertaining to the given link was removed. That was too late: if function 0 was removed before a sibling function, link->downstream would afterwards point to freed memory. After the above change, we freed the ASPM parent link state upon any function removal on the bus pertaining to a given link. That is too early: if the link leads to a PCIe switch with a multi-function device on the upstream port, removing functions other than 0 first would free a link that still remains the parent_link of the remaining downstream ports. The resulting GPFs are especially frequent during hot-unplug, because pciehp removes devices on the link bus in reverse order. On such a switch, function 0 is the virtual P2P bridge to the internal bus. Free the link state exactly when function 0 is removed -- before the parent link is obsolete, but after all subordinate links are gone. Link: https://lore.kernel.org/r/e12898835f25234561c9d7de4435590d957b85d9.1734924854.git.dns@arista.com Fixes: 456d8aa37d0f ("PCI/ASPM: Disable ASPM on MFD function removal to avoid use-after-free") Signed-off-by: Daniel Stodden Signed-off-by: Bjorn Helgaas [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński Signed-off-by: Sasha Levin --- drivers/pci/pcie/aspm.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 8ab8abd79e89..94b0b32340a8 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -1014,16 +1014,16 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev) parent_link = link->parent; /* - * link->downstream is a pointer to the pci_dev of function 0. If - * we remove that function, the pci_dev is about to be deallocated, - * so we can't use link->downstream again. Free the link state to - * avoid this. + * Free the parent link state, no later than function 0 (i.e. + * link->downstream) being removed. * - * If we're removing a non-0 function, it's possible we could - * retain the link state, but PCIe r6.0, sec 7.5.3.7, recommends - * programming the same ASPM Control value for all functions of - * multi-function devices, so disable ASPM for all of them. + * Do not free the link state any earlier. If function 0 is a + * switch upstream port, this link state is parent_link to all + * subordinate ones. 
*/ + if (pdev != link->downstream) + goto out; + pcie_config_aspm_link(link, 0); list_del(&link->sibling); free_link_state(link); @@ -1034,6 +1034,7 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev) pcie_config_aspm_path(parent_link); } + out: mutex_unlock(&aspm_lock); up_read(&pci_bus_sem); } -- Gitee From f1120b5867592e13b151a52164437fc95be62e2b Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 17 Apr 2025 15:25:08 +0800 Subject: [PATCH 02/13] nfs: handle failure of nfs_get_lock_context in unlock path stable inclusion from stable-v5.10.238 commit 4c189fd40a09a03f9a900bedb2d9064f1734d72a category: bugfix issue: #ICK5MI CVE: CVE-2025-38023 Signed-off-by: Tengda Wu --------------------------------------- [ Upstream commit c457dc1ec770a22636b473ce5d35614adfe97636 ] When memory is insufficient, the allocation of nfs_lock_context in nfs_get_lock_context() fails and returns -ENOMEM. If we mistakenly treat an nfs4_unlockdata structure (whose l_ctx member has been set to -ENOMEM) as valid and proceed to execute rpc_run_task(), this will trigger a NULL pointer dereference in nfs4_locku_prepare. For example: BUG: kernel NULL pointer dereference, address: 000000000000000c PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 15 UID: 0 PID: 12 Comm: kworker/u64:0 Not tainted 6.15.0-rc2-dirty #60 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 Workqueue: rpciod rpc_async_schedule RIP: 0010:nfs4_locku_prepare+0x35/0xc2 Code: 89 f2 48 89 fd 48 c7 c7 68 69 ef b5 53 48 8b 8e 90 00 00 00 48 89 f3 RSP: 0018:ffffbbafc006bdb8 EFLAGS: 00010246 RAX: 000000000000004b RBX: ffff9b964fc1fa00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: fffffffffffffff4 RDI: ffff9ba53fddbf40 RBP: ffff9ba539934000 R08: 0000000000000000 R09: ffffbbafc006bc38 R10: ffffffffb6b689c8 R11: 0000000000000003 R12: ffff9ba539934030 R13: 0000000000000001 R14: 0000000004248060 R15: ffffffffb56d1c30 FS: 0000000000000000(0000) GS:ffff9ba5881f0000(0000) knlGS:00000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000000c CR3: 000000093f244000 CR4: 00000000000006f0 Call Trace: __rpc_execute+0xbc/0x480 rpc_async_schedule+0x2f/0x40 process_one_work+0x232/0x5d0 worker_thread+0x1da/0x3d0 ? __pfx_worker_thread+0x10/0x10 kthread+0x10d/0x240 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Modules linked in: CR2: 000000000000000c ---[ end trace 0000000000000000 ]--- Free the allocated nfs4_unlockdata when nfs_get_lock_context() fails and return NULL to terminate subsequent rpc_run_task, preventing NULL pointer dereference. 
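Illustration (not part of the upstream patch): a minimal user-space sketch of the same unwind pattern, with hypothetical names throughout; err_ptr()/is_err() model the kernel's ERR_PTR()/IS_ERR(), and alloc_unlock_data() stands in for nfs4_alloc_unlockdata(). The point is that the container is freed as soon as the secondary acquisition fails, so callers only ever see a fully initialized object or NULL and never reach the dispatch step:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Model of the kernel's ERR_PTR()/IS_ERR(): errnos encoded in a pointer. */
#define MAX_ERRNO 4095
static void *err_ptr(long err) { return (void *)err; }
static int is_err(const void *p) { return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO; }

struct lock_context { int id; };
struct unlock_data { struct lock_context *l_ctx; };

/* Stand-in for nfs_get_lock_context(): fails with -ENOMEM on demand. */
static struct lock_context *get_lock_context(int fail)
{
    return fail ? err_ptr(-ENOMEM) : calloc(1, sizeof(struct lock_context));
}

/* Mirrors the fixed nfs4_alloc_unlockdata(): unwind on failure so the
 * caller never reaches the rpc_run_task() stage with an invalid l_ctx. */
static struct unlock_data *alloc_unlock_data(int fail)
{
    struct unlock_data *p = calloc(1, sizeof(*p));
    struct lock_context *l_ctx;

    if (!p)
        return NULL;
    l_ctx = get_lock_context(fail);
    if (is_err(l_ctx)) {
        free(p);        /* the fix: free the container ... */
        return NULL;    /* ... and stop before dispatch */
    }
    p->l_ctx = l_ctx;
    return p;
}

int main(void)
{
    printf("ok:   %p\n", (void *)alloc_unlock_data(0));
    printf("fail: %p\n", (void *)alloc_unlock_data(1));
    return 0;
}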
Fixes: f30cb757f680 ("NFS: Always wait for I/O completion before unlock") Signed-off-by: Li Lingfeng Reviewed-by: Jeff Layton Link: https://lore.kernel.org/r/20250417072508.3850532-1-lilingfeng3@huawei.com Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/nfs4proc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f391f01a7fd4..0399373f8a67 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6732,10 +6732,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_unlockdata *p; struct nfs4_state *state = lsp->ls_state; struct inode *inode = state->inode; + struct nfs_lock_context *l_ctx; p = kzalloc(sizeof(*p), GFP_NOFS); if (p == NULL) return NULL; + l_ctx = nfs_get_lock_context(ctx); + if (!IS_ERR(l_ctx)) { + p->l_ctx = l_ctx; + } else { + kfree(p); + return NULL; + } p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; @@ -6743,7 +6751,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->lsp = lsp; /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); - p->l_ctx = nfs_get_lock_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); -- Gitee From 8e3481c18f2937c795eb69b19fed918c28fdecb8 Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Sun, 18 May 2025 18:41:02 -0400 Subject: [PATCH 03/13] crypto: algif_hash - fix double free in hash_accept stable inclusion from stable-v5.10.238 commit bf7bba75b91539e93615f560893a599c1e1c98bf category: bugfix issue: #ICK5MI CVE: CVE-2025-38079 Signed-off-by: Tengda Wu --------------------------------------- commit b2df03ed4052e97126267e8c13ad4204ea6ba9b6 upstream. If accept(2) is called on socket type algif_hash with MSG_MORE flag set and crypto_ahash_import fails, sk2 is freed. However, it is also freed in af_alg_release, leading to slab-use-after-free error. Fixes: fe869cdb89c9 ("crypto: algif_hash - User-space interface for hash operations") Cc: Signed-off-by: Ivan Pravdin Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/algif_hash.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 50f7b22f1b48..be21cfdc6dbc 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -262,10 +262,6 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags, return err; err = crypto_ahash_import(&ctx2->req, state); - if (err) { - sock_orphan(sk2); - sock_put(sk2); - } return err; } -- Gitee From c9de196ff44f3242f1d6daf0bd427609a1e04cff Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Apr 2025 20:22:08 +0800 Subject: [PATCH 04/13] f2fs: fix to do sanity check on sbi->total_valid_block_count stable inclusion from stable-v5.10.239 commit f1b743c1955151bd392539b739a3ad155296be13 category: bugfix issue: #ICK5MI CVE: CVE-2025-38163 Signed-off-by: Tengda Wu --------------------------------------- [ Upstream commit 05872a167c2cab80ef186ef23cc34a6776a1a30c ] syzbot reported a f2fs bug as below: ------------[ cut here ]------------ kernel BUG at fs/f2fs/f2fs.h:2521! 
RIP: 0010:dec_valid_block_count+0x3b2/0x3c0 fs/f2fs/f2fs.h:2521 Call Trace: f2fs_truncate_data_blocks_range+0xc8c/0x11a0 fs/f2fs/file.c:695 truncate_dnode+0x417/0x740 fs/f2fs/node.c:973 truncate_nodes+0x3ec/0xf50 fs/f2fs/node.c:1014 f2fs_truncate_inode_blocks+0x8e3/0x1370 fs/f2fs/node.c:1197 f2fs_do_truncate_blocks+0x840/0x12b0 fs/f2fs/file.c:810 f2fs_truncate_blocks+0x10d/0x300 fs/f2fs/file.c:838 f2fs_truncate+0x417/0x720 fs/f2fs/file.c:888 f2fs_setattr+0xc4f/0x12f0 fs/f2fs/file.c:1112 notify_change+0xbca/0xe90 fs/attr.c:552 do_truncate+0x222/0x310 fs/open.c:65 handle_truncate fs/namei.c:3466 [inline] do_open fs/namei.c:3849 [inline] path_openat+0x2e4f/0x35d0 fs/namei.c:4004 do_filp_open+0x284/0x4e0 fs/namei.c:4031 do_sys_openat2+0x12b/0x1d0 fs/open.c:1429 do_sys_open fs/open.c:1444 [inline] __do_sys_creat fs/open.c:1522 [inline] __se_sys_creat fs/open.c:1516 [inline] __x64_sys_creat+0x124/0x170 fs/open.c:1516 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/syscall_64.c:94 The reason is that in the fuzzed image, sbi->total_valid_block_count is inconsistent with the mapped blocks indexed by the inode, so we should not trigger a panic for such a case; instead, print a log message and set the fsck flag. Fixes: 39a53e0ce0df ("f2fs: add superblock and major in-memory structure") Reported-by: syzbot+8b376a77b2f364097fbe@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/67f3c0b2.050a0220.396535.0547.GAE@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/f2fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2c611f112171..67acfdd15dbe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2094,8 +2094,14 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - sbi->total_valid_block_count -= (block_t)count; + if (unlikely(sbi->total_valid_block_count < count)) { + f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%lu, count:%u", + sbi->total_valid_block_count, inode->i_ino, count); + sbi->total_valid_block_count = 0; + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count -= count; + } if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks = min(sbi->reserved_blocks, -- Gitee From 812193e0537f12f57b0aecec92c0045753815b55 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 22:31:43 -0700 Subject: [PATCH 05/13] bpf: fix potential 32-bit overflow when accessing ARRAY map element mainline inclusion from mainline-v6.0-rc1 commit 87ac0d600943994444e24382a87aa19acc4cd3d4 category: bugfix issue: #ICK5MI CVE: CVE-2022-50167 Signed-off-by: Tengda Wu --------------------------------------- If a BPF array map is bigger than 4GB, the element pointer calculation can overflow because both index and elem_size are u32. Fix this everywhere by forcing 64-bit multiplication. Extract this formula into a separate small helper and use it consistently in various places. The speculation-preventing formula utilizing the index_mask trick is left as-is, but explicit u64 casts are added in both places. 
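Illustration (not part of the upstream patch): the wrap is easy to reproduce in isolation. In this stand-alone sketch with arbitrary sizes, the u32 product is reduced modulo 2^32 before any pointer addition would happen, while the u64-cast form gives the intended offset:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t elem_size = 1u << 16;  /* 64 KiB per element */
    uint32_t index     = 1u << 17;  /* true offset would be 8 GiB */

    /* Buggy form: u32 * u32 is evaluated in 32 bits and wraps to 0
     * here (on the usual ILP32/LP64 ABIs where int is 32 bits). */
    uint64_t wrapped = elem_size * index;

    /* Fixed form: force the multiplication into 64 bits first. */
    uint64_t correct = (uint64_t)elem_size * index;

    printf("wrapped = %llu, correct = %llu\n",
           (unsigned long long)wrapped, (unsigned long long)correct);
    return 0;
}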
Fixes: c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220715053146.1291891-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5102338129d5..f1f2504866af 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -171,6 +171,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return &array->map; } +static void *array_map_elem_ptr(struct bpf_array* array, u32 index) +{ + return array->value + (u64)array->elem_size * index; +} + /* Called from syscall or from eBPF program */ static void *array_map_lookup_elem(struct bpf_map *map, void *key) { @@ -180,7 +185,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * (index & array->index_mask); + return array->value + (u64)array->elem_size * (index & array->index_mask); } static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, @@ -332,7 +337,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, value, map->value_size); } else { val = array->value + - array->elem_size * (index & array->index_mask); + (u64)array->elem_size * (index & array->index_mask); if (map_flags & BPF_F_LOCK) copy_map_value_locked(map, val, value, false); else @@ -527,7 +532,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; - return array->value + array->elem_size * index; + return array_map_elem_ptr(array, index); } static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -546,7 +551,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; - return array->value + array->elem_size * index; + return array_map_elem_ptr(array, index); } static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) -- Gitee From 64d973e5c76d74e1d31e179da7e27fcb3db5614b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 4 Aug 2022 10:21:19 +0100 Subject: [PATCH 06/13] sched/core: Do not requeue task on CPU excluded from cpus_mask mainline inclusion from mainline-v6.0-rc1 commit 751d4cbc43879229dbc124afefe240b70fd29a85 category: bugfix issue: #ICK5MI CVE: CVE-2022-50100 Signed-off-by: Tengda Wu --------------------------------------- The following warning was triggered on a large machine early in boot on a distribution kernel but the same problem should also affect mainline. WARNING: CPU: 439 PID: 10 at ../kernel/workqueue.c:2231 process_one_work+0x4d/0x440 Call Trace: rescuer_thread+0x1f6/0x360 kthread+0x156/0x180 ret_from_fork+0x22/0x30 Commit c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") optimises ttwu by queueing a task that is descheduling on the wakelist, but does not check if the task descheduling is still allowed to run on that CPU. In this warning, the problematic task is a workqueue rescue thread which checks if the rescue is for a per-cpu workqueue and running on the wrong CPU. 
While this is early in boot and it should be possible to create workers, the rescue thread may still be used if MAYDAY_INITIAL_TIMEOUT is reached or MAYDAY_INTERVAL elapses, and on a sufficiently large machine the rescue thread is used frequently. Tracing confirmed that the task should have migrated properly using the stopper thread to handle the migration. However, a parallel wakeup from udev running on another CPU that does not share the CPU cache observes p->on_cpu, uses task_cpu(p), queues the task on the old CPU and triggers the warning. Check that the wakee task that is descheduling is still allowed to run on its current CPU and, if not, wait for the descheduling to complete and select an allowed CPU. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220804092119.20137-1-mgorman@techsingularity.net --- kernel/sched/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 49ed67b62e9f..5e4269a70c44 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2783,8 +2783,12 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } -static inline bool ttwu_queue_cond(int cpu, int wake_flags) +static inline bool ttwu_queue_cond(struct task_struct *p, int cpu, int wake_flags) { + /* Ensure the task will still be allowed to run on the CPU. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + /* * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. @@ -2810,7 +2814,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu, wake_flags)) { if (WARN_ON_ONCE(cpu == smp_processor_id())) return false; -- Gitee From 198b8a453377adc6d8421ab49637cecc1d7f0690 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:35 -0800 Subject: [PATCH 07/13] bpf: Use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem mainline inclusion from mainline-v5.18-rc1 commit 3486bedd99196ecdfe99c0ab5b67ad3c47e8a8fa category: bugfix issue: #ICK5MI CVE: CVE-2022-49967 Signed-off-by: Tengda Wu --------------------------------------- This enables sub-page memory charging and allocation. 
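Illustration (not part of the upstream patch): a simplified user-space model of the byte-granular accounting this enables; the names, the limit value, and the simplified over-limit policy (the kernel additionally consults bpf_capable()) are assumptions of the sketch. Page-based callers convert with pages << PAGE_SHIFT, while small objects such as a trampoline image can now charge far less than a whole page:

#include <stdatomic.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1L << PAGE_SHIFT)

static atomic_long jit_current;
static const long jit_limit = 64 * PAGE_SIZE;  /* hypothetical cap */

/* After the change: the counter is in bytes, so a 100-byte object
 * charges 100 bytes instead of a full page. Over the limit, the
 * charge is undone and refused. */
static int charge(long size)
{
    if (atomic_fetch_add(&jit_current, size) + size > jit_limit) {
        atomic_fetch_sub(&jit_current, size);
        return -1;
    }
    return 0;
}

static void uncharge(long size)
{
    atomic_fetch_sub(&jit_current, size);
}

int main(void)
{
    charge(100);             /* sub-page charge, now representable */
    charge(3 * PAGE_SIZE);   /* page-based caller: pages << PAGE_SHIFT */
    printf("charged: %ld bytes\n", atomic_load(&jit_current));
    uncharge(100);
    uncharge(3 * PAGE_SIZE);
    return 0;
}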
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-3-song@kernel.org --- include/linux/bpf.h | 4 ++-- kernel/bpf/core.c | 17 ++++++++--------- kernel/bpf/trampoline.c | 6 +++--- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 112a4d2ed754..16e6d95c285b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -772,8 +772,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); -int bpf_jit_charge_modmem(u32 pages); -void bpf_jit_uncharge_modmem(u32 pages); +int bpf_jit_charge_modmem(u32 size); +void bpf_jit_uncharge_modmem(u32 size); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 95f0085ac71f..a1ee727a0d6e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -836,12 +836,11 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(pages, &bpf_jit_current) > - (bpf_jit_limit >> PAGE_SHIFT)) { + if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { if (!bpf_capable()) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } @@ -849,9 +848,9 @@ int bpf_jit_charge_modmem(u32 pages) return 0; } -void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 size) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) @@ -882,11 +881,11 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); pages = size / PAGE_SIZE; - if (bpf_jit_charge_modmem(pages)) + if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); return NULL; } @@ -909,7 +908,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) u32 pages = hdr->pages; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(pages << PAGE_SHIFT); } /* This symbol is only overridden by archs that have different diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 87becf77cc75..4ddbf3d75737 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -172,7 +172,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work) im = container_of(work, struct bpf_tramp_image, work); bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -269,7 +269,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) if (!im) goto out; - err = bpf_jit_charge_modmem(1); + err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) goto out_free_im; @@ -291,7 +291,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) out_free_image: bpf_jit_free_exec(im->image); out_uncharge: - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: kfree(im); out: -- Gitee From 9810827a6bdb7aadeb8006aad451d2ff1e9c6c6d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 14:58:04 -0700 
Subject: [PATCH 08/13] bpf: Fix a data-race around bpf_jit_limit. mainline inclusion from mainline-v6.0-rc4 commit 0947ae1121083d363d522ff7518ee72b55bd8d29 category: bugfix issue: #ICK5MI CVE: CVE-2022-49967 Signed-off-by: Tengda Wu --------------------------------------- While reading bpf_jit_limit, it can be changed concurrently via sysctl, WRITE_ONCE() in __do_proc_doulongvec_minmax(). The size of bpf_jit_limit is long, so we need to add a paired READ_ONCE() to avoid load-tearing. Fixes: ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220823215804.2177-1-kuniyu@amazon.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a1ee727a0d6e..464f4a13a3c5 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -838,7 +838,7 @@ pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { + if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; -- Gitee From d1ecc652e67b9fdb22986911c46258aa2f0f35c8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Apr 2025 15:41:51 -0400 Subject: [PATCH 09/13] __legitimize_mnt(): check for MNT_SYNC_UMOUNT should be under mount_lock stable inclusion from stable-v5.10.238 commit b89eb56a378b7b2c1176787fc228d0a57172bdd5 category: bugfix issue: #ICK5MI CVE: CVE-2025-38058 Signed-off-by: Tengda Wu --------------------------------------- [ Upstream commit 250cf3693060a5f803c5f1ddc082bb06b16112a9 ] ... or we risk stealing final mntput from sync umount - raising mnt_count after umount(2) has verified that victim is not busy, but before it has set MNT_SYNC_UMOUNT; in that case __legitimize_mnt() doesn't see that it's safe to quietly undo mnt_count increment and leaves dropping the reference to caller, where it'll be a full-blown mntput(). Check under mount_lock is needed; leaving the current one done before taking that makes no sense - it's nowhere near common enough to bother with. Reviewed-by: Christian Brauner Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/namespace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 61c88343cc94..7091dfc57d4f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -573,12 +573,8 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) smp_mb(); // see mntput_no_expire() if (likely(!read_seqretry(&mount_lock, seq))) return 0; - if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { - mnt_add_count(mnt, -1); - return 1; - } lock_mount_hash(); - if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { + if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; -- Gitee From 78e17ae041239523bf90bb9dc73a5c4e57c3536d Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 9 Apr 2025 15:59:56 +0200 Subject: [PATCH 10/13] espintcp: fix skb leaks mainline inclusion from mainline-v6.15 commit 63c1f19a3be3169e51a5812d22a6d0c879414076 category: bugfix issue: #ICK5MI CVE: CVE-2025-38057 Signed-off-by: Tengda Wu --------------------------------------- A few error paths are missing a kfree_skb. 
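Illustration (not part of the upstream patch): the contract being restored is consume-on-error. Once a function takes ownership of an skb, every early return must free it, because the caller treats the handoff as final. A minimal sketch with generic, hypothetical names (buf/buf_free() stand in for sk_buff/kfree_skb(); not the xfrm code):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct buf { char *data; };

static struct buf *buf_alloc(void) { return calloc(1, sizeof(struct buf)); }
static void buf_free(struct buf *b) { free(b); }  /* kfree_skb() stand-in */

static int queue_full;  /* simulated backlog condition */

/* Takes ownership of 'b': on every error path the buffer must be
 * freed here, since the caller will not free it after handing it off. */
static int enqueue(struct buf *b)
{
    if (queue_full) {
        buf_free(b);        /* the fix: release before bailing out */
        return -ENOBUFS;
    }
    /* ... append to the queue; the queue now owns 'b' ... */
    return 0;
}

int main(void)
{
    queue_full = 1;
    int err = enqueue(buf_alloc());
    printf("enqueue: %d (buffer freed internally)\n", err);
    return 0;
}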
Fixes: e27cca96cd68 ("xfrm: add espintcp (RFC 8229)") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 4 +++- net/ipv6/esp6.c | 4 +++- net/xfrm/espintcp.c | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 412a3c153cad..5e716ee4db83 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -198,8 +198,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) sk = esp_find_tcp_sk(x); err = PTR_ERR_OR_ZERO(sk); - if (err) + if (err) { + kfree_skb(skb); goto out; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index fddc811bbde1..5382beac4e3c 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -214,8 +214,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) sk = esp6_find_tcp_sk(x); err = PTR_ERR_OR_ZERO(sk); - if (err) + if (err) { + kfree_skb(skb); goto out; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 24ca49ecebea..b32a8db28084 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -170,8 +170,11 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) { struct espintcp_ctx *ctx = espintcp_getctx(sk); - if (skb_queue_len(&ctx->out_queue) >= READ_ONCE(netdev_max_backlog)) + if (skb_queue_len(&ctx->out_queue) >= + READ_ONCE(netdev_max_backlog)) { + kfree_skb(skb); return -ENOBUFS; + } __skb_queue_tail(&ctx->out_queue, skb); -- Gitee From b2b778aeb5f77f3d7acde991d8f250b78b990b98 Mon Sep 17 00:00:00 2001 From: Michael Jeanson Date: Thu, 6 Mar 2025 16:12:21 -0500 Subject: [PATCH 11/13] rseq: Fix segfault on registration when rseq_cs is non-zero mainline inclusion from mainline-v6.15-rc1 commit fd881d0a085fc54354414aed990ccf05f282ba53 category: bugfix issue: #ICK5MI CVE: CVE-2025-38067 Signed-off-by: Tengda Wu --------------------------------------- The rseq_cs field is documented as being set to 0 by user-space prior to registration, however this is not currently enforced by the kernel. This can result in a segfault on return to user-space if the value stored in the rseq_cs field doesn't point to a valid struct rseq_cs. The correct solution to this would be to fail the rseq registration when the rseq_cs field is non-zero. However, some older versions of glibc will reuse the rseq area of previous threads without clearing the rseq_cs field and will also terminate the process if the rseq registration fails in a secondary thread. This wasn't caught in testing because in this case the leftover rseq_cs does point to a valid struct rseq_cs. What we can do is clear the rseq_cs field on registration when it's non-zero which will prevent segfaults on registration and won't break the glibc versions that reuse rseq areas on thread creation. Signed-off-by: Michael Jeanson Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250306211223.109455-1-mjeanson@efficios.com --- kernel/rseq.c | 60 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/kernel/rseq.c b/kernel/rseq.c index 6ca29dddceab..2f8a1c6ac0a3 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -112,6 +112,29 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) return 0; } +/* + * Get the user-space pointer value stored in the 'rseq_cs' field. 
+ */ +static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +{ + if (!rseq_cs) + return -EFAULT; + +#ifdef CONFIG_64BIT + if (get_user(*rseq_cs, &rseq->rseq_cs)) + return -EFAULT; +#else + if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) + return -EFAULT; +#endif + + return 0; +} + +/* + * If the rseq_cs field of 'struct rseq' contains a valid pointer to + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. + */ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; @@ -120,17 +143,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; -#ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) - return -EFAULT; -#endif + ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); + if (ret) + return ret; + + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } + /* Check that the pointer value fits in the user-space process space. */ if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; @@ -199,7 +221,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) return !!(event_mask & ~flags); } -static int clear_rseq_cs(struct task_struct *t) +static int clear_rseq_cs(struct rseq __user *rseq) { /* * The rseq_cs field is set to NULL on preemption or signal @@ -210,9 +232,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs); + return put_user(0UL, &rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) + if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) return -EFAULT; return 0; #endif @@ -244,11 +266,11 @@ static int rseq_ip_fixup(struct pt_regs *regs) * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t); + return clear_rseq_cs(t->rseq); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; - ret = clear_rseq_cs(t); + ret = clear_rseq_cs(t->rseq); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, @@ -324,6 +346,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; + u64 rseq_cs; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -369,6 +392,19 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; + + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. 
+ */ + if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) + return -EFAULT; + if (rseq_cs && clear_rseq_cs(rseq)) + return -EFAULT; + current->rseq = rseq; current->rseq_sig = sig; /* -- Gitee From 523da3dfc69ddac40b1cf8bf1e4453ddfc473455 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:36 -0800 Subject: [PATCH 12/13] genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie mainline inclusion from mainline-v6.15-rc1 commit 1f7df3a691740a7736bbc99dc4ed536120eb4746 category: bugfix issue: #ICK5MI CVE: CVE-2025-38062 Signed-off-by: Tengda Wu --------------------------------------- The IOMMU translation for MSI message addresses has been a 2-step process, separated in time: 1) iommu_dma_prepare_msi(): A cookie pointer containing the IOVA address is stored in the MSI descriptor when an MSI interrupt is allocated. 2) iommu_dma_compose_msi_msg(): This cookie pointer is used to compute a translated message address. This has an inherent lifetime problem for the pointer stored in the cookie, which must remain valid between the two steps. However, there is no locking at the irq layer that helps protect the lifetime. Today, this works under the assumption that the iommu domain is not changed while MSI interrupts are being programmed. This is true for normal DMA API users within the kernel, as the iommu domain is attached before the driver is probed and cannot be changed while a driver is attached. Classic VFIO type1 also prevented changing the iommu domain while VFIO was running, as it does not support changing the "container" after starting up. However, iommufd has improved this so that the iommu domain can be changed during VFIO operation. This potentially allows userspace to directly race VFIO_DEVICE_ATTACH_IOMMUFD_PT (which calls iommu_attach_group()) and VFIO_DEVICE_SET_IRQS (which calls into iommu_dma_compose_msi_msg()). This potentially causes both the cookie pointer and the unlocked call to iommu_get_domain_for_dev() on the MSI translation path to become use-after-free hazards. Fix the MSI cookie UAF by removing the cookie pointer. The translated IOVA address is already known during iommu_dma_prepare_msi() and cannot change. Thus, it can simply be stored as an integer in the MSI descriptor. The other UAF related to iommu_get_domain_for_dev() will be addressed in the patch "iommu: Make iommu_dma_prepare_msi() into a generic operation" by using the IOMMU group mutex. 
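Illustration (not part of the upstream patch): the pack/compose arithmetic the new fields carry can be checked in a few lines. The values below are made up, and the 58/6 bit-field layout on u64 relies on compiler behavior the kernel accepts for these private fields. Because the MSI page is granule-aligned, the low 'shift' bits of the IOVA are always zero, so only the high bits are stored; composition shifts them back up and ORs in the preserved low bits of the original message address:

#include <stdint.h>
#include <stdio.h>

struct desc {
    uint64_t iommu_msi_iova  : 58;  /* IOVA >> shift */
    uint64_t iommu_msi_shift : 6;   /* log2 of the MSI granule */
};

int main(void)
{
    uint64_t iova  = 0x8fee00000ULL;  /* granule-aligned IOVA (made up) */
    unsigned shift = 12;              /* 4 KiB granule */
    struct desc d = { .iommu_msi_iova = iova >> shift,
                      .iommu_msi_shift = shift };

    uint32_t addr_lo = 0xfee00ab0;    /* original message address (made up) */

    /* Compose: translated address, keeping the low 'shift' bits. */
    uint64_t msi = (uint64_t)d.iommu_msi_iova << d.iommu_msi_shift;
    uint32_t hi  = (uint32_t)(msi >> 32);
    uint32_t lo  = (uint32_t)msi | (addr_lo & ((1u << d.iommu_msi_shift) - 1));

    printf("hi=%#x lo=%#x\n", (unsigned)hi, (unsigned)lo);
    return 0;
}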
Link: https://patch.msgid.link/r/a4f2cd76b9dc1833ee6c1cf325cba57def22231c.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe --- drivers/iommu/dma-iommu.c | 28 +++++++++++++--------------- include/linux/msi.h | 33 ++++++++++++--------------------- 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index d1539b7399a9..a435bc7431e3 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1244,7 +1244,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) static DEFINE_MUTEX(msi_prepare_lock); /* see below */ if (!domain || !domain->iova_cookie) { - desc->iommu_cookie = NULL; + msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } @@ -1256,29 +1256,27 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) mutex_lock(&msi_prepare_lock); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); mutex_unlock(&msi_prepare_lock); - - msi_desc_set_iommu_cookie(desc, msi_page); - if (!msi_page) return -ENOMEM; + + msi_desc_set_iommu_msi_iova( + desc, msi_page->iova, + ilog2(cookie_msi_granule(domain->iova_cookie))); return 0; } void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) { - struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - const struct iommu_dma_msi_page *msi_page; +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; - msi_page = msi_desc_get_iommu_cookie(desc); - - if (!domain || !domain->iova_cookie || WARN_ON(!msi_page)) - return; - - msg->address_hi = upper_32_bits(msi_page->iova); - msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1; - msg->address_lo += lower_32_bits(msi_page->iova); + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); + } +#endif } static int iommu_dma_init(void) diff --git a/include/linux/msi.h b/include/linux/msi.h index 70c910b23e13..cbdaf98f2731 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -63,6 +63,10 @@ struct ti_sci_inta_msi_desc { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor + * @iommu_msi_iova: Optional shifted IOVA from the IOMMU to override the msi_addr. + * Only used if iommu_msi_shift != 0 + * @iommu_msi_shift: Indicates how many bits of the original address should be + * preserved when using iommu_msi_iova. 
* * @write_msi_msg: Callback that may be called when the MSI message * address or data changes @@ -91,7 +95,8 @@ struct msi_desc { struct msi_msg msg; struct irq_affinity_desc *affinity; #ifdef CONFIG_IRQ_MSI_IOMMU - const void *iommu_cookie; + u64 iommu_msi_iova : 58; + u64 iommu_msi_shift : 6; #endif void (*write_msi_msg)(struct msi_desc *entry, void *data); @@ -146,28 +151,14 @@ struct msi_desc { __irq < ((desc)->irq + (desc)->nvec_used); \ __irq++) -#ifdef CONFIG_IRQ_MSI_IOMMU -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) -{ - return desc->iommu_cookie; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ - desc->iommu_cookie = iommu_cookie; -} -#else -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) +static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_iova, + unsigned int msi_shift) { - return NULL; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ -} +#ifdef CONFIG_IRQ_MSI_IOMMU + desc->iommu_msi_iova = msi_iova >> msi_shift; + desc->iommu_msi_shift = msi_shift; #endif +} #ifdef CONFIG_PCI_MSI #define first_pci_msi_entry(pdev) first_msi_entry(&(pdev)->dev) -- Gitee From aca5a452c8e0119d2e7549d26cbab77083e92714 Mon Sep 17 00:00:00 2001 From: Hyejeong Choi Date: Mon, 12 May 2025 21:06:38 -0500 Subject: [PATCH 13/13] dma-buf: insert memory barrier before updating num_fences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.15-rc7 commit 72c7d62583ebce7baeb61acce6057c361f73be4a category: bugfix issue: #ICK5MI CVE: CVE-2025-38095 Signed-off-by: Tengda Wu --------------------------------------- smp_store_mb() inserts a memory barrier after the store operation, which is not what the comment above the call is aiming for, so a NULL pointer dereference can happen if the memory updates are reordered. Signed-off-by: Hyejeong Choi Fixes: a590d0fdbaa5 ("dma-buf: Update reservation shared_count after adding the new fence") CC: stable@vger.kernel.org Reviewed-by: Christian König Link: https://lore.kernel.org/r/20250513020638.GA2329653@au1-maretx-p37.eng.sarc.samsung.com Signed-off-by: Christian König --- drivers/dma-buf/dma-resv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c index 1187e5e80ede..481f7c81c6dd 100644 --- a/drivers/dma-buf/dma-resv.c +++ b/drivers/dma-buf/dma-resv.c @@ -291,7 +291,8 @@ void dma_resv_add_shared_fence(struct dma_resv *obj, struct dma_fence *fence) replace: RCU_INIT_POINTER(fobj->shared[i], fence); /* pointer update must be visible before we extend the shared_count */ - smp_store_mb(fobj->shared_count, count); + smp_wmb(); + fobj->shared_count = count; write_seqcount_end(&obj->seq); dma_fence_put(old); -- Gitee
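Illustration (not part of the upstream patch): in portable terms the fix is the classic publish pattern, where the payload store must be ordered before the store that publishes it. smp_store_mb() is a store followed by a full barrier, so it orders the count store against later accesses but not against the earlier pointer store. A loose C11 analogue with generic names (the release store plays the role of smp_wmb() plus the plain count store; this is a sketch, not the dma-resv code):

#include <stdatomic.h>
#include <stddef.h>

struct fence { int seqno; };

static _Atomic(struct fence *) slots[8];
static atomic_uint shared_count;

/* Writer: make the slot visible before publishing the new count.
 * A barrier placed *after* the count store (as smp_store_mb() does)
 * would leave these two stores free to reorder. */
void add_fence(unsigned int i, struct fence *f)
{
    atomic_store_explicit(&slots[i], f, memory_order_relaxed);
    atomic_store_explicit(&shared_count, i + 1, memory_order_release);
}

/* Reader: any slot below an acquire-read count is guaranteed valid. */
struct fence *get_fence(unsigned int i)
{
    unsigned int n = atomic_load_explicit(&shared_count, memory_order_acquire);
    return i < n ? atomic_load_explicit(&slots[i], memory_order_relaxed) : NULL;
}

int main(void)
{
    static struct fence f = { .seqno = 1 };
    add_fence(0, &f);
    return get_fence(0) == &f ? 0 : 1;
}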