From 7266b2298f4d63604d6caa845996a436e29eb535 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Tue, 8 Nov 2022 13:11:31 +0800 Subject: [PATCH 01/14] bpf: Fix memory leaks in __check_func_call mainline inclusion from mainline-v6.1-rc6 commit eb86559a691cea5fa63e57a03ec3dc9c31e97955 category: bugfix issue: #ICG2DK CVE: CVE-2022-49837 Signed-off-by: Tengda Wu --------------------------------------- kmemleak reports this issue: unreferenced object 0xffff88817139d000 (size 2048): comm "test_progs", pid 33246, jiffies 4307381979 (age 45851.820s) hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000045f075f0>] kmalloc_trace+0x27/0xa0 [<0000000098b7c90a>] __check_func_call+0x316/0x1230 [<00000000b4c3c403>] check_helper_call+0x172e/0x4700 [<00000000aa3875b7>] do_check+0x21d8/0x45e0 [<000000001147357b>] do_check_common+0x767/0xaf0 [<00000000b5a595b4>] bpf_check+0x43e3/0x5bc0 [<0000000011e391b1>] bpf_prog_load+0xf26/0x1940 [<0000000007f765c0>] __sys_bpf+0xd2c/0x3650 [<00000000839815d6>] __x64_sys_bpf+0x75/0xc0 [<00000000946ee250>] do_syscall_64+0x3b/0x90 [<0000000000506b7f>] entry_SYSCALL_64_after_hwframe+0x63/0xcd The root case here is: In function prepare_func_exit(), the callee is not released in the abnormal scenario after "state->curframe--;". To fix, move "state->curframe--;" to the very bottom of the function, right when we free callee and reset frame[] pointer to NULL, as Andrii suggested. In addition, function __check_func_call() has a similar problem. In the abnormal scenario before "state->curframe++;", the callee also should be released by free_func_state(). Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Fixes: fd978bf7fd31 ("bpf: Add reference tracking to verifier") Signed-off-by: Wang Yufen Link: https://lore.kernel.org/r/1667884291-15666-1-git-send-email-wangyufen@huawei.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2780ccb4d10e..f063ab2058d2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5421,7 +5421,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Transfer references to the callee */ err = transfer_reference_state(callee, caller); if (err) - return err; + goto err_out; /* copy r1 - r5 args that callee can access. The copy includes parent * pointers, which connects us up to the liveness chain @@ -5444,6 +5444,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, print_verifier_state(env, callee); } return 0; + +err_out: + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return err; } static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) @@ -5466,8 +5471,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return -EINVAL; } - state->curframe--; - caller = state->frame[state->curframe]; + caller = state->frame[state->curframe - 1]; /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; @@ -5485,7 +5489,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* clear everything in the callee */ free_func_state(callee); - state->frame[state->curframe + 1] = NULL; + state->frame[state->curframe--] = NULL; return 0; } -- Gitee From dbd658245d7e1920d1122457afa76cf8fb9d9470 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 24 Feb 2025 09:55:14 -0800 Subject: [PATCH 02/14] bpf: Fix kmemleak warning for percpu hashmap mainline inclusion from mainline-v6.15-rc1 commit 11ba7ce076e5903e7bdc1fd1498979c331b3c286 category: bugfix issue: #ICG2DK CVE: CVE-2025-37807 Signed-off-by: Tengda Wu --------------------------------------- Vlad Poenaru reported the following kmemleak issue: unreferenced object 0x606fd7c44ac8 (size 32): backtrace (crc 0): pcpu_alloc_noprof+0x730/0xeb0 bpf_map_alloc_percpu+0x69/0xc0 prealloc_init+0x9d/0x1b0 htab_map_alloc+0x363/0x510 map_create+0x215/0x3a0 __sys_bpf+0x16b/0x3e0 __x64_sys_bpf+0x18/0x20 do_syscall_64+0x7b/0x150 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Further investigation shows the reason is due to not 8-byte aligned store of percpu pointer in htab_elem_set_ptr(): *(void __percpu **)(l->key + key_size) = pptr; Note that the whole htab_elem alignment is 8 (for x86_64). If the key_size is 4, that means pptr is stored in a location which is 4 byte aligned but not 8 byte aligned. In mm/kmemleak.c, scan_block() scans the memory based on 8 byte stride, so it won't detect above pptr, hence reporting the memory leak. In htab_map_alloc(), we already have htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) htab->elem_size += sizeof(void *); else htab->elem_size += round_up(htab->map.value_size, 8); So storing pptr with 8-byte alignment won't cause any problem and can fix kmemleak too. The issue can be reproduced with bpf selftest as well: 1. Enable CONFIG_DEBUG_KMEMLEAK config 2. Add a getchar() before skel destroy in test_hash_map() in prog_tests/for_each.c. The purpose is to keep map available so kmemleak can be detected. 3. run './test_progs -t for_each/hash_map &' and a kmemleak should be reported. Reported-by: Vlad Poenaru Signed-off-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250224175514.2207227-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d05614a2bdc7..c8cf43e38d4f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -184,12 +184,12 @@ static bool htab_is_percpu(const struct bpf_htab *htab) static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + key_size) = pptr; + *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + key_size); + return *(void __percpu **)(l->key + roundup(key_size, 8)); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) -- Gitee From 345dfda8987bedb40be2235b59b6262cccee88bb Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Tue, 25 Feb 2025 10:14:40 +0800 Subject: [PATCH 03/14] PCI: Fix reference leak in pci_register_host_bridge() stable inclusion from stable-v5.10.237 commit f4db1b2c9ae3d013733c302ee70cac943b7070c0 category: bugfix issue: #ICG2DK CVE: CVE-2025-37836 Signed-off-by: Tengda Wu --------------------------------------- [ Upstream commit 804443c1f27883926de94c849d91f5b7d7d696e9 ] If device_register() fails, call put_device() to give up the reference to avoid a memory leak, per the comment at device_register(). Found by code review. Link: https://lore.kernel.org/r/20250225021440.3130264-1-make24@iscas.ac.cn Fixes: 37d6a0a6f470 ("PCI: Add pci_register_host_bridge() interface") Signed-off-by: Ma Ke [bhelgaas: squash Dan Carpenter's double free fix from https://lore.kernel.org/r/db806a6c-a91b-4e5a-a84b-6b7e01bdac85@stanley.mountain] Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- drivers/pci/probe.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0187fe60b7d8..7cab30345acf 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -881,6 +881,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) resource_size_t offset; LIST_HEAD(resources); struct resource *res; + bool bus_registered = false; char addr[64], *fmt; const char *name; int err; @@ -937,6 +938,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) name = dev_name(&bus->dev); err = device_register(&bus->dev); + bus_registered = true; if (err) goto unregister; @@ -996,7 +998,11 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) device_del(&bridge->dev); free: - kfree(bus); + if (bus_registered) + put_device(&bus->dev); + else + kfree(bus); + return err; } -- Gitee From 39d84661739ecf99178c18b7c19b59a1e71b88a2 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 7 Nov 2022 19:04:50 +0800 Subject: [PATCH 04/14] tracing: Fix memory leak in tracing_read_pipe() mainline inclusion from mainline-v6.1-rc6 commit 649e72070cbbb8600eb823833e4748f5a0815116 category: bugfix issue: #ICG2DK CVE: CVE-2022-49801 Signed-off-by: Tengda Wu --------------------------------------- kmemleak reports this issue: unreferenced object 0xffff888105a18900 (size 128): comm "test_progs", pid 18933, jiffies 4336275356 (age 22801.766s) hex dump (first 32 bytes): 25 73 00 90 81 88 ff ff 26 05 00 00 42 01 58 04 %s......&...B.X. 03 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000560143a1>] __kmalloc_node_track_caller+0x4a/0x140 [<000000006af00822>] krealloc+0x8d/0xf0 [<00000000c309be6a>] trace_iter_expand_format+0x99/0x150 [<000000005a53bdb6>] trace_check_vprintf+0x1e0/0x11d0 [<0000000065629d9d>] trace_event_printf+0xb6/0xf0 [<000000009a690dc7>] trace_raw_output_bpf_trace_printk+0x89/0xc0 [<00000000d22db172>] print_trace_line+0x73c/0x1480 [<00000000cdba76ba>] tracing_read_pipe+0x45c/0x9f0 [<0000000015b58459>] vfs_read+0x17b/0x7c0 [<000000004aeee8ed>] ksys_read+0xed/0x1c0 [<0000000063d3d898>] do_syscall_64+0x3b/0x90 [<00000000a06dda7f>] entry_SYSCALL_64_after_hwframe+0x63/0xcd iter->fmt alloced in tracing_read_pipe() -> .. ->trace_iter_expand_format(), but not freed, to fix, add free in tracing_release_pipe() Link: https://lkml.kernel.org/r/1667819090-4643-1-git-send-email-wangyufen@huawei.com Cc: stable@vger.kernel.org Fixes: efbbdaa22bb7 ("tracing: Show real address for trace event arguments") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Wang Yufen Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ff88eb2850d..518abcd89296 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6389,6 +6389,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); + kfree(iter->fmt); kfree(iter->temp); mutex_destroy(&iter->mutex); kfree(iter); -- Gitee From 7190159998deec3e02621ac47e6736c368deea06 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 5 May 2025 21:58:04 +0200 Subject: [PATCH 05/14] bpf: Scrub packet on bpf_redirect_peer mainline inclusion from mainline-v6.15-rc6 commit c4327229948879814229b46aa26a750718888503 category: bugfix issue: #ICG2DK CVE: CVE-2025-37959 Signed-off-by: Tengda Wu --------------------------------------- When bpf_redirect_peer is used to redirect packets to a device in another network namespace, the skb isn't scrubbed. That can lead skb information from one namespace to be "misused" in another namespace. As one example, this is causing Cilium to drop traffic when using bpf_redirect_peer to redirect packets that just went through IPsec decryption to a container namespace. The following pwru trace shows (1) the packet path from the host's XFRM layer to the container's XFRM layer where it's dropped and (2) the number of active skb extensions at each function. NETNS MARK IFACE TUPLE FUNC 4026533547 d00 eth0 10.244.3.124:35473->10.244.2.158:53 xfrm_rcv_cb .active_extensions = (__u8)2, 4026533547 d00 eth0 10.244.3.124:35473->10.244.2.158:53 xfrm4_rcv_cb .active_extensions = (__u8)2, 4026533547 d00 eth0 10.244.3.124:35473->10.244.2.158:53 gro_cells_receive .active_extensions = (__u8)2, [...] 4026533547 0 eth0 10.244.3.124:35473->10.244.2.158:53 skb_do_redirect .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 ip_rcv .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 ip_rcv_core .active_extensions = (__u8)2, [...] 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 udp_queue_rcv_one_skb .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 __xfrm_policy_check .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 __xfrm_decode_session .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 security_xfrm_decode_session .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 kfree_skb_reason(SKB_DROP_REASON_XFRM_POLICY) .active_extensions = (__u8)2, In this case, there are no XFRM policies in the container's network namespace so the drop is unexpected. When we decrypt the IPsec packet, the XFRM state used for decryption is set in the skb extensions. This information is preserved across the netns switch. When we reach the XFRM policy check in the container's netns, __xfrm_policy_check drops the packet with LINUX_MIB_XFRMINNOPOLS because a (container-side) XFRM policy can't be found that matches the (host-side) XFRM state used for decryption. This patch fixes this by scrubbing the packet when using bpf_redirect_peer, as is done on typical netns switches via veth devices except skb->mark and skb->tstamp are not zeroed. Fixes: 9aa1206e8f482 ("bpf: Add redirect_peer helper") Signed-off-by: Paul Chaignon Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/1728ead5e0fe45e7a6542c36bd4e3ca07a73b7d6.1746460653.git.paul.chaignon@gmail.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index 6322613a421d..7b4cf9537f2d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2500,6 +2500,7 @@ int skb_do_redirect(struct sk_buff *skb) net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; + skb_scrub_packet(skb, false); return -EAGAIN; } return flags & BPF_F_NEIGH ? -- Gitee From 959c3a8f24cfe7011e37a889bcdeb172af79cc55 Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Sat, 12 Apr 2025 17:25:54 +0800 Subject: [PATCH 06/14] block: fix resource leak in blk_register_queue() error path mainline inclusion from mainline-v6.15-rc3 commit 40f2eb9b531475dd01b683fdaf61ca3cfd03a51e category: bugfix issue: #ICG2DK CVE: CVE-2025-37980 Signed-off-by: Tengda Wu --------------------------------------- When registering a queue fails after blk_mq_sysfs_register() is successful but the function later encounters an error, we need to clean up the blk_mq_sysfs resources. Add the missing blk_mq_sysfs_unregister() call in the error path to properly clean up these resources and prevent a memory leak. Fixes: 320ae51feed5 ("blk-mq: new multi-queue block IO queueing mechanism") Signed-off-by: Zheng Qixing Reviewed-by: Christoph Hellwig Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250412092554.475218-1-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8e0827c30c46..91fd9ee7fe22 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -892,6 +892,8 @@ int blk_register_queue(struct gendisk *disk) if (ret) { mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); + if (queue_is_mq(q)) + blk_mq_unregister_dev(dev, q); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); -- Gitee From 64b32c738757ee523b37859a2892a5caada18789 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 13 Mar 2022 21:15:02 -1000 Subject: [PATCH 07/14] block: fix rq-qos breakage from skipping rq_qos_done_bio() mainline inclusion from mainline-v5.18-rc1 commit aa1b46dcdc7baaf5fec0be25782ef24b26aa209e category: bugfix issue: #ICG2DK CVE: CVE-2022-49266 Signed-off-by: Tengda Wu --------------------------------------- a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked") made bio_endio() skip rq_qos_done_bio() if BIO_TRACKED is not set. While this fixed a potential oops, it also broke blk-iocost by skipping the done_bio callback for merged bios. Before, whether a bio goes through rq_qos_throttle() or rq_qos_merge(), rq_qos_done_bio() would be called on the bio on completion with BIO_TRACKED distinguishing the former from the latter. rq_qos_done_bio() is not called for bios which wenth through rq_qos_merge(). This royally confuses blk-iocost as the merged bios never finish and are considered perpetually in-flight. One reliably reproducible failure mode is an intermediate cgroup geting stuck active preventing its children from being activated due to the leaf-only rule, leading to loss of control. The following is from resctl-bench protection scenario which emulates isolating a web server like workload from a memory bomb run on an iocost configuration which should yield a reasonable level of protection. # cat /sys/block/nvme2n1/device/model Samsung SSD 970 PRO 512GB # cat /sys/fs/cgroup/io.cost.model 259:0 ctrl=user model=linear rbps=834913556 rseqiops=93622 rrandiops=102913 wbps=618985353 wseqiops=72325 wrandiops=71025 # cat /sys/fs/cgroup/io.cost.qos 259:0 enable=1 ctrl=user rpct=95.00 rlat=18776 wpct=95.00 wlat=8897 min=60.00 max=100.00 # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1 ... Memory Hog Summary ================== IO Latency: R p50=242u:336u/2.5m p90=794u:1.4m/7.5m p99=2.7m:8.0m/62.5m max=8.0m:36.4m/350m W p50=221u:323u/1.5m p90=709u:1.2m/5.5m p99=1.5m:2.5m/9.5m max=6.9m:35.9m/350m Isolation and Request Latency Impact Distributions: min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev isol% 15.90 15.90 15.90 40.05 57.24 59.07 60.01 74.63 74.63 90.35 90.35 58.12 15.82 lat-imp% 0 0 0 0 0 4.55 14.68 15.54 233.5 548.1 548.1 53.88 143.6 Result: isol=58.12:15.82% lat_imp=53.88%:143.6 work_csv=100.0% missing=3.96% The isolation result of 58.12% is close to what this device would show without any IO control. Fix it by introducing a new flag BIO_QOS_MERGED to mark merged bios and calling rq_qos_done_bio() on them too. For consistency and clarity, rename BIO_TRACKED to BIO_QOS_THROTTLED. The flag checks are moved into rq_qos_done_bio() so that it's next to the code paths that set the flags. With the patch applied, the above same benchmark shows: # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1 ... Memory Hog Summary ================== IO Latency: R p50=123u:84.4u/985u p90=322u:256u/2.5m p99=1.6m:1.4m/9.5m max=11.1m:36.0m/350m W p50=429u:274u/995u p90=1.7m:1.3m/4.5m p99=3.4m:2.7m/11.5m max=7.9m:5.9m/26.5m Isolation and Request Latency Impact Distributions: min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev isol% 84.91 84.91 89.51 90.73 92.31 94.49 96.36 98.04 98.71 100.0 100.0 94.42 2.81 lat-imp% 0 0 0 0 0 2.81 5.73 11.11 13.92 17.53 22.61 4.10 4.68 Result: isol=94.42:2.81% lat_imp=4.10%:4.68 work_csv=58.34% missing=0% Signed-off-by: Tejun Heo Fixes: a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked") Cc: stable@vger.kernel.org # v5.15+ Cc: Ming Lei Cc: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/Yi7rdrzQEHjJLGKB@slm.duckdns.org Signed-off-by: Jens Axboe --- block/bio.c | 3 +-- block/blk-iolatency.c | 2 +- block/blk-rq-qos.h | 23 +++++++++++++---------- include/linux/blk_types.h | 3 ++- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3caf222b50d5..fdaf42db55af 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1431,8 +1431,7 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk && bio_flagged(bio, BIO_TRACKED)) - rq_qos_done_bio(bio->bi_disk->queue, bio); + rq_qos_done_bio(bio); /* * Need to have a real endio function for chained bios, otherwise diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 74511a060d59..5fd3f3a8f88c 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -601,7 +601,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) int inflight = 0; blkg = bio->bi_blkg; - if (!blkg || !bio_flagged(bio, BIO_TRACKED)) + if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED)) return; iolat = blkg_to_lat(bio->bi_blkg); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2bcb3495e376..db83d01b62d0 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -189,21 +189,22 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) __rq_qos_requeue(q->rq_qos, rq); } -static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +static inline void rq_qos_done_bio(struct bio *bio) { - if (q->rq_qos) - __rq_qos_done_bio(q->rq_qos, bio); + if (bio->bi_disk && (bio_flagged(bio, BIO_QOS_THROTTLED) || + bio_flagged(bio, BIO_QOS_MERGED))) { + struct request_queue *q = bio->bi_disk->queue; + if (q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); + } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - /* - * BIO_TRACKED lets controllers know that a bio went through the - * normal rq_qos path. - */ - bio_set_flag(bio, BIO_TRACKED); - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); + } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, @@ -216,8 +217,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq, static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); + } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..afe45c26501a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -283,7 +283,8 @@ enum { BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ - BIO_TRACKED, /* set if bio goes through the rq_qos path */ + BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ + BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_FLAG_LAST }; -- Gitee From fe28810789ebe71d6cf210e97730df2a861feeec Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 18 Mar 2025 13:22:55 +0530 Subject: [PATCH 08/14] ext4: define ext4_journal_destroy wrapper mainline inclusion from mainline-v6.15-rc1 commit 5a02a6204ca37e7c22fbb55a789c503f05e8e89a category: bugfix issue: #ICG2DK CVE: CVE-2025-22113 Signed-off-by: Tengda Wu --------------------------------------- Define an ext4 wrapper over jbd2_journal_destroy to make sure we have consistent behavior during journal destruction. This will also come useful in the next patch where we add some ext4 specific logic in the destroy path. Reviewed-by: Jan Kara Reviewed-by: Baokun Li Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/c3ba78c5c419757e6d5f2d8ebb4a8ce9d21da86a.1742279837.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.h | 14 ++++++++++++++ fs/ext4/super.c | 13 +++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b9881ee1bf93..2a56017df396 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -510,4 +510,18 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 1; } +/* + * Pass journal explicitly as it may not be cached in the sbi->s_journal in some + * cases + */ +static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal) +{ + int err = 0; + + err = jbd2_journal_destroy(journal); + sbi->s_journal = NULL; + + return err; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8919c5d1a9cb..413765841d6f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1189,8 +1189,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } @@ -5170,8 +5169,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_journal) { /* flush s_error_work before journal destroy. */ flush_work(&sbi->s_error_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); @@ -5384,7 +5382,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, return journal; out_journal: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: ext4_blkdev_put(bdev); return NULL; @@ -5484,8 +5482,7 @@ static int ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { - EXT4_SB(sb)->s_journal = NULL; - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -5503,7 +5500,7 @@ static int ext4_load_journal(struct super_block *sb, return 0; err_out: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } -- Gitee From 22326c501b81ca15793ae8ae73f3a5ea2946526d Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 18 Mar 2025 13:22:56 +0530 Subject: [PATCH 09/14] ext4: avoid journaling sb update on error if journal is destroying mainline inclusion from mainline-v6.15-rc1 commit ce2f26e73783b4a7c46a86e3af5b5c8de0971790 category: bugfix issue: #ICG2DK CVE: CVE-2025-22113 Signed-off-by: Tengda Wu --------------------------------------- Presently we always BUG_ON if trying to start a transaction on a journal marked with JBD2_UNMOUNT, since this should never happen. However, while ltp running stress tests, it was observed that in case of some error handling paths, it is possible for update_super_work to start a transaction after the journal is destroyed eg: (umount) ext4_kill_sb kill_block_super generic_shutdown_super sync_filesystem /* commits all txns */ evict_inodes /* might start a new txn */ ext4_put_super flush_work(&sbi->s_sb_upd_work) /* flush the workqueue */ jbd2_journal_destroy journal_kill_thread journal->j_flags |= JBD2_UNMOUNT; jbd2_journal_commit_transaction jbd2_journal_get_descriptor_buffer jbd2_journal_bmap ext4_journal_bmap ext4_map_blocks ... ext4_inode_error ext4_handle_error schedule_work(&sbi->s_sb_upd_work) /* work queue kicks in */ update_super_work jbd2_journal_start start_this_handle BUG_ON(journal->j_flags & JBD2_UNMOUNT) Hence, introduce a new mount flag to indicate journal is destroying and only do a journaled (and deferred) update of sb if this flag is not set. Otherwise, just fallback to an un-journaled commit. Further, in the journal destroy path, we have the following sequence: 1. Set mount flag indicating journal is destroying 2. force a commit and wait for it 3. flush pending sb updates This sequence is important as it ensures that, after this point, there is no sb update that might be journaled so it is safe to update the sb outside the journal. (To avoid race discussed in 2d01ddc86606) Also, we don't need a similar check in ext4_grp_locked_error since it is only called from mballoc and AFAICT it would be always valid to schedule work here. Fixes: 2d01ddc86606 ("ext4: save error info to sb through journal if available") Reported-by: Mahesh Kumar Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://patch.msgid.link/9613c465d6ff00cd315602f99283d5f24018c3f7.1742279837.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++- fs/ext4/ext4_jbd2.h | 15 +++++++++++++++ fs/ext4/super.c | 14 +++++++++----- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e3bb368a1559..1399b7cd6803 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1710,9 +1710,10 @@ enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FS_ABORTED, /* Fatal error detected */ EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ - EXT4_MF_FC_COMMITTING /* File system underoing a fast + EXT4_MF_FC_COMMITTING, /* File system underoing a fast * commit. */ + EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 2a56017df396..53f81ca2b076 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -518,6 +518,21 @@ static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *jour { int err = 0; + /* + * At this point only two things can be operating on the journal. + * JBD2 thread performing transaction commit and s_sb_upd_work + * issuing sb update through the journal. Once we set + * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not + * queue s_sb_upd_work and ext4_force_commit() makes sure any + * ext4_handle_error() calls from the running transaction commit are + * finished. Hence no new s_sb_upd_work can be queued after we + * flush it here. + */ + ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); + + ext4_force_commit(sbi->s_sb); + flush_work(&sbi->s_error_work); + err = jbd2_journal_destroy(journal); sbi->s_journal = NULL; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 413765841d6f..cf197f3791eb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -653,9 +653,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we - * defer superblock flushing to a workqueue. + * defer superblock flushing to a workqueue. We just need to be + * careful when the journal is already shutting down. If we get + * here in that case, just update the sb directly as the last + * transaction won't commit anyway. */ - if (continue_fs && journal) + if (continue_fs && journal && + !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) schedule_work(&EXT4_SB(sb)->s_error_work); else ext4_commit_super(sb); @@ -1184,7 +1188,6 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); - flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { @@ -1193,7 +1196,9 @@ static void ext4_put_super(struct super_block *sb) if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } - } + } else + flush_work(&sbi->s_error_work); + ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); @@ -5168,7 +5173,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_journal) { /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: -- Gitee From 2dd90fbc84220894f07ed892ddfcd1338f0b7eb4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 6 Dec 2024 12:40:11 +0000 Subject: [PATCH 10/14] net: stmmac: fix TSO DMA API usage causing oops mainline inclusion from mainline-v6.13-rc3 commit 4c49f38e20a57f8abaebdf95b369295b153d1f8e category: bugfix issue: #ICG2DK CVE: CVE-2024-56719 Signed-off-by: Tengda Wu --------------------------------------- Commit 66600fac7a98 ("net: stmmac: TSO: Fix unbalanced DMA map/unmap for non-paged SKB data") moved the assignment of tx_skbuff_dma[]'s members to be later in stmmac_tso_xmit(). The buf (dma cookie) and len stored in this structure are passed to dma_unmap_single() by stmmac_tx_clean(). The DMA API requires that the dma cookie passed to dma_unmap_single() is the same as the value returned from dma_map_single(). However, by moving the assignment later, this is not the case when priv->dma_cap.addr64 > 32 as "des" is offset by proto_hdr_len. This causes problems such as: dwc-eth-dwmac 2490000.ethernet eth0: Tx DMA map failed and with DMA_API_DEBUG enabled: DMA-API: dwc-eth-dwmac 2490000.ethernet: device driver tries to +free DMA memory it has not allocated [device address=0x000000ffffcf65c0] [size=66 bytes] Fix this by maintaining "des" as the original DMA cookie, and use tso_des to pass the offset DMA cookie to stmmac_tso_allocator(). Full details of the crashes can be found at: https://lore.kernel.org/all/d8112193-0386-4e14-b516-37c2d838171a@nvidia.com/ https://lore.kernel.org/all/klkzp5yn5kq5efgtrow6wbvnc46bcqfxs65nz3qy77ujr5turc@bwwhelz2l4dw/ Reported-by: Jon Hunter Reported-by: Thierry Reding Fixes: 66600fac7a98 ("net: stmmac: TSO: Fix unbalanced DMA map/unmap for non-paged SKB data") Tested-by: Jon Hunter Signed-off-by: Russell King (Oracle) Reviewed-by: Furong Xu <0x1207@gmail.com> Link: https://patch.msgid.link/E1tJXcx-006N4Z-PC@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 650c7d4be82f..96ac731fd478 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3180,9 +3180,9 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) unsigned int first_entry, tx_packets; struct stmmac_tx_queue *tx_q; bool has_vlan, set_ic; + dma_addr_t tso_des, des; u8 proto_hdr_len, hdr; u32 pay_len, mss; - dma_addr_t des; int i; tx_q = &priv->tx_queue[queue]; @@ -3269,14 +3269,15 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) /* If needed take extra descriptors to fill the remaining payload */ tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE; + tso_des = des; } else { stmmac_set_desc_addr(priv, first, des); tmp_pay_len = pay_len; - des += proto_hdr_len; + tso_des = des + proto_hdr_len; pay_len = 0; } - stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue); + stmmac_tso_allocator(priv, tso_des, tmp_pay_len, (nfrags == 0), queue); /* Prepare fragments */ for (i = 0; i < nfrags; i++) { -- Gitee From 411032872c69717886f89692cf18f4f873c5b7fa Mon Sep 17 00:00:00 2001 From: Hyeong-Jun Kim Date: Tue, 2 Nov 2021 16:10:02 +0900 Subject: [PATCH 11/14] f2fs: invalidate META_MAPPING before IPU/DIO write mainline inclusion from mainline-v5.16-rc1 commit e3b49ea36802053f312013fd4ccb6e59920a9f76 category: bugfix issue: #ICG2DK CVE: CVE-2024-26869 Signed-off-by: May27May --------------------------------------- Encrypted pages during GC are read and cached in META_MAPPING. However, due to cached pages in META_MAPPING, there is an issue where newly written pages are lost by IPU or DIO writes. Thread A - f2fs_gc() Thread B /* phase 3 */ down_write(i_gc_rwsem) ra_data_block() ---- (a) up_write(i_gc_rwsem) f2fs_direct_IO() : - down_read(i_gc_rwsem) - __blockdev_direct_io() - get_data_block_dio_write() - f2fs_dio_submit_bio() ---- (b) - up_read(i_gc_rwsem) /* phase 4 */ down_write(i_gc_rwsem) move_data_block() ---- (c) up_write(i_gc_rwsem) (a) In phase 3 of f2fs_gc(), up-to-date page is read from storage and cached in META_MAPPING. (b) In thread B, writing new data by IPU or DIO write on same blkaddr as read in (a). cached page in META_MAPPING become out-dated. (c) In phase 4 of f2fs_gc(), out-dated page in META_MAPPING is copied to new blkaddr. In conclusion, the newly written data in (b) is lost. To address this issue, invalidating pages in META_MAPPING before IPU or DIO write. Fixes: 6aa58d8ad20a ("f2fs: readahead encrypted block during GC") Signed-off-by: Hyeong-Jun Kim Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: May27May Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 5 ++++- fs/f2fs/segment.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fd8722da9c4f..709323fb5a02 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1719,9 +1719,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, sync_out: /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + invalidate_mapping_pages(META_MAPPING(sbi), + map->m_pblk, map->m_pblk); + } if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2a54d7b89dd6..73fffb11d42c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3725,6 +3725,9 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) return -EFSCORRUPTED; } + invalidate_mapping_pages(META_MAPPING(sbi), + fio->new_blkaddr, fio->new_blkaddr); + stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) -- Gitee From 72854375427192522338a89fa75186f5d9ee97c0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Jul 2022 14:30:15 +0800 Subject: [PATCH 12/14] f2fs: fix to invalidate META_MAPPING before DIO write mainline inclusion from mainline-v6.0-rc1 commit 67ca06872eb02944b4c6f92cffa9242e92c63109 category: bugfix issue: #ICG2DK CVE: CVE-2024-26869 Signed-off-by: May27May --------------------------------------- Quoted from commit e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write") " Encrypted pages during GC are read and cached in META_MAPPING. However, due to cached pages in META_MAPPING, there is an issue where newly written pages are lost by IPU or DIO writes. Thread A - f2fs_gc() Thread B /* phase 3 */ down_write(i_gc_rwsem) ra_data_block() ---- (a) up_write(i_gc_rwsem) f2fs_direct_IO() : - down_read(i_gc_rwsem) - __blockdev_direct_io() - get_data_block_dio_write() - f2fs_dio_submit_bio() ---- (b) - up_read(i_gc_rwsem) /* phase 4 */ down_write(i_gc_rwsem) move_data_block() ---- (c) up_write(i_gc_rwsem) (a) In phase 3 of f2fs_gc(), up-to-date page is read from storage and cached in META_MAPPING. (b) In thread B, writing new data by IPU or DIO write on same blkaddr as read in (a). cached page in META_MAPPING become out-dated. (c) In phase 4 of f2fs_gc(), out-dated page in META_MAPPING is copied to new blkaddr. In conclusion, the newly written data in (b) is lost. To address this issue, invalidating pages in META_MAPPING before IPU or DIO write. " In previous commit, we missed to cover extent cache hit case, and passed wrong value for parameter @end of invalidate_mapping_pages(), fix both issues. Fixes: 6aa58d8ad20a ("f2fs: readahead encrypted block during GC") Fixes: e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write") Cc: Hyeong-Jun Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: May27May Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 709323fb5a02..0f33084e149e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1540,9 +1540,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, *map->m_next_extent = pgofs + map->m_len; /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO) + if (flag == F2FS_GET_BLOCK_DIO) { f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + invalidate_mapping_pages(META_MAPPING(sbi), + map->m_pblk, map->m_pblk + map->m_len - 1); + } goto out; } @@ -1723,7 +1726,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); invalidate_mapping_pages(META_MAPPING(sbi), - map->m_pblk, map->m_pblk); + map->m_pblk, map->m_pblk + map->m_len - 1); } if (flag == F2FS_GET_BLOCK_PRECACHE) { -- Gitee From c9240fdc752ae8819fd8d69634a6ce8fc57f3300 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Jul 2022 23:26:43 +0800 Subject: [PATCH 13/14] f2fs: invalidate meta pages only for post_read required inode mainline inclusion from mainline-v6.0-rc1 commit 0d5b9d8156396bbe1c982708b38ab9e188c45ec9 category: bugfix issue: #ICG2DK CVE: CVE-2024-26869 Signed-off-by: May27May --------------------------------------- After commit e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write"), invalidate_mapping_pages() will be called to avoid race condition in between IPU/DIO and readahead for GC. However, readahead flow is only used for post_read required inode, so this patch adds check condition to avoids unnecessary page cache invalidating for non-post_read inode. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: May27May Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 11 +++-------- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 9 ++++++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0f33084e149e..8b8981d426cc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1540,12 +1540,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, *map->m_next_extent = pgofs + map->m_len; /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO) { + if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); - invalidate_mapping_pages(META_MAPPING(sbi), - map->m_pblk, map->m_pblk + map->m_len - 1); - } goto out; } @@ -1722,12 +1719,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, sync_out: /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); - invalidate_mapping_pages(META_MAPPING(sbi), - map->m_pblk, map->m_pblk + map->m_len - 1); - } if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { @@ -2809,6 +2803,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .submitted = false, .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, + .post_read = f2fs_post_read_required(inode), .io_type = io_type, .io_wbc = wbc, .bio = bio, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 702ac7d98333..82a381d62252 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1143,6 +1143,7 @@ struct f2fs_io_info { bool retry; /* need to reallocate block address */ int compr_blocks; /* # of compressed block addresses */ bool encrypted; /* indicate file is encrypted */ + bool post_read; /* require post read */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 73fffb11d42c..7a7cc9630362 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3725,7 +3725,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) return -EFSCORRUPTED; } - invalidate_mapping_pages(META_MAPPING(sbi), + if (fio->post_read) + invalidate_mapping_pages(META_MAPPING(sbi), fio->new_blkaddr, fio->new_blkaddr); stat_inc_inplace_blocks(fio->sbi); @@ -3894,10 +3895,16 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t i; + if (!f2fs_post_read_required(inode)) + return; + for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); + + invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) -- Gitee From ae29724686d0fa43b9af939c2265e9d29793ebae Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 8 Mar 2024 09:08:34 +0800 Subject: [PATCH 14/14] f2fs: fix to truncate meta inode pages forcely stable inclusion from stable-v6.6.23 commit c92f2927df860a60ba815d3ee610a944b92a8694 category: bugfix issue: #ICG2DK CVE: CVE-2024-26869 Signed-off-by: May27May --------------------------------------- [ Upstream commit 9f0c4a46be1fe9b97dbe66d49204c1371e3ece65 ] Below race case can cause data corruption: Thread A GC thread - gc_data_segment - ra_data_block - locked meta_inode page - f2fs_inplace_write_data - invalidate_mapping_pages : fail to invalidate meta_inode page due to lock failure or dirty|writeback status - f2fs_submit_page_bio : write last dirty data to old blkaddr - move_data_block - load old data from meta_inode page - f2fs_submit_page_write : write old data to new blkaddr Because invalidate_mapping_pages() will skip invalidating page which has unclear status including locked, dirty, writeback and so on, so we need to use truncate_inode_pages_range() instead of invalidate_mapping_pages() to make sure meta_inode page will be dropped. Fixes: 6aa58d8ad20a ("f2fs: readahead encrypted block during GC") Fixes: e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin Signed-off-by: May27May Conflicts: fs/f2fs/f2fs.h --- fs/f2fs/checkpoint.c | 5 +++-- fs/f2fs/f2fs.h | 26 ++++++++++++++++++++++++++ fs/f2fs/segment.c | 5 ++--- include/linux/f2fs_fs.h | 1 + 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8ca549cc975e..8a94bf3611d3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1551,8 +1551,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) */ if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) || f2fs_sb_has_compression(sbi)) - invalidate_mapping_pages(META_MAPPING(sbi), - MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); + f2fs_bug_on(sbi, + invalidate_inode_pages2_range(META_MAPPING(sbi), + MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1)); f2fs_release_ino_entry(sbi, false); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 82a381d62252..80c79159d7b6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4187,6 +4187,32 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) return false; } +static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int cnt) +{ + bool need_submit = false; + int i = 0; + + do { + struct page *page; + + page = find_get_page(META_MAPPING(sbi), blkaddr + i); + if (page) { + if (PageWriteback(page)) + need_submit = true; + f2fs_put_page(page, 0); + } + } while (++i < cnt && !need_submit); + + if (need_submit) + f2fs_submit_merged_write_cond(sbi, sbi->meta_inode, + NULL, 0, DATA); + + truncate_inode_pages_range(META_MAPPING(sbi), + F2FS_BLK_TO_BYTES((loff_t)blkaddr), + F2FS_BLK_END_BYTES((loff_t)(blkaddr + cnt - 1))); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7a7cc9630362..f76cab61c0ad 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3726,8 +3726,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) } if (fio->post_read) - invalidate_mapping_pages(META_MAPPING(sbi), - fio->new_blkaddr, fio->new_blkaddr); + f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1); stat_inc_inplace_blocks(fio->sbi); @@ -3904,7 +3903,7 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); - invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1); + f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index a5dbb57a687f..7fd4bea6be64 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -27,6 +27,7 @@ #define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS) #define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS) +#define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1) /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 -- Gitee