From dfe02050c3ac0102655fd22bebc4c9a581e1f2b7 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Tue, 8 Nov 2022 13:11:31 +0800 Subject: [PATCH 01/17] bpf: Fix memory leaks in __check_func_call mainline inclusion from mainline-v6.1-rc6 commit eb86559a691cea5fa63e57a03ec3dc9c31e97955 category: bugfix issue: #ICG2DK CVE: CVE-2022-49837 Signed-off-by: Tengda Wu --------------------------------------- kmemleak reports this issue: unreferenced object 0xffff88817139d000 (size 2048): comm "test_progs", pid 33246, jiffies 4307381979 (age 45851.820s) hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000045f075f0>] kmalloc_trace+0x27/0xa0 [<0000000098b7c90a>] __check_func_call+0x316/0x1230 [<00000000b4c3c403>] check_helper_call+0x172e/0x4700 [<00000000aa3875b7>] do_check+0x21d8/0x45e0 [<000000001147357b>] do_check_common+0x767/0xaf0 [<00000000b5a595b4>] bpf_check+0x43e3/0x5bc0 [<0000000011e391b1>] bpf_prog_load+0xf26/0x1940 [<0000000007f765c0>] __sys_bpf+0xd2c/0x3650 [<00000000839815d6>] __x64_sys_bpf+0x75/0xc0 [<00000000946ee250>] do_syscall_64+0x3b/0x90 [<0000000000506b7f>] entry_SYSCALL_64_after_hwframe+0x63/0xcd The root case here is: In function prepare_func_exit(), the callee is not released in the abnormal scenario after "state->curframe--;". To fix, move "state->curframe--;" to the very bottom of the function, right when we free callee and reset frame[] pointer to NULL, as Andrii suggested. In addition, function __check_func_call() has a similar problem. In the abnormal scenario before "state->curframe++;", the callee also should be released by free_func_state(). Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Fixes: fd978bf7fd31 ("bpf: Add reference tracking to verifier") Signed-off-by: Wang Yufen Link: https://lore.kernel.org/r/1667884291-15666-1-git-send-email-wangyufen@huawei.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2780ccb4d10e..f063ab2058d2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5421,7 +5421,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Transfer references to the callee */ err = transfer_reference_state(callee, caller); if (err) - return err; + goto err_out; /* copy r1 - r5 args that callee can access. The copy includes parent * pointers, which connects us up to the liveness chain @@ -5444,6 +5444,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, print_verifier_state(env, callee); } return 0; + +err_out: + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return err; } static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) @@ -5466,8 +5471,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return -EINVAL; } - state->curframe--; - caller = state->frame[state->curframe]; + caller = state->frame[state->curframe - 1]; /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; @@ -5485,7 +5489,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* clear everything in the callee */ free_func_state(callee); - state->frame[state->curframe + 1] = NULL; + state->frame[state->curframe--] = NULL; return 0; } -- Gitee From 4d027d7fec68e1efa18ed1fd3511652453c4607e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 24 Feb 2025 09:55:14 -0800 Subject: [PATCH 02/17] bpf: Fix kmemleak warning for percpu hashmap mainline inclusion from mainline-v6.15-rc1 commit 11ba7ce076e5903e7bdc1fd1498979c331b3c286 category: bugfix issue: #ICG2DK CVE: CVE-2025-37807 Signed-off-by: Tengda Wu --------------------------------------- Vlad Poenaru reported the following kmemleak issue: unreferenced object 0x606fd7c44ac8 (size 32): backtrace (crc 0): pcpu_alloc_noprof+0x730/0xeb0 bpf_map_alloc_percpu+0x69/0xc0 prealloc_init+0x9d/0x1b0 htab_map_alloc+0x363/0x510 map_create+0x215/0x3a0 __sys_bpf+0x16b/0x3e0 __x64_sys_bpf+0x18/0x20 do_syscall_64+0x7b/0x150 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Further investigation shows the reason is due to not 8-byte aligned store of percpu pointer in htab_elem_set_ptr(): *(void __percpu **)(l->key + key_size) = pptr; Note that the whole htab_elem alignment is 8 (for x86_64). If the key_size is 4, that means pptr is stored in a location which is 4 byte aligned but not 8 byte aligned. In mm/kmemleak.c, scan_block() scans the memory based on 8 byte stride, so it won't detect above pptr, hence reporting the memory leak. In htab_map_alloc(), we already have htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) htab->elem_size += sizeof(void *); else htab->elem_size += round_up(htab->map.value_size, 8); So storing pptr with 8-byte alignment won't cause any problem and can fix kmemleak too. The issue can be reproduced with bpf selftest as well: 1. Enable CONFIG_DEBUG_KMEMLEAK config 2. Add a getchar() before skel destroy in test_hash_map() in prog_tests/for_each.c. The purpose is to keep map available so kmemleak can be detected. 3. run './test_progs -t for_each/hash_map &' and a kmemleak should be reported. Reported-by: Vlad Poenaru Signed-off-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250224175514.2207227-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d05614a2bdc7..c8cf43e38d4f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -184,12 +184,12 @@ static bool htab_is_percpu(const struct bpf_htab *htab) static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + key_size) = pptr; + *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + key_size); + return *(void __percpu **)(l->key + roundup(key_size, 8)); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) -- Gitee From 91d0f90e309a48db192e33ebc1b2eb0e1e366ce2 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Tue, 25 Feb 2025 10:14:40 +0800 Subject: [PATCH 03/17] PCI: Fix reference leak in pci_register_host_bridge() stable inclusion from stable-v5.10.237 commit f4db1b2c9ae3d013733c302ee70cac943b7070c0 category: bugfix issue: #ICG2DK CVE: CVE-2025-37836 Signed-off-by: Tengda Wu --------------------------------------- [ Upstream commit 804443c1f27883926de94c849d91f5b7d7d696e9 ] If device_register() fails, call put_device() to give up the reference to avoid a memory leak, per the comment at device_register(). Found by code review. Link: https://lore.kernel.org/r/20250225021440.3130264-1-make24@iscas.ac.cn Fixes: 37d6a0a6f470 ("PCI: Add pci_register_host_bridge() interface") Signed-off-by: Ma Ke [bhelgaas: squash Dan Carpenter's double free fix from https://lore.kernel.org/r/db806a6c-a91b-4e5a-a84b-6b7e01bdac85@stanley.mountain] Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- drivers/pci/probe.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0187fe60b7d8..7cab30345acf 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -881,6 +881,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) resource_size_t offset; LIST_HEAD(resources); struct resource *res; + bool bus_registered = false; char addr[64], *fmt; const char *name; int err; @@ -937,6 +938,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) name = dev_name(&bus->dev); err = device_register(&bus->dev); + bus_registered = true; if (err) goto unregister; @@ -996,7 +998,11 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) device_del(&bridge->dev); free: - kfree(bus); + if (bus_registered) + put_device(&bus->dev); + else + kfree(bus); + return err; } -- Gitee From 99999c30989541951cbac7ca731426d6ce2feaea Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 7 Nov 2022 19:04:50 +0800 Subject: [PATCH 04/17] tracing: Fix memory leak in tracing_read_pipe() mainline inclusion from mainline-v6.1-rc6 commit 649e72070cbbb8600eb823833e4748f5a0815116 category: bugfix issue: #ICG2DK CVE: CVE-2022-49801 Signed-off-by: Tengda Wu --------------------------------------- kmemleak reports this issue: unreferenced object 0xffff888105a18900 (size 128): comm "test_progs", pid 18933, jiffies 4336275356 (age 22801.766s) hex dump (first 32 bytes): 25 73 00 90 81 88 ff ff 26 05 00 00 42 01 58 04 %s......&...B.X. 03 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000560143a1>] __kmalloc_node_track_caller+0x4a/0x140 [<000000006af00822>] krealloc+0x8d/0xf0 [<00000000c309be6a>] trace_iter_expand_format+0x99/0x150 [<000000005a53bdb6>] trace_check_vprintf+0x1e0/0x11d0 [<0000000065629d9d>] trace_event_printf+0xb6/0xf0 [<000000009a690dc7>] trace_raw_output_bpf_trace_printk+0x89/0xc0 [<00000000d22db172>] print_trace_line+0x73c/0x1480 [<00000000cdba76ba>] tracing_read_pipe+0x45c/0x9f0 [<0000000015b58459>] vfs_read+0x17b/0x7c0 [<000000004aeee8ed>] ksys_read+0xed/0x1c0 [<0000000063d3d898>] do_syscall_64+0x3b/0x90 [<00000000a06dda7f>] entry_SYSCALL_64_after_hwframe+0x63/0xcd iter->fmt alloced in tracing_read_pipe() -> .. ->trace_iter_expand_format(), but not freed, to fix, add free in tracing_release_pipe() Link: https://lkml.kernel.org/r/1667819090-4643-1-git-send-email-wangyufen@huawei.com Cc: stable@vger.kernel.org Fixes: efbbdaa22bb7 ("tracing: Show real address for trace event arguments") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Wang Yufen Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ff88eb2850d..518abcd89296 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6389,6 +6389,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); + kfree(iter->fmt); kfree(iter->temp); mutex_destroy(&iter->mutex); kfree(iter); -- Gitee From 1c942a3eedfac0cd9e331fc1ecaf4d85a95d7265 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 5 May 2025 21:58:04 +0200 Subject: [PATCH 05/17] bpf: Scrub packet on bpf_redirect_peer mainline inclusion from mainline-v6.15-rc6 commit c4327229948879814229b46aa26a750718888503 category: bugfix issue: #ICG2DK CVE: CVE-2025-37959 Signed-off-by: Tengda Wu --------------------------------------- When bpf_redirect_peer is used to redirect packets to a device in another network namespace, the skb isn't scrubbed. That can lead skb information from one namespace to be "misused" in another namespace. As one example, this is causing Cilium to drop traffic when using bpf_redirect_peer to redirect packets that just went through IPsec decryption to a container namespace. The following pwru trace shows (1) the packet path from the host's XFRM layer to the container's XFRM layer where it's dropped and (2) the number of active skb extensions at each function. NETNS MARK IFACE TUPLE FUNC 4026533547 d00 eth0 10.244.3.124:35473->10.244.2.158:53 xfrm_rcv_cb .active_extensions = (__u8)2, 4026533547 d00 eth0 10.244.3.124:35473->10.244.2.158:53 xfrm4_rcv_cb .active_extensions = (__u8)2, 4026533547 d00 eth0 10.244.3.124:35473->10.244.2.158:53 gro_cells_receive .active_extensions = (__u8)2, [...] 4026533547 0 eth0 10.244.3.124:35473->10.244.2.158:53 skb_do_redirect .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 ip_rcv .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 ip_rcv_core .active_extensions = (__u8)2, [...] 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 udp_queue_rcv_one_skb .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 __xfrm_policy_check .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 __xfrm_decode_session .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 security_xfrm_decode_session .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 kfree_skb_reason(SKB_DROP_REASON_XFRM_POLICY) .active_extensions = (__u8)2, In this case, there are no XFRM policies in the container's network namespace so the drop is unexpected. When we decrypt the IPsec packet, the XFRM state used for decryption is set in the skb extensions. This information is preserved across the netns switch. When we reach the XFRM policy check in the container's netns, __xfrm_policy_check drops the packet with LINUX_MIB_XFRMINNOPOLS because a (container-side) XFRM policy can't be found that matches the (host-side) XFRM state used for decryption. This patch fixes this by scrubbing the packet when using bpf_redirect_peer, as is done on typical netns switches via veth devices except skb->mark and skb->tstamp are not zeroed. Fixes: 9aa1206e8f482 ("bpf: Add redirect_peer helper") Signed-off-by: Paul Chaignon Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/1728ead5e0fe45e7a6542c36bd4e3ca07a73b7d6.1746460653.git.paul.chaignon@gmail.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index 6322613a421d..7b4cf9537f2d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2500,6 +2500,7 @@ int skb_do_redirect(struct sk_buff *skb) net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; + skb_scrub_packet(skb, false); return -EAGAIN; } return flags & BPF_F_NEIGH ? -- Gitee From 2380dacb59b5139b67abd7915a64272a31d8e69e Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Sat, 12 Apr 2025 17:25:54 +0800 Subject: [PATCH 06/17] block: fix resource leak in blk_register_queue() error path mainline inclusion from mainline-v6.15-rc3 commit 40f2eb9b531475dd01b683fdaf61ca3cfd03a51e category: bugfix issue: #ICG2DK CVE: CVE-2025-37980 Signed-off-by: Tengda Wu --------------------------------------- When registering a queue fails after blk_mq_sysfs_register() is successful but the function later encounters an error, we need to clean up the blk_mq_sysfs resources. Add the missing blk_mq_sysfs_unregister() call in the error path to properly clean up these resources and prevent a memory leak. Fixes: 320ae51feed5 ("blk-mq: new multi-queue block IO queueing mechanism") Signed-off-by: Zheng Qixing Reviewed-by: Christoph Hellwig Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250412092554.475218-1-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8e0827c30c46..91fd9ee7fe22 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -892,6 +892,8 @@ int blk_register_queue(struct gendisk *disk) if (ret) { mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); + if (queue_is_mq(q)) + blk_mq_unregister_dev(dev, q); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); -- Gitee From 93ef3e93dd514ffa36738e6941ccae22c336aa8b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 13 Mar 2022 21:15:02 -1000 Subject: [PATCH 07/17] block: fix rq-qos breakage from skipping rq_qos_done_bio() mainline inclusion from mainline-v5.18-rc1 commit aa1b46dcdc7baaf5fec0be25782ef24b26aa209e category: bugfix issue: #ICG2DK CVE: CVE-2022-49266 Signed-off-by: Tengda Wu --------------------------------------- a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked") made bio_endio() skip rq_qos_done_bio() if BIO_TRACKED is not set. While this fixed a potential oops, it also broke blk-iocost by skipping the done_bio callback for merged bios. Before, whether a bio goes through rq_qos_throttle() or rq_qos_merge(), rq_qos_done_bio() would be called on the bio on completion with BIO_TRACKED distinguishing the former from the latter. rq_qos_done_bio() is not called for bios which wenth through rq_qos_merge(). This royally confuses blk-iocost as the merged bios never finish and are considered perpetually in-flight. One reliably reproducible failure mode is an intermediate cgroup geting stuck active preventing its children from being activated due to the leaf-only rule, leading to loss of control. The following is from resctl-bench protection scenario which emulates isolating a web server like workload from a memory bomb run on an iocost configuration which should yield a reasonable level of protection. # cat /sys/block/nvme2n1/device/model Samsung SSD 970 PRO 512GB # cat /sys/fs/cgroup/io.cost.model 259:0 ctrl=user model=linear rbps=834913556 rseqiops=93622 rrandiops=102913 wbps=618985353 wseqiops=72325 wrandiops=71025 # cat /sys/fs/cgroup/io.cost.qos 259:0 enable=1 ctrl=user rpct=95.00 rlat=18776 wpct=95.00 wlat=8897 min=60.00 max=100.00 # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1 ... Memory Hog Summary ================== IO Latency: R p50=242u:336u/2.5m p90=794u:1.4m/7.5m p99=2.7m:8.0m/62.5m max=8.0m:36.4m/350m W p50=221u:323u/1.5m p90=709u:1.2m/5.5m p99=1.5m:2.5m/9.5m max=6.9m:35.9m/350m Isolation and Request Latency Impact Distributions: min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev isol% 15.90 15.90 15.90 40.05 57.24 59.07 60.01 74.63 74.63 90.35 90.35 58.12 15.82 lat-imp% 0 0 0 0 0 4.55 14.68 15.54 233.5 548.1 548.1 53.88 143.6 Result: isol=58.12:15.82% lat_imp=53.88%:143.6 work_csv=100.0% missing=3.96% The isolation result of 58.12% is close to what this device would show without any IO control. Fix it by introducing a new flag BIO_QOS_MERGED to mark merged bios and calling rq_qos_done_bio() on them too. For consistency and clarity, rename BIO_TRACKED to BIO_QOS_THROTTLED. The flag checks are moved into rq_qos_done_bio() so that it's next to the code paths that set the flags. With the patch applied, the above same benchmark shows: # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1 ... Memory Hog Summary ================== IO Latency: R p50=123u:84.4u/985u p90=322u:256u/2.5m p99=1.6m:1.4m/9.5m max=11.1m:36.0m/350m W p50=429u:274u/995u p90=1.7m:1.3m/4.5m p99=3.4m:2.7m/11.5m max=7.9m:5.9m/26.5m Isolation and Request Latency Impact Distributions: min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev isol% 84.91 84.91 89.51 90.73 92.31 94.49 96.36 98.04 98.71 100.0 100.0 94.42 2.81 lat-imp% 0 0 0 0 0 2.81 5.73 11.11 13.92 17.53 22.61 4.10 4.68 Result: isol=94.42:2.81% lat_imp=4.10%:4.68 work_csv=58.34% missing=0% Signed-off-by: Tejun Heo Fixes: a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked") Cc: stable@vger.kernel.org # v5.15+ Cc: Ming Lei Cc: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/Yi7rdrzQEHjJLGKB@slm.duckdns.org Signed-off-by: Jens Axboe --- block/bio.c | 3 +-- block/blk-iolatency.c | 2 +- block/blk-rq-qos.h | 23 +++++++++++++---------- include/linux/blk_types.h | 3 ++- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3caf222b50d5..fdaf42db55af 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1431,8 +1431,7 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk && bio_flagged(bio, BIO_TRACKED)) - rq_qos_done_bio(bio->bi_disk->queue, bio); + rq_qos_done_bio(bio); /* * Need to have a real endio function for chained bios, otherwise diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 74511a060d59..5fd3f3a8f88c 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -601,7 +601,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) int inflight = 0; blkg = bio->bi_blkg; - if (!blkg || !bio_flagged(bio, BIO_TRACKED)) + if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED)) return; iolat = blkg_to_lat(bio->bi_blkg); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2bcb3495e376..db83d01b62d0 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -189,21 +189,22 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) __rq_qos_requeue(q->rq_qos, rq); } -static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +static inline void rq_qos_done_bio(struct bio *bio) { - if (q->rq_qos) - __rq_qos_done_bio(q->rq_qos, bio); + if (bio->bi_disk && (bio_flagged(bio, BIO_QOS_THROTTLED) || + bio_flagged(bio, BIO_QOS_MERGED))) { + struct request_queue *q = bio->bi_disk->queue; + if (q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); + } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - /* - * BIO_TRACKED lets controllers know that a bio went through the - * normal rq_qos path. - */ - bio_set_flag(bio, BIO_TRACKED); - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); + } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, @@ -216,8 +217,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq, static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); + } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..afe45c26501a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -283,7 +283,8 @@ enum { BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ - BIO_TRACKED, /* set if bio goes through the rq_qos path */ + BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ + BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_FLAG_LAST }; -- Gitee From f6e3c940d6dca8d2c613b1304e59c0566e0a813b Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 18 Mar 2025 13:22:55 +0530 Subject: [PATCH 08/17] ext4: define ext4_journal_destroy wrapper mainline inclusion from mainline-v6.15-rc1 commit 5a02a6204ca37e7c22fbb55a789c503f05e8e89a category: bugfix issue: #ICG2DK CVE: CVE-2025-22113 Signed-off-by: Tengda Wu --------------------------------------- Define an ext4 wrapper over jbd2_journal_destroy to make sure we have consistent behavior during journal destruction. This will also come useful in the next patch where we add some ext4 specific logic in the destroy path. Reviewed-by: Jan Kara Reviewed-by: Baokun Li Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/c3ba78c5c419757e6d5f2d8ebb4a8ce9d21da86a.1742279837.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.h | 14 ++++++++++++++ fs/ext4/super.c | 13 +++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b9881ee1bf93..2a56017df396 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -510,4 +510,18 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 1; } +/* + * Pass journal explicitly as it may not be cached in the sbi->s_journal in some + * cases + */ +static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal) +{ + int err = 0; + + err = jbd2_journal_destroy(journal); + sbi->s_journal = NULL; + + return err; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8919c5d1a9cb..413765841d6f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1189,8 +1189,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } @@ -5170,8 +5169,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_journal) { /* flush s_error_work before journal destroy. */ flush_work(&sbi->s_error_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); @@ -5384,7 +5382,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, return journal; out_journal: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: ext4_blkdev_put(bdev); return NULL; @@ -5484,8 +5482,7 @@ static int ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { - EXT4_SB(sb)->s_journal = NULL; - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -5503,7 +5500,7 @@ static int ext4_load_journal(struct super_block *sb, return 0; err_out: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } -- Gitee From 42a173ff137011983627c1a52bf8acc5c901344e Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 18 Mar 2025 13:22:56 +0530 Subject: [PATCH 09/17] ext4: avoid journaling sb update on error if journal is destroying mainline inclusion from mainline-v6.15-rc1 commit ce2f26e73783b4a7c46a86e3af5b5c8de0971790 category: bugfix issue: #ICG2DK CVE: CVE-2025-22113 Signed-off-by: Tengda Wu --------------------------------------- Presently we always BUG_ON if trying to start a transaction on a journal marked with JBD2_UNMOUNT, since this should never happen. However, while ltp running stress tests, it was observed that in case of some error handling paths, it is possible for update_super_work to start a transaction after the journal is destroyed eg: (umount) ext4_kill_sb kill_block_super generic_shutdown_super sync_filesystem /* commits all txns */ evict_inodes /* might start a new txn */ ext4_put_super flush_work(&sbi->s_sb_upd_work) /* flush the workqueue */ jbd2_journal_destroy journal_kill_thread journal->j_flags |= JBD2_UNMOUNT; jbd2_journal_commit_transaction jbd2_journal_get_descriptor_buffer jbd2_journal_bmap ext4_journal_bmap ext4_map_blocks ... ext4_inode_error ext4_handle_error schedule_work(&sbi->s_sb_upd_work) /* work queue kicks in */ update_super_work jbd2_journal_start start_this_handle BUG_ON(journal->j_flags & JBD2_UNMOUNT) Hence, introduce a new mount flag to indicate journal is destroying and only do a journaled (and deferred) update of sb if this flag is not set. Otherwise, just fallback to an un-journaled commit. Further, in the journal destroy path, we have the following sequence: 1. Set mount flag indicating journal is destroying 2. force a commit and wait for it 3. flush pending sb updates This sequence is important as it ensures that, after this point, there is no sb update that might be journaled so it is safe to update the sb outside the journal. (To avoid race discussed in 2d01ddc86606) Also, we don't need a similar check in ext4_grp_locked_error since it is only called from mballoc and AFAICT it would be always valid to schedule work here. Fixes: 2d01ddc86606 ("ext4: save error info to sb through journal if available") Reported-by: Mahesh Kumar Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://patch.msgid.link/9613c465d6ff00cd315602f99283d5f24018c3f7.1742279837.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++- fs/ext4/ext4_jbd2.h | 15 +++++++++++++++ fs/ext4/super.c | 14 +++++++++----- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e3bb368a1559..1399b7cd6803 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1710,9 +1710,10 @@ enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FS_ABORTED, /* Fatal error detected */ EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ - EXT4_MF_FC_COMMITTING /* File system underoing a fast + EXT4_MF_FC_COMMITTING, /* File system underoing a fast * commit. */ + EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 2a56017df396..53f81ca2b076 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -518,6 +518,21 @@ static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *jour { int err = 0; + /* + * At this point only two things can be operating on the journal. + * JBD2 thread performing transaction commit and s_sb_upd_work + * issuing sb update through the journal. Once we set + * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not + * queue s_sb_upd_work and ext4_force_commit() makes sure any + * ext4_handle_error() calls from the running transaction commit are + * finished. Hence no new s_sb_upd_work can be queued after we + * flush it here. + */ + ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); + + ext4_force_commit(sbi->s_sb); + flush_work(&sbi->s_error_work); + err = jbd2_journal_destroy(journal); sbi->s_journal = NULL; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 413765841d6f..cf197f3791eb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -653,9 +653,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we - * defer superblock flushing to a workqueue. + * defer superblock flushing to a workqueue. We just need to be + * careful when the journal is already shutting down. If we get + * here in that case, just update the sb directly as the last + * transaction won't commit anyway. */ - if (continue_fs && journal) + if (continue_fs && journal && + !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) schedule_work(&EXT4_SB(sb)->s_error_work); else ext4_commit_super(sb); @@ -1184,7 +1188,6 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); - flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { @@ -1193,7 +1196,9 @@ static void ext4_put_super(struct super_block *sb) if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } - } + } else + flush_work(&sbi->s_error_work); + ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); @@ -5168,7 +5173,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_journal) { /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: -- Gitee From afe1d08988e068c5efef49b6ade3ec2baa3428b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Nov 2021 17:45:55 -0700 Subject: [PATCH 10/17] net: add and use skb_unclone_keeptruesize() helper mainline inclusion from mainline-v5.16-rc1 commit c4777efa751d293e369aec464ce6875e957be255 category: bugfix issue: #ICG2DK CVE: CVE-2022-49142 Signed-off-by: Tengda Wu --------------------------------------- While commit 097b9146c0e2 ("net: fix up truesize of cloned skb in skb_prepare_for_shift()") fixed immediate issues found when KFENCE was enabled/tested, there are still similar issues, when tcp_trim_head() hits KFENCE while the master skb is cloned. This happens under heavy networking TX workloads, when the TX completion might be delayed after incoming ACK. This patch fixes the WARNING in sk_stream_kill_queues when sk->sk_mem_queued/sk->sk_forward_alloc are not zero. Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB") Signed-off-by: Eric Dumazet Acked-by: Marco Elver Link: https://lore.kernel.org/r/20211102004555.1359210-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 ++++++++++++++++ net/core/skbuff.c | 14 +------------- net/ipv4/tcp_output.c | 6 +++--- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4e9b638f7587..dd3f6bfffe2d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1647,6 +1647,22 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) return 0; } +/* This variant of skb_unclone() makes sure skb->truesize is not changed */ +static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + might_sleep_if(gfpflags_allow_blocking(pri)); + + if (skb_cloned(skb)) { + unsigned int save = skb->truesize; + int res; + + res = pskb_expand_head(skb, 0, 0, pri); + skb->truesize = save; + return res; + } + return 0; +} + /** * skb_header_cloned - is the header a clone * @skb: buffer to check diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fca5e663f6b8..49fc51795d89 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3331,19 +3331,7 @@ EXPORT_SYMBOL(skb_split); */ static int skb_prepare_for_shift(struct sk_buff *skb) { - int ret = 0; - - if (skb_cloned(skb)) { - /* Save and restore truesize: pskb_expand_head() may reallocate - * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we - * cannot change truesize at this point. - */ - unsigned int save_truesize = skb->truesize; - - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - skb->truesize = save_truesize; - } - return ret; + return skb_unclone_keeptruesize(skb, GFP_ATOMIC); } /** diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b381f3a748dd..87ac4597d871 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1560,7 +1560,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; } - if (skb_unclone(skb, gfp)) + if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ @@ -1669,7 +1669,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); @@ -3216,7 +3216,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); -- Gitee From b54807fa2bf702f41e52893ff3fc6d6a93dce6a4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Feb 2022 19:21:12 -0800 Subject: [PATCH 11/17] net: add skb_set_end_offset() helper mainline inclusion from mainline-v5.18-rc1 commit 763087dab97547230a6807c865a6a5ae53a59247 category: bugfix issue: #ICG2DK CVE: CVE-2022-49142 Signed-off-by: Tengda Wu --------------------------------------- We have multiple places where this helper is convenient, and plan using it in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 10 ++++++++++ net/core/skbuff.c | 19 +++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dd3f6bfffe2d..d440d522c3da 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1418,6 +1418,11 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } + +static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) +{ + skb->end = offset; +} #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { @@ -1428,6 +1433,11 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } + +static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) +{ + skb->end = skb->head + offset; +} #endif /* Internal */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 49fc51795d89..305861af9ef3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -230,7 +230,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; + skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; @@ -1679,11 +1679,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->head = data; skb->head_frag = 0; skb->data += off; + + skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; off = nhead; -#else - skb->end = skb->head + size; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); @@ -6019,11 +6018,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb->head = data; skb->data = data; skb->head_frag = 0; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_set_tail_pointer(skb, skb_headlen(skb)); skb_headers_offset_update(skb, 0); skb->cloned = 0; @@ -6161,11 +6156,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb->head = data; skb->head_frag = 0; skb->data = data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_reset_tail_pointer(skb); skb_headers_offset_update(skb, 0); skb->cloned = 0; -- Gitee From 0a68665bf721ba15e87e9b001fd44371a45eba58 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Feb 2022 19:21:13 -0800 Subject: [PATCH 12/17] net: preserve skb_end_offset() in skb_unclone_keeptruesize() mainline inclusion from mainline-v5.18-rc1 commit 2b88cba55883eaafbc9b7cbff0b2c7cdba71ed01 category: bugfix issue: #ICG2DK CVE: CVE-2022-49142 Signed-off-by: Tengda Wu --------------------------------------- syzbot found another way to trigger the infamous WARN_ON_ONCE(delta < len) in skb_try_coalesce() [1] I was able to root cause the issue to kfence. When kfence is in action, the following assertion is no longer true: int size = xxxx; void *ptr1 = kmalloc(size, gfp); void *ptr2 = kmalloc(size, gfp); if (ptr1 && ptr2) ASSERT(ksize(ptr1) == ksize(ptr2)); We attempted to fix these issues in the blamed commits, but forgot that TCP was possibly shifting data after skb_unclone_keeptruesize() has been used, notably from tcp_retrans_try_collapse(). So we not only need to keep same skb->truesize value, we also need to make sure TCP wont fill new tailroom that pskb_expand_head() was able to get from a addr = kmalloc(...) followed by ksize(addr) Split skb_unclone_keeptruesize() into two parts: 1) Inline skb_unclone_keeptruesize() for the common case, when skb is not cloned. 2) Out of line __skb_unclone_keeptruesize() for the 'slow path'. WARNING: CPU: 1 PID: 6490 at net/core/skbuff.c:5295 skb_try_coalesce+0x1235/0x1560 net/core/skbuff.c:5295 Modules linked in: CPU: 1 PID: 6490 Comm: syz-executor161 Not tainted 5.17.0-rc4-syzkaller-00229-g4f12b742eb2b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_try_coalesce+0x1235/0x1560 net/core/skbuff.c:5295 Code: bf 01 00 00 00 0f b7 c0 89 c6 89 44 24 20 e8 62 24 4e fa 8b 44 24 20 83 e8 01 0f 85 e5 f0 ff ff e9 87 f4 ff ff e8 cb 20 4e fa <0f> 0b e9 06 f9 ff ff e8 af b2 95 fa e9 69 f0 ff ff e8 95 b2 95 fa RSP: 0018:ffffc900063af268 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 00000000ffffffd5 RCX: 0000000000000000 RDX: ffff88806fc05700 RSI: ffffffff872abd55 RDI: 0000000000000003 RBP: ffff88806e675500 R08: 00000000ffffffd5 R09: 0000000000000000 R10: ffffffff872ab659 R11: 0000000000000000 R12: ffff88806dd554e8 R13: ffff88806dd9bac0 R14: ffff88806dd9a2c0 R15: 0000000000000155 FS: 00007f18014f9700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020002000 CR3: 000000006be7a000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_try_coalesce net/ipv4/tcp_input.c:4651 [inline] tcp_try_coalesce+0x393/0x920 net/ipv4/tcp_input.c:4630 tcp_queue_rcv+0x8a/0x6e0 net/ipv4/tcp_input.c:4914 tcp_data_queue+0x11fd/0x4bb0 net/ipv4/tcp_input.c:5025 tcp_rcv_established+0x81e/0x1ff0 net/ipv4/tcp_input.c:5947 tcp_v4_do_rcv+0x65e/0x980 net/ipv4/tcp_ipv4.c:1719 sk_backlog_rcv include/net/sock.h:1037 [inline] __release_sock+0x134/0x3b0 net/core/sock.c:2779 release_sock+0x54/0x1b0 net/core/sock.c:3311 sk_wait_data+0x177/0x450 net/core/sock.c:2821 tcp_recvmsg_locked+0xe28/0x1fd0 net/ipv4/tcp.c:2457 tcp_recvmsg+0x137/0x610 net/ipv4/tcp.c:2572 inet_recvmsg+0x11b/0x5e0 net/ipv4/af_inet.c:850 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_recvmsg net/socket.c:962 [inline] ____sys_recvmsg+0x2c4/0x600 net/socket.c:2632 ___sys_recvmsg+0x127/0x200 net/socket.c:2674 __sys_recvmsg+0xe2/0x1a0 net/socket.c:2704 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: c4777efa751d ("net: add and use skb_unclone_keeptruesize() helper") Fixes: 097b9146c0e2 ("net: fix up truesize of cloned skb in skb_prepare_for_shift()") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Marco Elver Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 18 +++++++++--------- net/core/skbuff.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d440d522c3da..751d1e3535b1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1657,19 +1657,19 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) return 0; } -/* This variant of skb_unclone() makes sure skb->truesize is not changed */ +/* This variant of skb_unclone() makes sure skb->truesize + * and skb_end_offset() are not changed, whenever a new skb->head is needed. + * + * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) + * when various debugging features are in place. + */ +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); - if (skb_cloned(skb)) { - unsigned int save = skb->truesize; - int res; - - res = pskb_expand_head(skb, 0, 0, pri); - skb->truesize = save; - return res; - } + if (skb_cloned(skb)) + return __skb_unclone_keeptruesize(skb, pri); return 0; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 305861af9ef3..1d55dc759f62 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1730,6 +1730,38 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } EXPORT_SYMBOL(skb_realloc_headroom); +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + unsigned int saved_end_offset, saved_truesize; + struct skb_shared_info *shinfo; + int res; + + saved_end_offset = skb_end_offset(skb); + saved_truesize = skb->truesize; + + res = pskb_expand_head(skb, 0, 0, pri); + if (res) + return res; + + skb->truesize = saved_truesize; + + if (likely(skb_end_offset(skb) == saved_end_offset)) + return 0; + + shinfo = skb_shinfo(skb); + + /* We are about to change back skb->end, + * we need to move skb_shinfo() to its new location. + */ + memmove(skb->head + saved_end_offset, + shinfo, + offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); + + skb_set_end_offset(skb, saved_end_offset); + + return 0; +} + /** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy -- Gitee From 5bbb81ef93253b06a9923aff181cc71e20426bc3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 6 Dec 2024 12:40:11 +0000 Subject: [PATCH 13/17] net: stmmac: fix TSO DMA API usage causing oops mainline inclusion from mainline-v6.13-rc3 commit 4c49f38e20a57f8abaebdf95b369295b153d1f8e category: bugfix issue: #ICG2DK CVE: CVE-2024-56719 Signed-off-by: Tengda Wu --------------------------------------- Commit 66600fac7a98 ("net: stmmac: TSO: Fix unbalanced DMA map/unmap for non-paged SKB data") moved the assignment of tx_skbuff_dma[]'s members to be later in stmmac_tso_xmit(). The buf (dma cookie) and len stored in this structure are passed to dma_unmap_single() by stmmac_tx_clean(). The DMA API requires that the dma cookie passed to dma_unmap_single() is the same as the value returned from dma_map_single(). However, by moving the assignment later, this is not the case when priv->dma_cap.addr64 > 32 as "des" is offset by proto_hdr_len. This causes problems such as: dwc-eth-dwmac 2490000.ethernet eth0: Tx DMA map failed and with DMA_API_DEBUG enabled: DMA-API: dwc-eth-dwmac 2490000.ethernet: device driver tries to +free DMA memory it has not allocated [device address=0x000000ffffcf65c0] [size=66 bytes] Fix this by maintaining "des" as the original DMA cookie, and use tso_des to pass the offset DMA cookie to stmmac_tso_allocator(). Full details of the crashes can be found at: https://lore.kernel.org/all/d8112193-0386-4e14-b516-37c2d838171a@nvidia.com/ https://lore.kernel.org/all/klkzp5yn5kq5efgtrow6wbvnc46bcqfxs65nz3qy77ujr5turc@bwwhelz2l4dw/ Reported-by: Jon Hunter Reported-by: Thierry Reding Fixes: 66600fac7a98 ("net: stmmac: TSO: Fix unbalanced DMA map/unmap for non-paged SKB data") Tested-by: Jon Hunter Signed-off-by: Russell King (Oracle) Reviewed-by: Furong Xu <0x1207@gmail.com> Link: https://patch.msgid.link/E1tJXcx-006N4Z-PC@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 650c7d4be82f..96ac731fd478 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3180,9 +3180,9 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) unsigned int first_entry, tx_packets; struct stmmac_tx_queue *tx_q; bool has_vlan, set_ic; + dma_addr_t tso_des, des; u8 proto_hdr_len, hdr; u32 pay_len, mss; - dma_addr_t des; int i; tx_q = &priv->tx_queue[queue]; @@ -3269,14 +3269,15 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) /* If needed take extra descriptors to fill the remaining payload */ tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE; + tso_des = des; } else { stmmac_set_desc_addr(priv, first, des); tmp_pay_len = pay_len; - des += proto_hdr_len; + tso_des = des + proto_hdr_len; pay_len = 0; } - stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue); + stmmac_tso_allocator(priv, tso_des, tmp_pay_len, (nfrags == 0), queue); /* Prepare fragments */ for (i = 0; i < nfrags; i++) { -- Gitee From 2a26c7d266a174d1ec4313e95b60e4516bf819be Mon Sep 17 00:00:00 2001 From: Hyeong-Jun Kim Date: Tue, 2 Nov 2021 16:10:02 +0900 Subject: [PATCH 14/17] f2fs: invalidate META_MAPPING before IPU/DIO write mainline inclusion from mainline-v5.16-rc1 commit e3b49ea36802053f312013fd4ccb6e59920a9f76 category: bugfix issue: #ICG2DK CVE: CVE-2024-26869 Signed-off-by: May27May --------------------------------------- Encrypted pages during GC are read and cached in META_MAPPING. However, due to cached pages in META_MAPPING, there is an issue where newly written pages are lost by IPU or DIO writes. Thread A - f2fs_gc() Thread B /* phase 3 */ down_write(i_gc_rwsem) ra_data_block() ---- (a) up_write(i_gc_rwsem) f2fs_direct_IO() : - down_read(i_gc_rwsem) - __blockdev_direct_io() - get_data_block_dio_write() - f2fs_dio_submit_bio() ---- (b) - up_read(i_gc_rwsem) /* phase 4 */ down_write(i_gc_rwsem) move_data_block() ---- (c) up_write(i_gc_rwsem) (a) In phase 3 of f2fs_gc(), up-to-date page is read from storage and cached in META_MAPPING. (b) In thread B, writing new data by IPU or DIO write on same blkaddr as read in (a). cached page in META_MAPPING become out-dated. (c) In phase 4 of f2fs_gc(), out-dated page in META_MAPPING is copied to new blkaddr. In conclusion, the newly written data in (b) is lost. To address this issue, invalidating pages in META_MAPPING before IPU or DIO write. Fixes: 6aa58d8ad20a ("f2fs: readahead encrypted block during GC") Signed-off-by: Hyeong-Jun Kim Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: May27May Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 5 ++++- fs/f2fs/segment.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fd8722da9c4f..709323fb5a02 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1719,9 +1719,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, sync_out: /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + invalidate_mapping_pages(META_MAPPING(sbi), + map->m_pblk, map->m_pblk); + } if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2a54d7b89dd6..73fffb11d42c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3725,6 +3725,9 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) return -EFSCORRUPTED; } + invalidate_mapping_pages(META_MAPPING(sbi), + fio->new_blkaddr, fio->new_blkaddr); + stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) -- Gitee From 94e0b6bc863ae60295d43dd24af483558b1f8bd1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Jul 2022 14:30:15 +0800 Subject: [PATCH 15/17] f2fs: fix to invalidate META_MAPPING before DIO write mainline inclusion from mainline-v6.0-rc1 commit 67ca06872eb02944b4c6f92cffa9242e92c63109 category: bugfix issue: #ICG2DK CVE: CVE-2024-26869 Signed-off-by: May27May --------------------------------------- Quoted from commit e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write") " Encrypted pages during GC are read and cached in META_MAPPING. However, due to cached pages in META_MAPPING, there is an issue where newly written pages are lost by IPU or DIO writes. Thread A - f2fs_gc() Thread B /* phase 3 */ down_write(i_gc_rwsem) ra_data_block() ---- (a) up_write(i_gc_rwsem) f2fs_direct_IO() : - down_read(i_gc_rwsem) - __blockdev_direct_io() - get_data_block_dio_write() - f2fs_dio_submit_bio() ---- (b) - up_read(i_gc_rwsem) /* phase 4 */ down_write(i_gc_rwsem) move_data_block() ---- (c) up_write(i_gc_rwsem) (a) In phase 3 of f2fs_gc(), up-to-date page is read from storage and cached in META_MAPPING. (b) In thread B, writing new data by IPU or DIO write on same blkaddr as read in (a). cached page in META_MAPPING become out-dated. (c) In phase 4 of f2fs_gc(), out-dated page in META_MAPPING is copied to new blkaddr. In conclusion, the newly written data in (b) is lost. To address this issue, invalidating pages in META_MAPPING before IPU or DIO write. " In previous commit, we missed to cover extent cache hit case, and passed wrong value for parameter @end of invalidate_mapping_pages(), fix both issues. Fixes: 6aa58d8ad20a ("f2fs: readahead encrypted block during GC") Fixes: e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write") Cc: Hyeong-Jun Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: May27May Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 709323fb5a02..0f33084e149e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1540,9 +1540,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, *map->m_next_extent = pgofs + map->m_len; /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO) + if (flag == F2FS_GET_BLOCK_DIO) { f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + invalidate_mapping_pages(META_MAPPING(sbi), + map->m_pblk, map->m_pblk + map->m_len - 1); + } goto out; } @@ -1723,7 +1726,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); invalidate_mapping_pages(META_MAPPING(sbi), - map->m_pblk, map->m_pblk); + map->m_pblk, map->m_pblk + map->m_len - 1); } if (flag == F2FS_GET_BLOCK_PRECACHE) { -- Gitee From 7a4693067306b1515987f63ccb20889be8c07e0b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Jul 2022 23:26:43 +0800 Subject: [PATCH 16/17] f2fs: invalidate meta pages only for post_read required inode mainline inclusion from mainline-v6.0-rc1 commit 0d5b9d8156396bbe1c982708b38ab9e188c45ec9 category: bugfix issue: #ICG2DK CVE: CVE-2024-26869 Signed-off-by: May27May --------------------------------------- After commit e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write"), invalidate_mapping_pages() will be called to avoid race condition in between IPU/DIO and readahead for GC. However, readahead flow is only used for post_read required inode, so this patch adds check condition to avoids unnecessary page cache invalidating for non-post_read inode. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: May27May Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 11 +++-------- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 9 ++++++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0f33084e149e..8b8981d426cc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1540,12 +1540,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, *map->m_next_extent = pgofs + map->m_len; /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO) { + if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); - invalidate_mapping_pages(META_MAPPING(sbi), - map->m_pblk, map->m_pblk + map->m_len - 1); - } goto out; } @@ -1722,12 +1719,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, sync_out: /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); - invalidate_mapping_pages(META_MAPPING(sbi), - map->m_pblk, map->m_pblk + map->m_len - 1); - } if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { @@ -2809,6 +2803,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .submitted = false, .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, + .post_read = f2fs_post_read_required(inode), .io_type = io_type, .io_wbc = wbc, .bio = bio, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 702ac7d98333..82a381d62252 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1143,6 +1143,7 @@ struct f2fs_io_info { bool retry; /* need to reallocate block address */ int compr_blocks; /* # of compressed block addresses */ bool encrypted; /* indicate file is encrypted */ + bool post_read; /* require post read */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 73fffb11d42c..7a7cc9630362 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3725,7 +3725,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) return -EFSCORRUPTED; } - invalidate_mapping_pages(META_MAPPING(sbi), + if (fio->post_read) + invalidate_mapping_pages(META_MAPPING(sbi), fio->new_blkaddr, fio->new_blkaddr); stat_inc_inplace_blocks(fio->sbi); @@ -3894,10 +3895,16 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t i; + if (!f2fs_post_read_required(inode)) + return; + for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); + + invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) -- Gitee From 641c350bda48b24b41fe9939db9fa5af8feaaef7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 8 Mar 2024 09:08:34 +0800 Subject: [PATCH 17/17] f2fs: fix to truncate meta inode pages forcely stable inclusion from stable-v6.6.23 commit c92f2927df860a60ba815d3ee610a944b92a8694 category: bugfix issue: #ICG2DK CVE: CVE-2024-26869 Signed-off-by: May27May --------------------------------------- [ Upstream commit 9f0c4a46be1fe9b97dbe66d49204c1371e3ece65 ] Below race case can cause data corruption: Thread A GC thread - gc_data_segment - ra_data_block - locked meta_inode page - f2fs_inplace_write_data - invalidate_mapping_pages : fail to invalidate meta_inode page due to lock failure or dirty|writeback status - f2fs_submit_page_bio : write last dirty data to old blkaddr - move_data_block - load old data from meta_inode page - f2fs_submit_page_write : write old data to new blkaddr Because invalidate_mapping_pages() will skip invalidating page which has unclear status including locked, dirty, writeback and so on, so we need to use truncate_inode_pages_range() instead of invalidate_mapping_pages() to make sure meta_inode page will be dropped. Fixes: 6aa58d8ad20a ("f2fs: readahead encrypted block during GC") Fixes: e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin Signed-off-by: May27May Conflicts: fs/f2fs/f2fs.h --- fs/f2fs/checkpoint.c | 5 +++-- fs/f2fs/f2fs.h | 26 ++++++++++++++++++++++++++ fs/f2fs/segment.c | 5 ++--- include/linux/f2fs_fs.h | 1 + 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8ca549cc975e..8a94bf3611d3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1551,8 +1551,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) */ if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) || f2fs_sb_has_compression(sbi)) - invalidate_mapping_pages(META_MAPPING(sbi), - MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); + f2fs_bug_on(sbi, + invalidate_inode_pages2_range(META_MAPPING(sbi), + MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1)); f2fs_release_ino_entry(sbi, false); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 82a381d62252..80c79159d7b6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4187,6 +4187,32 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) return false; } +static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int cnt) +{ + bool need_submit = false; + int i = 0; + + do { + struct page *page; + + page = find_get_page(META_MAPPING(sbi), blkaddr + i); + if (page) { + if (PageWriteback(page)) + need_submit = true; + f2fs_put_page(page, 0); + } + } while (++i < cnt && !need_submit); + + if (need_submit) + f2fs_submit_merged_write_cond(sbi, sbi->meta_inode, + NULL, 0, DATA); + + truncate_inode_pages_range(META_MAPPING(sbi), + F2FS_BLK_TO_BYTES((loff_t)blkaddr), + F2FS_BLK_END_BYTES((loff_t)(blkaddr + cnt - 1))); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7a7cc9630362..f76cab61c0ad 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3726,8 +3726,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) } if (fio->post_read) - invalidate_mapping_pages(META_MAPPING(sbi), - fio->new_blkaddr, fio->new_blkaddr); + f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1); stat_inc_inplace_blocks(fio->sbi); @@ -3904,7 +3903,7 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); - invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1); + f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index a5dbb57a687f..7fd4bea6be64 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -27,6 +27,7 @@ #define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS) #define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS) +#define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1) /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 -- Gitee