From 7450cfada3594bdb15cb4e316f2dc58f5b6b109e Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:39:58 +0800 Subject: [PATCH 01/58] rds: rds_rm_zerocopy_callback() use list_first_entry() ANBZ: #4540 commit f753a68980cf4b59a80fe677619da2b1804f526d upstream. rds_rm_zerocopy_callback() uses list_entry() on the head of a list causing a type confusion. Use list_first_entry() to actually access the first element of the rs_zcookie_queue list. Fixes: 9426bbc6de99 ("rds: use list structure to track information for zerocopy completion notification") Reviewed-by: Willem de Bruijn Signed-off-by: Pietro Borrello Link: https://lore.kernel.org/r/20230202-rds-zerocopy-v3-1-83b0df974f9a@diag.uniroma1.it Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Fixes: CVE-2023-1078 Signed-off-by: Xiao Long Reviewed-by: Xuan Zhuo Link: https://gitee.com/anolis/cloud-kernel/pulls/1604 --- net/rds/message.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/rds/message.c b/net/rds/message.c index 4b00b1152a5f..309b54cc62ae 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -104,9 +104,9 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, spin_lock_irqsave(&q->lock, flags); head = &q->zcookie_head; if (!list_empty(head)) { - info = list_entry(head, struct rds_msg_zcopy_info, - rs_zcookie_next); - if (info && rds_zcookie_add(info, cookie)) { + info = list_first_entry(head, struct rds_msg_zcopy_info, + rs_zcookie_next); + if (rds_zcookie_add(info, cookie)) { spin_unlock_irqrestore(&q->lock, flags); kfree(rds_info_from_znotifier(znotif)); /* caller invokes rds_wake_sk_sleep() */ -- Gitee From f9fd06dad1404e060d05d53515b2016cfba23144 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:39:58 +0800 Subject: [PATCH 02/58] selinux: fix race condition when computing ocontext SIDs ANBZ: #5576 commit cbfcd13be5cb2a07868afe67520ed181956579a7 upstream. Signed-off-by: Ondrej Mosnacek Signed-off-by: Paul Moore Reviewed-by: Tianjia Zhang Link: https://gitee.com/anolis/cloud-kernel/pulls/1773 --- security/selinux/ss/services.c | 159 ++++++++++++++++++--------------- 1 file changed, 87 insertions(+), 72 deletions(-) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index f3def298a90e..12763b6c7d51 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -2263,6 +2263,43 @@ size_t security_policydb_len(struct selinux_state *state) return len; } +/** + * ocontext_to_sid - Helper to safely get sid for an ocontext + * @sidtab: SID table + * @c: ocontext structure + * @index: index of the context entry (0 or 1) + * @out_sid: pointer to the resulting SID value + * + * For all ocontexts except OCON_ISID the SID fields are populated + * on-demand when needed. Since updating the SID value is an SMP-sensitive + * operation, this helper must be used to do that safely. + * + * WARNING: This function may return -ESTALE, indicating that the caller + * must retry the operation after re-acquiring the policy pointer! + */ +static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c, + size_t index, u32 *out_sid) +{ + int rc; + u32 sid; + + /* Ensure the associated sidtab entry is visible to this thread. */ + sid = smp_load_acquire(&c->sid[index]); + if (!sid) { + rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid); + if (rc) + return rc; + + /* + * Ensure the new sidtab entry is visible to other threads + * when they see the SID. + */ + smp_store_release(&c->sid[index], sid); + } + *out_sid = sid; + return 0; +} + /** * security_port_sid - Obtain the SID for a port. * @protocol: protocol number @@ -2275,10 +2312,12 @@ int security_port_sid(struct selinux_state *state, struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; - int rc = 0; + int rc; read_lock(&state->ss->policy_rwlock); +retry: + rc = 0; policydb = &state->ss->policydb; sidtab = &state->ss->sidtab; @@ -2292,14 +2331,11 @@ int security_port_sid(struct selinux_state *state, } if (c) { - if (!c->sid[0]) { - rc = sidtab_context_to_sid(sidtab, - &c->context[0], - &c->sid[0]); - if (rc) - goto out; - } - *out_sid = c->sid[0]; + rc = ocontext_to_sid(sidtab, c, 0, out_sid); + if (rc == -ESTALE) + goto retry; + if (rc) + goto out; } else { *out_sid = SECINITSID_PORT; } @@ -2321,10 +2357,12 @@ int security_ib_pkey_sid(struct selinux_state *state, struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; - int rc = 0; + int rc; read_lock(&state->ss->policy_rwlock); +retry: + rc = 0; policydb = &state->ss->policydb; sidtab = &state->ss->sidtab; @@ -2339,14 +2377,11 @@ int security_ib_pkey_sid(struct selinux_state *state, } if (c) { - if (!c->sid[0]) { - rc = sidtab_context_to_sid(sidtab, - &c->context[0], - &c->sid[0]); - if (rc) - goto out; - } - *out_sid = c->sid[0]; + rc = ocontext_to_sid(sidtab, c, 0, out_sid); + if (rc == -ESTALE) + goto retry; + if (rc) + goto out; } else *out_sid = SECINITSID_UNLABELED; @@ -2367,10 +2402,12 @@ int security_ib_endport_sid(struct selinux_state *state, struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; - int rc = 0; + int rc; read_lock(&state->ss->policy_rwlock); +retry: + rc = 0; policydb = &state->ss->policydb; sidtab = &state->ss->sidtab; @@ -2386,14 +2423,11 @@ int security_ib_endport_sid(struct selinux_state *state, } if (c) { - if (!c->sid[0]) { - rc = sidtab_context_to_sid(sidtab, - &c->context[0], - &c->sid[0]); - if (rc) - goto out; - } - *out_sid = c->sid[0]; + rc = ocontext_to_sid(sidtab, c, 0, out_sid); + if (rc == -ESTALE) + goto retry; + if (rc) + goto out; } else *out_sid = SECINITSID_UNLABELED; @@ -2412,11 +2446,13 @@ int security_netif_sid(struct selinux_state *state, { struct policydb *policydb; struct sidtab *sidtab; - int rc = 0; + int rc; struct ocontext *c; read_lock(&state->ss->policy_rwlock); +retry: + rc = 0; policydb = &state->ss->policydb; sidtab = &state->ss->sidtab; @@ -2428,19 +2464,11 @@ int security_netif_sid(struct selinux_state *state, } if (c) { - if (!c->sid[0] || !c->sid[1]) { - rc = sidtab_context_to_sid(sidtab, - &c->context[0], - &c->sid[0]); - if (rc) - goto out; - rc = sidtab_context_to_sid(sidtab, - &c->context[1], - &c->sid[1]); - if (rc) - goto out; - } - *if_sid = c->sid[0]; + rc = ocontext_to_sid(sidtab, c, 0, if_sid); + if (rc == -ESTALE) + goto retry; + if (rc) + goto out; } else *if_sid = SECINITSID_NETIF; @@ -2482,6 +2510,7 @@ int security_node_sid(struct selinux_state *state, read_lock(&state->ss->policy_rwlock); +retry: policydb = &state->ss->policydb; sidtab = &state->ss->sidtab; @@ -2524,14 +2553,11 @@ int security_node_sid(struct selinux_state *state, } if (c) { - if (!c->sid[0]) { - rc = sidtab_context_to_sid(sidtab, - &c->context[0], - &c->sid[0]); - if (rc) - goto out; - } - *out_sid = c->sid[0]; + rc = ocontext_to_sid(sidtab, c, 0, out_sid); + if (rc == -ESTALE) + goto retry; + if (rc) + goto out; } else { *out_sid = SECINITSID_NODE; } @@ -2690,7 +2716,7 @@ static inline int __security_genfs_sid(struct selinux_state *state, u16 sclass; struct genfs *genfs; struct ocontext *c; - int rc, cmp = 0; + int cmp = 0; while (path[0] == '/' && path[1] == '/') path++; @@ -2704,9 +2730,8 @@ static inline int __security_genfs_sid(struct selinux_state *state, break; } - rc = -ENOENT; if (!genfs || cmp) - goto out; + return -ENOENT; for (c = genfs->head; c; c = c->next) { len = strlen(c->u.name); @@ -2715,20 +2740,10 @@ static inline int __security_genfs_sid(struct selinux_state *state, break; } - rc = -ENOENT; if (!c) - goto out; - - if (!c->sid[0]) { - rc = sidtab_context_to_sid(sidtab, &c->context[0], &c->sid[0]); - if (rc) - goto out; - } + return -ENOENT; - *sid = c->sid[0]; - rc = 0; -out: - return rc; + return ocontext_to_sid(sidtab, c, 0, sid); } /** @@ -2763,13 +2778,15 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb) { struct policydb *policydb; struct sidtab *sidtab; - int rc = 0; + int rc; struct ocontext *c; struct superblock_security_struct *sbsec = sb->s_security; const char *fstype = sb->s_type->name; read_lock(&state->ss->policy_rwlock); +retry: + rc = 0; policydb = &state->ss->policydb; sidtab = &state->ss->sidtab; @@ -2782,13 +2799,11 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb) if (c) { sbsec->behavior = c->v.behavior; - if (!c->sid[0]) { - rc = sidtab_context_to_sid(sidtab, &c->context[0], - &c->sid[0]); - if (rc) - goto out; - } - sbsec->sid = c->sid[0]; + rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid); + if (rc == -ESTALE) + goto retry; + if (rc) + goto out; } else { rc = __security_genfs_sid(state, fstype, "/", SECCLASS_DIR, &sbsec->sid); -- Gitee From 62fa27cd3b56870ea07a0c5cb1171c5568ccbe1c Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:39:59 +0800 Subject: [PATCH 03/58] fs: hfsplus: fix UAF issue in hfsplus_put_super ANBZ: #5381 commit 07db5e247ab5858439b14dd7cc1fe538b9efcf32 upstream. The current hfsplus_put_super first calls hfs_btree_close on sbi->ext_tree, then invokes iput on sbi->hidden_dir, resulting in an use-after-free issue in hfsplus_release_folio. As shown in hfsplus_fill_super, the error handling code also calls iput before hfs_btree_close. To fix this error, we move all iput calls before hfsplus_btree_close. Note that this patch is tested on Syzbot. Link: https://lkml.kernel.org/r/20230226124948.3175736-1-mudongliangabcd@gmail.com Reported-by: syzbot+57e3e98f7e3b80f64d56@syzkaller.appspotmail.com Tested-by: Dongliang Mu Signed-off-by: Dongliang Mu Cc: Bart Van Assche Cc: Jens Axboe Cc: Muchun Song Cc: Roman Gushchin Cc: "Theodore Ts'o" Cc: Signed-off-by: Andrew Morton Fixes: CVE-2023-2985 Signed-off-by: Gao Xiang Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1777 --- fs/hfsplus/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index eb4535eba95d..3b1356b10a47 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -294,11 +294,11 @@ static void hfsplus_put_super(struct super_block *sb) hfsplus_sync_fs(sb, 1); } + iput(sbi->alloc_file); + iput(sbi->hidden_dir); hfs_btree_close(sbi->attr_tree); hfs_btree_close(sbi->cat_tree); hfs_btree_close(sbi->ext_tree); - iput(sbi->alloc_file); - iput(sbi->hidden_dir); kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); unload_nls(sbi->nls); -- Gitee From 13ff6e1dcb2e111aea2a3b3b75e2f7a41dd58ad3 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:41:48 +0800 Subject: [PATCH 04/58] ntfs: fix use-after-free in ntfs_ucsncmp() ANBZ: #5368 commit 38c9c22a85aeed28d0831f230136e9cf6fa2ed44 upstream. Syzkaller reported use-after-free bug as follows: ================================================================== BUG: KASAN: use-after-free in ntfs_ucsncmp+0x123/0x130 Read of size 2 at addr ffff8880751acee8 by task a.out/879 CPU: 7 PID: 879 Comm: a.out Not tainted 5.19.0-rc4-next-20220630-00001-gcc5218c8bd2c-dirty #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x1c0/0x2b0 print_address_description.constprop.0.cold+0xd4/0x484 print_report.cold+0x55/0x232 kasan_report+0xbf/0xf0 ntfs_ucsncmp+0x123/0x130 ntfs_are_names_equal.cold+0x2b/0x41 ntfs_attr_find+0x43b/0xb90 ntfs_attr_lookup+0x16d/0x1e0 ntfs_read_locked_attr_inode+0x4aa/0x2360 ntfs_attr_iget+0x1af/0x220 ntfs_read_locked_inode+0x246c/0x5120 ntfs_iget+0x132/0x180 load_system_files+0x1cc6/0x3480 ntfs_fill_super+0xa66/0x1cf0 mount_bdev+0x38d/0x460 legacy_get_tree+0x10d/0x220 vfs_get_tree+0x93/0x300 do_new_mount+0x2da/0x6d0 path_mount+0x496/0x19d0 __x64_sys_mount+0x284/0x300 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7f3f2118d9ea Code: 48 8b 0d a9 f4 0b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 76 f4 0b 00 f7 d8 64 89 01 48 RSP: 002b:00007ffc269deac8 EFLAGS: 00000202 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3f2118d9ea RDX: 0000000020000000 RSI: 0000000020000100 RDI: 00007ffc269dec00 RBP: 00007ffc269dec80 R08: 00007ffc269deb00 R09: 00007ffc269dec44 R10: 0000000000000000 R11: 0000000000000202 R12: 000055f81ab1d220 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 The buggy address belongs to the physical page: page:0000000085430378 refcount:1 mapcount:1 mapping:0000000000000000 index:0x555c6a81d pfn:0x751ac memcg:ffff888101f7e180 anon flags: 0xfffffc00a0014(uptodate|lru|mappedtodisk|swapbacked|node=0|zone=1|lastcpupid=0x1fffff) raw: 000fffffc00a0014 ffffea0001bf2988 ffffea0001de2448 ffff88801712e201 raw: 0000000555c6a81d 0000000000000000 0000000100000000 ffff888101f7e180 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880751acd80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8880751ace00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff8880751ace80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ^ ffff8880751acf00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8880751acf80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== The reason is that struct ATTR_RECORD->name_offset is 6485, end address of name string is out of bounds. Fix this by adding sanity check on end address of attribute name string. [akpm@linux-foundation.org: coding-style cleanups] [chenxiaosong2@huawei.com: cleanup suggested by Hawkins Jiawei] Link: https://lkml.kernel.org/r/20220709064511.3304299-1-chenxiaosong2@huawei.com Link: https://lkml.kernel.org/r/20220707105329.4020708-1-chenxiaosong2@huawei.com Signed-off-by: ChenXiaoSong Signed-off-by: Hawkins Jiawei Cc: Anton Altaparmakov Cc: ChenXiaoSong Cc: Yongqiang Liu Cc: Zhang Yi Cc: Zhang Xiaoxu Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jingbo Xu Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1779 --- fs/ntfs/attrib.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 44a39a099b54..62b49197e5f6 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -606,8 +606,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, a = (ATTR_RECORD*)((u8*)ctx->attr + le32_to_cpu(ctx->attr->length)); for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated)) + u8 *mrec_end = (u8 *)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated); + u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || + name_end > mrec_end) break; ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || -- Gitee From 425755e268c246d7c30268cce192880b8d742998 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:41:49 +0800 Subject: [PATCH 05/58] ntfs: fix out-of-bounds read in ntfs_attr_find() ANBZ: #5368 commit 36a4d82dddbbd421d2b8e79e1cab68c8126d5075 upstream. Kernel iterates over ATTR_RECORDs in mft record in ntfs_attr_find(). To ensure access on these ATTR_RECORDs are within bounds, kernel will do some checking during iteration. The problem is that during checking whether ATTR_RECORD's name is within bounds, kernel will dereferences the ATTR_RECORD name_offset field, before checking this ATTR_RECORD strcture is within bounds. This problem may result out-of-bounds read in ntfs_attr_find(), reported by Syzkaller: ================================================================== BUG: KASAN: use-after-free in ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 Read of size 2 at addr ffff88807e352009 by task syz-executor153/3607 [...] Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:317 [inline] print_report.cold+0x2ba/0x719 mm/kasan/report.c:433 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495 ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597 ntfs_attr_lookup+0x1056/0x2070 fs/ntfs/attrib.c:1193 ntfs_read_inode_mount+0x89a/0x2580 fs/ntfs/inode.c:1845 ntfs_fill_super+0x1799/0x9320 fs/ntfs/super.c:2854 mount_bdev+0x34d/0x410 fs/super.c:1400 legacy_get_tree+0x105/0x220 fs/fs_context.c:610 vfs_get_tree+0x89/0x2f0 fs/super.c:1530 do_new_mount fs/namespace.c:3040 [inline] path_mount+0x1326/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] The buggy address belongs to the physical page: page:ffffea0001f8d400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7e350 head:ffffea0001f8d400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888011842140 raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88807e351f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88807e351f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88807e352000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807e352080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807e352100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== This patch solves it by moving the ATTR_RECORD strcture's bounds checking earlier, then checking whether ATTR_RECORD's name is within bounds. What's more, this patch also add some comments to improve its maintainability. Link: https://lkml.kernel.org/r/20220831160935.3409-3-yin31149@gmail.com Link: https://lore.kernel.org/all/1636796c-c85e-7f47-e96f-e074fee3c7d3@huawei.com/ Link: https://groups.google.com/g/syzkaller-bugs/c/t_XdeKPGTR4/m/LECAuIGcBgAJ Signed-off-by: chenxiaosong (A) Signed-off-by: Dan Carpenter Signed-off-by: Hawkins Jiawei Reported-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com Tested-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com Cc: Anton Altaparmakov Cc: syzkaller-bugs Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman Fixes: CVE-2023-26607 Signed-off-by: Jingbo Xu Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1779 --- fs/ntfs/attrib.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 62b49197e5f6..122a9b45b350 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -608,11 +608,23 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { u8 *mrec_end = (u8 *)ctx->mrec + le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || - name_end > mrec_end) + u8 *name_end; + + /* check whether ATTR_RECORD wrap */ + if ((u8 *)a < (u8 *)ctx->mrec) + break; + + /* check whether Attribute Record Header is within bounds */ + if ((u8 *)a > mrec_end || + (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) break; + + /* check whether ATTR_RECORD's name is within bounds */ + name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if (name_end > mrec_end) + break; + ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || a->type == AT_END)) -- Gitee From 184f3bf1fb6688e0d1552c31b59caf25c333edff Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:41:50 +0800 Subject: [PATCH 06/58] net/sched: Retire tcindex classifier ANBZ: #4784 commit 8c710f75256bb3cf05ac7b1672c82b92c43f3d28 upstream. commit 8c710f75256bb3cf05ac7b1672c82b92c43f3d28 upstream. The tcindex classifier has served us well for about a quarter of a century but has not been getting much TLC due to lack of known users. Most recently it has become easy prey to syzkaller. For this reason, we are retiring it. Signed-off-by: Jamal Hadi Salim Acked-by: Jiri Pirko Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman Signed-off-by: Qiao Ma [Fixes conflicts] Fixes: CVE-2023-1829 Signed-off-by: None <> Signed-off-by: Xiao Long Reviewed-by: Xuan Zhuo Link: https://gitee.com/anolis/cloud-kernel/pulls/1771 --- net/sched/Kconfig | 11 - net/sched/Makefile | 1 - net/sched/cls_tcindex.c | 715 ---------------------------------------- 3 files changed, 727 deletions(-) delete mode 100644 net/sched/cls_tcindex.c diff --git a/net/sched/Kconfig b/net/sched/Kconfig index e95741388311..4547022ed7f4 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -458,17 +458,6 @@ config NET_CLS_BASIC To compile this code as a module, choose M here: the module will be called cls_basic. -config NET_CLS_TCINDEX - tristate "Traffic-Control Index (TCINDEX)" - select NET_CLS - ---help--- - Say Y here if you want to be able to classify packets based on - traffic control indices. You will want this feature if you want - to implement Differentiated Services together with DSMARK. - - To compile this code as a module, choose M here: the - module will be called cls_tcindex. - config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" depends on INET diff --git a/net/sched/Makefile b/net/sched/Makefile index f0403f49edcb..5eed580cdb42 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -62,7 +62,6 @@ obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o -obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c deleted file mode 100644 index 7735404da572..000000000000 --- a/net/sched/cls_tcindex.c +++ /dev/null @@ -1,715 +0,0 @@ -/* - * net/sched/cls_tcindex.c Packet classifier for skb->tc_index - * - * Written 1998,1999 by Werner Almesberger, EPFL ICA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Passing parameters to the root seems to be done more awkwardly than really - * necessary. At least, u32 doesn't seem to use such dirty hacks. To be - * verified. FIXME. - */ - -#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ -#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ - - -struct tcindex_filter_result { - struct tcf_exts exts; - struct tcf_result res; - struct rcu_work rwork; -}; - -struct tcindex_filter { - u16 key; - struct tcindex_filter_result result; - struct tcindex_filter __rcu *next; - struct rcu_work rwork; -}; - - -struct tcindex_data { - struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ - struct tcindex_filter __rcu **h; /* imperfect hash; */ - struct tcf_proto *tp; - u16 mask; /* AND key with mask */ - u32 shift; /* shift ANDed key to the right */ - u32 hash; /* hash table size; 0 if undefined */ - u32 alloc_hash; /* allocated size */ - u32 fall_through; /* 0: only classify if explicit match */ - struct rcu_work rwork; -}; - -static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) -{ - return tcf_exts_has_actions(&r->exts) || r->res.classid; -} - -static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, - u16 key) -{ - if (p->perfect) { - struct tcindex_filter_result *f = p->perfect + key; - - return tcindex_filter_is_set(f) ? f : NULL; - } else if (p->h) { - struct tcindex_filter __rcu **fp; - struct tcindex_filter *f; - - fp = &p->h[key % p->hash]; - for (f = rcu_dereference_bh_rtnl(*fp); - f; - fp = &f->next, f = rcu_dereference_bh_rtnl(*fp)) - if (f->key == key) - return &f->result; - } - - return NULL; -} - - -static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - struct tcindex_data *p = rcu_dereference_bh(tp->root); - struct tcindex_filter_result *f; - int key = (skb->tc_index & p->mask) >> p->shift; - - pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n", - skb, tp, res, p); - - f = tcindex_lookup(p, key); - if (!f) { - struct Qdisc *q = tcf_block_q(tp->chain->block); - - if (!p->fall_through) - return -1; - res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key); - res->class = 0; - pr_debug("alg 0x%x\n", res->classid); - return 0; - } - *res = f->res; - pr_debug("map 0x%x\n", res->classid); - - return tcf_exts_exec(skb, &f->exts, res); -} - - -static void *tcindex_get(struct tcf_proto *tp, u32 handle) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r; - - pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); - if (p->perfect && handle >= p->alloc_hash) - return NULL; - r = tcindex_lookup(p, handle); - return r && tcindex_filter_is_set(r) ? r : NULL; -} - -static int tcindex_init(struct tcf_proto *tp) -{ - struct tcindex_data *p; - - pr_debug("tcindex_init(tp %p)\n", tp); - p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL); - if (!p) - return -ENOMEM; - - p->mask = 0xffff; - p->hash = DEFAULT_HASH_SIZE; - p->fall_through = 1; - - rcu_assign_pointer(tp->root, p); - return 0; -} - -static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) -{ - tcf_exts_destroy(&r->exts); - tcf_exts_put_net(&r->exts); -} - -static void tcindex_destroy_rexts_work(struct work_struct *work) -{ - struct tcindex_filter_result *r; - - r = container_of(to_rcu_work(work), - struct tcindex_filter_result, - rwork); - rtnl_lock(); - __tcindex_destroy_rexts(r); - rtnl_unlock(); -} - -static void __tcindex_destroy_fexts(struct tcindex_filter *f) -{ - tcf_exts_destroy(&f->result.exts); - tcf_exts_put_net(&f->result.exts); - kfree(f); -} - -static void tcindex_destroy_fexts_work(struct work_struct *work) -{ - struct tcindex_filter *f = container_of(to_rcu_work(work), - struct tcindex_filter, - rwork); - - rtnl_lock(); - __tcindex_destroy_fexts(f); - rtnl_unlock(); -} - -static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, - struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = arg; - struct tcindex_filter __rcu **walk; - struct tcindex_filter *f = NULL; - - pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p); - if (p->perfect) { - if (!r->res.class) - return -ENOENT; - } else { - int i; - - for (i = 0; i < p->hash; i++) { - walk = p->h + i; - for (f = rtnl_dereference(*walk); f; - walk = &f->next, f = rtnl_dereference(*walk)) { - if (&f->result == r) - goto found; - } - } - return -ENOENT; - -found: - rcu_assign_pointer(*walk, rtnl_dereference(f->next)); - } - tcf_unbind_filter(tp, &r->res); - /* all classifiers are required to call tcf_exts_destroy() after rcu - * grace period, since converted-to-rcu actions are relying on that - * in cleanup() callback - */ - if (f) { - if (tcf_exts_get_net(&f->result.exts)) - tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work); - else - __tcindex_destroy_fexts(f); - } else { - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - - *last = false; - return 0; -} - -static void tcindex_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - kfree(p->perfect); - kfree(p->h); - kfree(p); -} - -static inline int -valid_perfect_hash(struct tcindex_data *p) -{ - return p->hash > (p->mask >> p->shift); -} - -static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { - [TCA_TCINDEX_HASH] = { .type = NLA_U32 }, - [TCA_TCINDEX_MASK] = { .type = NLA_U16 }, - [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 }, - [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 }, - [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, -}; - -static int tcindex_filter_result_init(struct tcindex_filter_result *r) -{ - memset(r, 0, sizeof(*r)); - return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); -} - -static void tcindex_partial_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - kfree(p->perfect); - kfree(p); -} - -static void tcindex_free_perfect_hash(struct tcindex_data *cp) -{ - int i; - - for (i = 0; i < cp->hash; i++) - tcf_exts_destroy(&cp->perfect[i].exts); - kfree(cp->perfect); -} - -static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) -{ - int i, err = 0; - - cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), - GFP_KERNEL); - if (!cp->perfect) - return -ENOMEM; - - for (i = 0; i < cp->hash; i++) { - err = tcf_exts_init(&cp->perfect[i].exts, - TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - goto errout; -#ifdef CONFIG_NET_CLS_ACT - cp->perfect[i].exts.net = net; -#endif - } - - return 0; - -errout: - tcindex_free_perfect_hash(cp); - return err; -} - -static int -tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, - u32 handle, struct tcindex_data *p, - struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) -{ - struct tcindex_filter_result new_filter_result, *old_r = r; - struct tcindex_data *cp = NULL, *oldp; - struct tcindex_filter *f = NULL; /* make gcc behave */ - struct tcf_result cr = {}; - int err, balloc = 0; - struct tcf_exts e; - bool update_h = false; - - err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - return err; - err = tcf_exts_validate(net, tp, tb, est, &e, ovr, extack); - if (err < 0) - goto errout; - - err = -ENOMEM; - /* tcindex_data attributes must look atomic to classifier/lookup so - * allocate new tcindex data and RCU assign it onto root. Keeping - * perfect hash and hash pointers from old data. - */ - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) - goto errout; - - cp->mask = p->mask; - cp->shift = p->shift; - cp->hash = p->hash; - cp->alloc_hash = p->alloc_hash; - cp->fall_through = p->fall_through; - cp->tp = tp; - - if (p->perfect) { - int i; - - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout; - for (i = 0; i < cp->hash; i++) - cp->perfect[i].res = p->perfect[i].res; - balloc = 1; - } - cp->h = p->h; - - err = tcindex_filter_result_init(&new_filter_result); - if (err < 0) - goto errout1; - if (old_r) - cr = r->res; - - if (tb[TCA_TCINDEX_HASH]) - cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - - if (tb[TCA_TCINDEX_MASK]) - cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - - if (tb[TCA_TCINDEX_SHIFT]) - cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - - err = -EBUSY; - - /* Hash already allocated, make sure that we still meet the - * requirements for the allocated hash. - */ - if (cp->perfect) { - if (!valid_perfect_hash(cp) || - cp->hash > cp->alloc_hash) - goto errout_alloc; - } else if (cp->h && cp->hash != cp->alloc_hash) { - goto errout_alloc; - } - - err = -EINVAL; - if (tb[TCA_TCINDEX_FALL_THROUGH]) - cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - - if (!cp->hash) { - /* Hash not specified, use perfect hash if the upper limit - * of the hashing index is below the threshold. - */ - if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) - cp->hash = (cp->mask >> cp->shift) + 1; - else - cp->hash = DEFAULT_HASH_SIZE; - } - - if (!cp->perfect && !cp->h) - cp->alloc_hash = cp->hash; - - /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) - * but then, we'd fail handles that may become valid after some future - * mask change. While this is extremely unlikely to ever matter, - * the check below is safer (and also more backwards-compatible). - */ - if (cp->perfect || valid_perfect_hash(cp)) - if (handle >= cp->alloc_hash) - goto errout_alloc; - - - err = -ENOMEM; - if (!cp->perfect && !cp->h) { - if (valid_perfect_hash(cp)) { - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout_alloc; - balloc = 1; - } else { - struct tcindex_filter __rcu **hash; - - hash = kcalloc(cp->hash, - sizeof(struct tcindex_filter *), - GFP_KERNEL); - - if (!hash) - goto errout_alloc; - - cp->h = hash; - balloc = 2; - } - } - - if (cp->perfect) { - r = cp->perfect + handle; - } else { - /* imperfect area is updated in-place using rcu */ - update_h = !!tcindex_lookup(cp, handle); - r = &new_filter_result; - } - - if (r == &new_filter_result) { - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (!f) - goto errout_alloc; - f->key = handle; - f->next = NULL; - err = tcindex_filter_result_init(&f->result); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - - if (tb[TCA_TCINDEX_CLASSID]) { - cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); - tcf_bind_filter(tp, &cr, base); - } - - if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - - oldp = p; - r->res = cr; - tcf_exts_change(&r->exts, &e); - - rcu_assign_pointer(tp->root, cp); - - if (update_h) { - struct tcindex_filter __rcu **fp; - struct tcindex_filter *cf; - - f->result.res = r->res; - tcf_exts_change(&f->result.exts, &r->exts); - - /* imperfect area bucket */ - fp = cp->h + (handle % cp->hash); - - /* lookup the filter, guaranteed to exist */ - for (cf = rcu_dereference_bh_rtnl(*fp); cf; - fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp)) - if (cf->key == handle) - break; - - f->next = cf->next; - - cf = rcu_replace_pointer(*fp, f, 1); - tcf_exts_get_net(&cf->result.exts); - tcf_queue_work(&cf->rwork, tcindex_destroy_fexts_work); - } else if (r == &new_filter_result) { - struct tcindex_filter *nfp; - struct tcindex_filter __rcu **fp; - - f->result.res = r->res; - tcf_exts_change(&f->result.exts, &r->exts); - - fp = cp->h + (handle % cp->hash); - for (nfp = rtnl_dereference(*fp); - nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) - ; /* nothing */ - - rcu_assign_pointer(*fp, f); - } else { - tcf_exts_destroy(&new_filter_result.exts); - } - - if (oldp) - tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work); - return 0; - -errout_alloc: - if (balloc == 1) - tcindex_free_perfect_hash(cp); - else if (balloc == 2) - kfree(cp->h); -errout1: - tcf_exts_destroy(&new_filter_result.exts); -errout: - kfree(cp); - tcf_exts_destroy(&e); - return err; -} - -static int -tcindex_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - struct netlink_ext_ack *extack) -{ - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = *arg; - int err; - - pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," - "p %p,r %p,*arg %p\n", - tp, handle, tca, arg, opt, p, r, arg ? *arg : NULL); - - if (!opt) - return 0; - - err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy, NULL); - if (err < 0) - return err; - - return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], ovr, extack); -} - -static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter *f, *next; - int i; - - pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p); - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - if (!p->perfect[i].res.class) - continue; - if (walker->count >= walker->skip) { - if (walker->fn(tp, p->perfect + i, walker) < 0) { - walker->stop = 1; - return; - } - } - walker->count++; - } - } - if (!p->h) - return; - for (i = 0; i < p->hash; i++) { - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - if (walker->count >= walker->skip) { - if (walker->fn(tp, &f->result, walker) < 0) { - walker->stop = 1; - return; - } - } - walker->count++; - } - } -} - -static void tcindex_destroy(struct tcf_proto *tp, - struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - int i; - - pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); - - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - struct tcindex_filter_result *r = p->perfect + i; - - tcf_unbind_filter(tp, &r->res); - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, - tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - } - - for (i = 0; p->h && i < p->hash; i++) { - struct tcindex_filter *f, *next; - bool last; - - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - tcindex_delete(tp, &f->result, &last, NULL); - } - } - - tcf_queue_work(&p->rwork, tcindex_destroy_work); -} - - -static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = fh; - struct nlattr *nest; - - pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n", - tp, fh, skb, t, p, r); - pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); - - nest = nla_nest_start(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (!fh) { - t->tcm_handle = ~0; /* whatever ... */ - if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || - nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || - nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || - nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) - goto nla_put_failure; - nla_nest_end(skb, nest); - } else { - if (p->perfect) { - t->tcm_handle = r - p->perfect; - } else { - struct tcindex_filter *f; - struct tcindex_filter __rcu **fp; - int i; - - t->tcm_handle = 0; - for (i = 0; !t->tcm_handle && i < p->hash; i++) { - fp = &p->h[i]; - for (f = rtnl_dereference(*fp); - !t->tcm_handle && f; - fp = &f->next, f = rtnl_dereference(*fp)) { - if (&f->result == r) - t->tcm_handle = f->key; - } - } - } - pr_debug("handle = %d\n", t->tcm_handle); - if (r->res.class && - nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) - goto nla_put_failure; - - if (tcf_exts_dump(skb, &r->exts) < 0) - goto nla_put_failure; - nla_nest_end(skb, nest); - - if (tcf_exts_dump_stats(skb, &r->exts) < 0) - goto nla_put_failure; - } - - return skb->len; - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl) -{ - struct tcindex_filter_result *r = fh; - - if (r && r->res.classid == classid) - r->res.class = cl; -} - -static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { - .kind = "tcindex", - .classify = tcindex_classify, - .init = tcindex_init, - .destroy = tcindex_destroy, - .get = tcindex_get, - .change = tcindex_change, - .delete = tcindex_delete, - .walk = tcindex_walk, - .dump = tcindex_dump, - .bind_class = tcindex_bind_class, - .owner = THIS_MODULE, -}; - -static int __init init_tcindex(void) -{ - return register_tcf_proto_ops(&cls_tcindex_ops); -} - -static void __exit exit_tcindex(void) -{ - unregister_tcf_proto_ops(&cls_tcindex_ops); -} - -module_init(init_tcindex) -module_exit(exit_tcindex) -MODULE_LICENSE("GPL"); -- Gitee From 505750721c37515943dbdc239c9d609533126337 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:41:51 +0800 Subject: [PATCH 07/58] kthread: Switch to cpu_possible_mask ANBZ: #5581 commit 043eb8e1051143a24811e6f35c276e35ae8247b6 upstream. Next patch will switch unbound kernel threads mask to housekeeping_cpumask(), a subset of cpu_possible_mask. So in order to ease bisection, lets first switch kthreads default affinity from cpu_all_mask to cpu_possible_mask. It looks safe to do so as cpu_possible_mask seem to be initialized at setup_arch() time, way before kthreadd is created. Suggested-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Marcelo Tosatti Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200527142909.23372-2-frederic@kernel.org Signed-off-by: Cruz Zhao Reviewed-by: Tianchen Ding Reviewed-by: Artie Ding Link: https://gitee.com/anolis/cloud-kernel/pulls/1775 --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index fcb3a1a6e14b..5ddea08da19e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -339,7 +339,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, cpu_all_mask); + set_cpus_allowed_ptr(task, cpu_possible_mask); } kfree(create); return task; @@ -564,7 +564,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_cpus_allowed_ptr(tsk, cpu_possible_mask); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; -- Gitee From 311ced57b84347bcbc79996bd1f75fb21995b613 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:41:52 +0800 Subject: [PATCH 08/58] isolcpus: Affine unbound kernel threads to housekeeping cpus ANBZ: #5581 commit 9cc5b8656892a72438ee7deb5e80f5be47643b8b upstream. This is a kernel enhancement that configures the cpu affinity of kernel threads via kernel boot option nohz_full=. When this option is specified, the cpumask is immediately applied upon kthread launch. This does not affect kernel threads that specify cpu and node. This allows CPU isolation (that is not allowing certain threads to execute on certain CPUs) without using the isolcpus=domain parameter, making it possible to enable load balancing on such CPUs during runtime (see kernel-parameters.txt). Note-1: this is based off on Wind River's patch at https://github.com/starlingx-staging/stx-integ/blob/master/kernel/kernel-std/centos/patches/affine-compute-kernel-threads.patch Difference being that this patch is limited to modifying kernel thread cpumask. Behaviour of other threads can be controlled via cgroups or sched_setaffinity. Note-2: Wind River's patch was based off Christoph Lameter's patch at https://lwn.net/Articles/565932/ with the only difference being the kernel parameter changed from kthread to kthread_cpus. Signed-off-by: Frederic Weisbecker Signed-off-by: Marcelo Tosatti Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200527142909.23372-3-frederic@kernel.org Signed-off-by: Cruz Zhao Reviewed-by: Tianchen Ding Reviewed-by: Artie Ding Link: https://gitee.com/anolis/cloud-kernel/pulls/1775 --- include/linux/sched/isolation.h | 1 + kernel/kthread.c | 6 ++++-- kernel/sched/isolation.c | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 30c84ba31a52..6d10cbc6d605 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -13,6 +13,7 @@ enum hk_flags { HK_FLAG_TICK = (1 << 4), HK_FLAG_DOMAIN = (1 << 5), HK_FLAG_WQ = (1 << 6), + HK_FLAG_KTHREAD = (1 << 8), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/kernel/kthread.c b/kernel/kthread.c index 5ddea08da19e..d18066d51bca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -21,6 +21,7 @@ #include #include #include +#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -339,7 +340,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, cpu_possible_mask); + set_cpus_allowed_ptr(task, + housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; @@ -564,7 +566,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_possible_mask); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index da0ac89cc382..0409a879ae5f 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -174,7 +174,8 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | + HK_FLAG_MISC | HK_FLAG_KTHREAD; return housekeeping_setup(str, flags); } -- Gitee From a7b619149c4a5fd11df70f2de50cd79eddee06b7 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:41:53 +0800 Subject: [PATCH 09/58] net/sched: flower: fix possible OOB write in fl_set_geneve_opt() ANBZ: #5616 commit 4d56304e5827c8cc8cc18c75343d283af7c4825c upstream. If we send two TCA_FLOWER_KEY_ENC_OPTS_GENEVE packets and their total size is 252 bytes(key->enc_opts.len = 252) then key->enc_opts.len = opt->length = data_len / 4 = 0 when the third TCA_FLOWER_KEY_ENC_OPTS_GENEVE packet enters fl_set_geneve_opt. This bypasses the next bounds check and results in an out-of-bounds. Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options") Signed-off-by: Hangyu Hua Reviewed-by: Simon Horman Reviewed-by: Pieter Jansen van Vuuren Link: https://lore.kernel.org/r/20230531102805.27090-1-hbh25y@gmail.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Fixes: CVE-2023-35788 Signed-off-by: Xiao Long Reviewed-by: Xuan Zhuo Link: https://gitee.com/anolis/cloud-kernel/pulls/1786 --- net/sched/cls_flower.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 09b359784629..789517fde758 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -632,6 +632,9 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, if (option_len > sizeof(struct geneve_opt)) data_len = option_len - sizeof(struct geneve_opt); + if (key->enc_opts.len > FLOW_DIS_TUN_OPTS_MAX - 4) + return -ERANGE; + opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len]; memset(opt, 0xff, option_len); opt->length = data_len / 4; -- Gitee From 0ddd3ccb1873c60dfc8b49c9dd50c871f9ce978e Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:41:54 +0800 Subject: [PATCH 10/58] anolis: fuse: fix array-index-out-of-bounds in fuse_update_stats() ANBZ: #5621 The following array-index-out-of-bounds is caught on cuse: UBSAN: array-index-out-of-bounds in fs/fuse/dev.c:284:46 index 4096 is out of range for type 'atomic64_t [50]' Call trace: dump_backtrace+0x0/0x3b0 show_stack+0x1c/0x2c dump_stack+0x144/0x1b8 ubsan_epilogue+0xc/0x4c __ubsan_handle_out_of_bounds+0x78/0x80 fuse_update_stats+0x148/0x164 [fuse] fuse_request_end+0x174/0x780 [fuse] end_requests+0x128/0x1d0 [fuse] fuse_abort_conn+0x7a8/0x9f0 [fuse] fuse_dev_release+0x220/0x3e0 [fuse] cuse_channel_release+0x1d0/0x270 [cuse] __fput+0x1ac/0x6b4 ____fput+0x14/0x20 task_work_run+0xd8/0x1f0 do_exit+0x530/0xc9c do_group_exit+0xd0/0x240 __arm64_sys_exit_group+0x40/0x50 el0_svc_common+0x154/0x520 do_el0_svc+0xac/0xd4 el0_svc+0x1c/0x30 el0_sync_handler+0xa8/0xac el0_sync+0x168/0x180 This is because the cuse device node shares the same file_operations, i.e. fuse_dev_operations, with the fuse device node, and fuse_update_stats() is called on CUSE_INIT (opcode 4096) message. Fix this by only allowing fuse_update_stats() on normal fuse messages. Fixes: a88ac284ae44 ("anolis: fuse: add fusectl interface to stats") Signed-off-by: Jingbo Xu Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1790 --- fs/fuse/dev.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 893fcf28e308..7dff2da7434d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -411,17 +411,20 @@ static void flush_bg_queue(struct fuse_conn *fc) } } -static void fuse_update_stats(struct fuse_conn *fc, int opcode, uint64_t send_time) +static void fuse_update_stats(struct fuse_conn *fc, struct fuse_req *req) { + unsigned int opcode = req->in.h.opcode; uint64_t delta_time; - delta_time = get_time_now_us() - send_time; + if (opcode < FUSE_OP_MAX) { + delta_time = get_time_now_us() - req->send_time; - atomic64_add(delta_time, &fc->stats.req_time[FUSE_SUMMARY]); - atomic64_add(delta_time, &fc->stats.req_time[opcode]); + atomic64_add(delta_time, &fc->stats.req_time[FUSE_SUMMARY]); + atomic64_add(delta_time, &fc->stats.req_time[opcode]); - atomic64_inc(&fc->stats.req_cnts[FUSE_SUMMARY]); - atomic64_inc(&fc->stats.req_cnts[opcode]); + atomic64_inc(&fc->stats.req_cnts[FUSE_SUMMARY]); + atomic64_inc(&fc->stats.req_cnts[opcode]); + } } /* @@ -445,7 +448,7 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { - fuse_update_stats(fc, req->in.h.opcode, req->send_time); + fuse_update_stats(fc, req); spin_lock(&fc->bg_lock); clear_bit(FR_BACKGROUND, &req->flags); @@ -563,7 +566,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); - fuse_update_stats(fc, req->in.h.opcode, req->send_time); + fuse_update_stats(fc, req); } } -- Gitee From 75728666b9b34a6a2f8972bc4bc7905c46c0d18e Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:34 +0800 Subject: [PATCH 11/58] virtio-blk: Don't use MAX_DISCARD_SEGMENTS if max_discard_seg is zero ANBZ: #5630 commit dacc73ed0b88f1a787ec20385f42ca9dd9eddcd0 upstream Currently the value of max_discard_segment will be set to MAX_DISCARD_SEGMENTS (256) with no basis in hardware if device set 0 to max_discard_seg in configuration space. It's incorrect since the device might not be able to handle such large descriptors. To fix it, let's follow max_segments restrictions in this case. Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support") Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20220304100058.116-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Fupan Li Reviewed-by: Jingbo Xu Acked-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1794 --- drivers/block/virtio_blk.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 91e2487ba6de..cd6d9fed1707 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -976,9 +976,15 @@ static int virtblk_probe(struct virtio_device *vdev) virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, &v); + + /* + * max_discard_seg == 0 is out of spec but we always + * handled it. + */ + if (!v) + v = sg_elems - 2; blk_queue_max_discard_segments(q, - min_not_zero(v, - MAX_DISCARD_SEGMENTS)); + min(v, MAX_DISCARD_SEGMENTS)); blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } -- Gitee From 98dac39f634a9534088fb0265365b44595ef9c30 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:35 +0800 Subject: [PATCH 12/58] virtio_blk: fix the discard_granularity and discard_alignment queue limits ANBZ: #5630 commit 62952cc5bccd89b76d710de1d0b43244af0f2903 upstream The discard_alignment queue limit is named a bit misleading means the offset into the block device at which the discard granularity starts. On the other hand the discard_sector_alignment from the virtio 1.1 looks similar to what Linux uses as discard granularity (even if not very well described): "discard_sector_alignment can be used by OS when splitting a request based on alignment. " And at least qemu does set it to the discard granularity. So stop setting the discard_alignment and use the virtio discard_sector_alignment to set the discard granularity. Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-5-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Fupan Li Reviewed-by: Jingbo Xu Acked-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1794 --- drivers/block/virtio_blk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index cd6d9fed1707..57dc29b09f74 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -964,11 +964,12 @@ static int virtblk_probe(struct virtio_device *vdev) blk_queue_io_opt(q, blk_size * opt_io_size); if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { - q->limits.discard_granularity = blk_size; - virtio_cread(vdev, struct virtio_blk_config, discard_sector_alignment, &v); - q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0; + if (v) + q->limits.discard_granularity = v << SECTOR_SHIFT; + else + q->limits.discard_granularity = blk_size; virtio_cread(vdev, struct virtio_blk_config, max_discard_sectors, &v); -- Gitee From 57256e903000bbeea871aa0254b0aced4216b2b0 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:36 +0800 Subject: [PATCH 13/58] anolis: virtio-blk: skip BUG_ON() in virtio_queue_rq() for discard command ANBZ: #5630 Currently we have a BUG_ON() to make sure the number of sg list does not exceed queue_max_segments() in virtio_queue_rq(). However, the block layer uses queue_max_discard_segments() instead of queue_max_segments() to limit the sg list for discard requests. So the BUG_ON() might be triggered if virtio-blk device reports a larger value for max discard segment than queue_max_segments(). It is fixed with upstream commit e030759a1ddc ("virtio-blk: Remove BUG_ON() in virtio_queue_rq()"). In order to reduce the code conflicts introduced by the prerequisite commit when backporting this fix, let's simply skip this BUG_ON() check for discard command. Signed-off-by: Fupan Li Reviewed-by: Jingbo Xu Acked-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1794 --- drivers/block/virtio_blk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 57dc29b09f74..500d392f0e7e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -304,7 +304,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, bool unmap = false; u32 type; - BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); + if (req_op(req) != REQ_OP_DISCARD) + BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); switch (req_op(req)) { case REQ_OP_READ: -- Gitee From 8a390bd08410f99a4e3312e9db721e87a8d3d603 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:37 +0800 Subject: [PATCH 14/58] anolis: Revert "anolis: erofs,ovl: enable RCU'd ->get_acl()" ANBZ: #5670 This reverts commit 5943818023a821fa546e73c0bedc05a34299d910, where the optimization is enabled only when lowerdirs are upon limited fs type. Later we will cherry-pick the upstream commit 332f606b32b6 ("ovl: enable RCU'd ->get_acl()") to re-enable this optimization. Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Acked-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1802 --- fs/erofs/super.c | 2 -- fs/overlayfs/inode.c | 11 +++-------- fs/overlayfs/ovl_entry.h | 1 - fs/overlayfs/super.c | 1 - fs/posix_acl.c | 13 +------------ include/linux/fs.h | 6 ------ include/linux/posix_acl.h | 3 +-- 7 files changed, 5 insertions(+), 32 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 29f1b22826bc..3f014bffc9dd 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -757,8 +757,6 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent) else sb->s_iflags &= ~SB_I_OVL_OPT_CREDS; - sb->s_iflags |= SB_I_OVL_OPT_ACL_RCU; - #ifdef CONFIG_EROFS_FS_ZIP INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); #endif diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b7f9ad289ab7..6c42823251fe 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "overlayfs.h" @@ -450,19 +449,15 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) { struct inode *realinode = ovl_inode_real(inode); - struct ovl_fs *ofs = inode->i_sb->s_fs_info; const struct cred *old_cred; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; - if (rcu) { - if (!ofs->config.opt_acl_rcu) - return ERR_PTR(-ECHILD); - return get_cached_acl_rcu(realinode, type); - } - old_cred = ovl_override_creds_opt(inode->i_sb); acl = get_acl(realinode, type); ovl_revert_creds(old_cred); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index ff76cda8e1be..6f8aaf4fdb9b 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -23,7 +23,6 @@ struct ovl_config { bool userxattr; bool ovl_volatile; bool opt_creds; - bool opt_acl_rcu; }; struct ovl_sb { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index c30ba99a4ff7..bbac42ea331d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1914,7 +1914,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= SB_POSIXACL; ofs->config.opt_creds = ovl_config_opt(ofs, SB_I_OVL_OPT_CREDS); - ofs->config.opt_acl_rcu = ovl_config_opt(ofs, SB_I_OVL_OPT_ACL_RCU); err = -ENOMEM; root_dentry = ovl_get_root(sb, upperpath.dentry, oe); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 2c889d2ff030..5078aa71bde8 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -21,7 +21,6 @@ #include #include #include -#include static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -56,17 +55,7 @@ EXPORT_SYMBOL(get_cached_acl); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) { - struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type)); - - if (acl == ACL_DONT_CACHE) { - struct posix_acl *ret; - - ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU); - if (!IS_ERR(ret)) - acl = ret; - } - - return acl; + return rcu_dereference(*acl_by_type(inode, type)); } EXPORT_SYMBOL(get_cached_acl_rcu); diff --git a/include/linux/fs.h b/include/linux/fs.h index 34e15e9dd6f5..0308d24a5c59 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -593,11 +593,6 @@ static inline void mapping_allow_writable(struct address_space *mapping) struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) -/* - * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to - * cache the ACL. This also means that ->get_acl() can be called in RCU mode - * with the LOOKUP_RCU flag. - */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * @@ -1396,7 +1391,6 @@ extern int send_sigurg(struct fown_struct *fown); /* hint from lowerfs for overlayfs optimizations (e.g. for container scenarios) */ #define SB_I_OVL_OPT_CREDS 0x40000000 /* bypass [override|revert]_creds */ -#define SB_I_OVL_OPT_ACL_RCU 0x80000000 /* enable RCU'd ACL */ /* Possible states of 'frozen' field */ enum { diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 0c842d07091a..540595a321a7 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -71,8 +71,6 @@ extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); -struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); - #ifdef CONFIG_FS_POSIX_ACL extern int posix_acl_chmod(struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, @@ -83,6 +81,7 @@ extern int simple_set_acl(struct inode *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); -- Gitee From 8e63db438a5b3bc2676a2ae5028503d1555e6513 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:39 +0800 Subject: [PATCH 15/58] anolis: Revert "anolis: erofs,ovl: bypass [override|revert]_creds for overlayfs" ANBZ: #5670 This reverts commit ca92036c0712a2bf70289e0a28307359881a5c46. The current implementation is risky and may cause copy-up failure in specific scenarios with "permission denied". As this optimization is not practically used, revert it for now. Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Acked-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1802 --- fs/erofs/internal.h | 5 ----- fs/erofs/super.c | 25 ------------------------- fs/overlayfs/inode.c | 8 ++++---- fs/overlayfs/overlayfs.h | 2 -- fs/overlayfs/ovl_entry.h | 1 - fs/overlayfs/super.c | 18 ------------------ fs/overlayfs/util.c | 15 --------------- include/linux/fs.h | 3 --- 8 files changed, 4 insertions(+), 73 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1ef0dfb3d208..89eb85b94930 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -147,11 +147,6 @@ struct erofs_sb_info { /* Mount flags set via mount options or defaults */ #define EROFS_MOUNT_XATTR_USER 0x00000010 #define EROFS_MOUNT_POSIX_ACL 0x00000020 -/* - * Bypass [override_creds|revert_creds] for overlayfs - * when erofs is mounted as lowerfs. - */ -#define EROFS_MOUNT_OPT_CREDS 0x80000000 #define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 3f014bffc9dd..e6ba90a0adc4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -382,8 +382,6 @@ enum { Opt_domain_id, Opt_bootstrap_path, Opt_blob_dir_path, - Opt_opt_creds_on, - Opt_opt_creds_off, Opt_err }; @@ -398,8 +396,6 @@ static match_table_t erofs_tokens = { {Opt_domain_id, "domain_id=%s"}, {Opt_bootstrap_path, "bootstrap_path=%s"}, {Opt_blob_dir_path, "blob_dir_path=%s"}, - {Opt_opt_creds_on, "opt_creds=on"}, - {Opt_opt_creds_off, "opt_creds=off"}, {Opt_err, NULL} }; @@ -514,12 +510,6 @@ static int erofs_parse_options(struct super_block *sb, char *options) return -ENOMEM; erofs_dbg("RAFS bootstrap_path %s", sbi->bootstrap_path); break; - case Opt_opt_creds_on: - set_opt(EROFS_SB(sb), OPT_CREDS); - break; - case Opt_opt_creds_off: - clear_opt(EROFS_SB(sb), OPT_CREDS); - break; default: erofs_err(sb, "Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; @@ -752,11 +742,6 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent) else sb->s_flags &= ~SB_POSIXACL; - if (test_opt(sbi, OPT_CREDS)) - sb->s_iflags |= SB_I_OVL_OPT_CREDS; - else - sb->s_iflags &= ~SB_I_OVL_OPT_CREDS; - #ifdef CONFIG_EROFS_FS_ZIP INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); #endif @@ -1037,11 +1022,6 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) if (sbi->domain_id) seq_printf(seq, ",domain_id=%s", sbi->domain_id); #endif - if (test_opt(sbi, OPT_CREDS)) - seq_puts(seq, ",opt_creds=on"); - else - seq_puts(seq, ",opt_creds=off"); - return 0; } @@ -1061,11 +1041,6 @@ static int erofs_remount(struct super_block *sb, int *flags, char *data) else sb->s_flags &= ~SB_POSIXACL; - if (test_opt(sbi, OPT_CREDS)) - sb->s_iflags |= SB_I_OVL_OPT_CREDS; - else - sb->s_iflags &= ~SB_I_OVL_OPT_CREDS; - *flags |= SB_RDONLY; return 0; out: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 6c42823251fe..25c2c3513347 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -299,7 +299,7 @@ int ovl_permission(struct inode *inode, int mask) if (err) return err; - old_cred = ovl_override_creds_opt(inode->i_sb); + old_cred = ovl_override_creds(inode->i_sb); if (!upperinode && !special_file(realinode->i_mode) && mask & MAY_WRITE) { mask &= ~(MAY_WRITE | MAY_APPEND); @@ -307,7 +307,7 @@ int ovl_permission(struct inode *inode, int mask) mask |= MAY_READ; } err = inode_permission(realinode, mask); - ovl_revert_creds(old_cred); + revert_creds(old_cred); return err; } @@ -458,9 +458,9 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; - old_cred = ovl_override_creds_opt(inode->i_sb); + old_cred = ovl_override_creds(inode->i_sb); acl = get_acl(realinode, type); - ovl_revert_creds(old_cred); + revert_creds(old_cred); return acl; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e932bd59fdad..08ee430e4f64 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -247,8 +247,6 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); -const struct cred *ovl_override_creds_opt(struct super_block *sb); -void ovl_revert_creds(const struct cred *oldcred); int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 6f8aaf4fdb9b..d9f5a4cd3938 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -22,7 +22,6 @@ struct ovl_config { bool metacopy; bool userxattr; bool ovl_volatile; - bool opt_creds; }; struct ovl_sb { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index bbac42ea331d..a79ff8f6afd8 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1774,22 +1774,6 @@ static struct dentry *ovl_get_root(struct super_block *sb, return root; } -static bool ovl_config_opt(struct ovl_fs *ofs, unsigned int opt_mask) -{ - int i; - bool opt = true; - - /* - * Enable the optimization for container scenarios if all lowerfs are - * configured with opt_mask. The optimization is disabled by default. - * ofs->numfs must be at least 2, thus the default "true" won't be - * returned without checking any lowerfs. - */ - for (i = 1; opt && i < ofs->numfs; i++) - opt = ofs->fs[i].sb->s_iflags & opt_mask; - return opt; -} - static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { }; @@ -1913,8 +1897,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = ofs; sb->s_flags |= SB_POSIXACL; - ofs->config.opt_creds = ovl_config_opt(ofs, SB_I_OVL_OPT_CREDS); - err = -ENOMEM; root_dentry = ovl_get_root(sb, upperpath.dentry, oe); if (!root_dentry) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 138e864c07c2..681a9a47975b 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -43,21 +43,6 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } -const struct cred *ovl_override_creds_opt(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - if (ofs->config.opt_creds) - return NULL; - return override_creds(ofs->creator_cred); -} - -void ovl_revert_creds(const struct cred *old_cred) -{ - if (old_cred) - revert_creds(old_cred); -} - /* * Check if underlying fs supports file handles and try to determine encoding * type, in order to deduce maximum inode number used by fs. diff --git a/include/linux/fs.h b/include/linux/fs.h index 0308d24a5c59..6a224d11c53f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1389,9 +1389,6 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 -/* hint from lowerfs for overlayfs optimizations (e.g. for container scenarios) */ -#define SB_I_OVL_OPT_CREDS 0x40000000 /* bypass [override|revert]_creds */ - /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ -- Gitee From 193b0613a695c66b7a76d502ece1d4f05997e2cc Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:40 +0800 Subject: [PATCH 16/58] ovl: enable RCU'd ->get_acl() ANBZ: #5670 commit 332f606b32b6291a944c8cf23b91f53a6e676525 upstream. Overlayfs does not cache ACL's (to avoid double caching). Instead it just calls the underlying filesystem's i_op->get_acl(), which will return the cached value, if possible. In rcu path walk, however, get_cached_acl_rcu() is employed to get the value from the cache, which will fail on overlayfs resulting in dropping out of rcu walk mode. This can result in a big performance hit in certain situations. Fix by calling ->get_acl() with rcu=true in case of ACL_DONT_CACHE (which indicates pass-through) Reported-by: garyhuang Signed-off-by: Miklos Szeredi Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Acked-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1802 --- fs/overlayfs/inode.c | 7 ++++--- fs/posix_acl.c | 13 ++++++++++++- include/linux/fs.h | 5 +++++ include/linux/posix_acl.h | 3 ++- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 25c2c3513347..477f6d521390 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "overlayfs.h" @@ -452,12 +453,12 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) const struct cred *old_cred; struct posix_acl *acl; - if (rcu) - return ERR_PTR(-ECHILD); - if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; + if (rcu) + return get_cached_acl_rcu(realinode, type); + old_cred = ovl_override_creds(inode->i_sb); acl = get_acl(realinode, type); revert_creds(old_cred); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 5078aa71bde8..2c889d2ff030 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -21,6 +21,7 @@ #include #include #include +#include static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -55,7 +56,17 @@ EXPORT_SYMBOL(get_cached_acl); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) { - return rcu_dereference(*acl_by_type(inode, type)); + struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type)); + + if (acl == ACL_DONT_CACHE) { + struct posix_acl *ret; + + ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU); + if (!IS_ERR(ret)) + acl = ret; + } + + return acl; } EXPORT_SYMBOL(get_cached_acl_rcu); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6a224d11c53f..5b7b7f31b215 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -593,6 +593,11 @@ static inline void mapping_allow_writable(struct address_space *mapping) struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) +/* + * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to + * cache the ACL. This also means that ->get_acl() can be called in RCU mode + * with the LOOKUP_RCU flag. + */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 540595a321a7..0c842d07091a 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -71,6 +71,8 @@ extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); + #ifdef CONFIG_FS_POSIX_ACL extern int posix_acl_chmod(struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, @@ -81,7 +83,6 @@ extern int simple_set_acl(struct inode *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); -struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); -- Gitee From 8ca7e8349093d1a094d2a2d514c5d0462982b255 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:41 +0800 Subject: [PATCH 17/58] net: add sock_init_data_uid() ANBZ: #4278 commit 584f3742890e966d2f0a1f3c418c9ead70b2d99e upstream. Add sock_init_data_uid() to explicitly initialize the socket uid. To initialise the socket uid, sock_init_data() assumes a the struct socket* sock is always embedded in a struct socket_alloc, used to access the corresponding inode uid. This may not be true. Examples are sockets created in tun_chr_open() and tap_open(). Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Signed-off-by: Pietro Borrello Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Fixes: CVE-2023-1076 Signed-off-by: Heng Qi Reviewed-by: Xuan Zhuo Link: https://gitee.com/anolis/cloud-kernel/pulls/1821 --- include/net/sock.h | 7 ++++++- net/core/sock.c | 15 ++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 7e39d72d0a90..9382a53c2f6f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1682,7 +1682,12 @@ void sk_common_release(struct sock *sk); * Default socket callbacks and setup code */ -/* Initialise core socket variables */ +/* Initialise core socket variables using an explicit uid. */ +void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid); + +/* Initialise core socket variables. + * Assumes struct socket *sock is embedded in a struct socket_alloc. + */ void sock_init_data(struct socket *sock, struct sock *sk); /* diff --git a/net/core/sock.c b/net/core/sock.c index 0b370f5cc23e..d4f24066eb12 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2769,7 +2769,7 @@ void sk_stop_timer(struct sock *sk, struct timer_list* timer) } EXPORT_SYMBOL(sk_stop_timer); -void sock_init_data(struct socket *sock, struct sock *sk) +void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; @@ -2788,11 +2788,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_type = sock->type; sk->sk_wq = sock->wq; sock->sk = sk; - sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { sk->sk_wq = NULL; - sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); } + sk->sk_uid = uid; rwlock_init(&sk->sk_callback_lock); if (sk->sk_kern_sock) @@ -2850,6 +2849,16 @@ void sock_init_data(struct socket *sock, struct sock *sk) refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } +EXPORT_SYMBOL(sock_init_data_uid); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + kuid_t uid = sock ? + SOCK_INODE(sock)->i_uid : + make_kuid(sock_net(sk)->user_ns, 0); + + sock_init_data_uid(sock, sk, uid); +} EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) -- Gitee From b0e0e7d39c512e34889b03e007cb76c41a935954 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:42 +0800 Subject: [PATCH 18/58] tun: tun_chr_open(): correctly initialize socket uid ANBZ: #4278 commit a096ccca6e503a5c575717ff8a36ace27510ab0a upstream. sock_init_data() assumes that the `struct socket` passed in input is contained in a `struct socket_alloc` allocated with sock_alloc(). However, tun_chr_open() passes a `struct socket` embedded in a `struct tun_file` allocated with sk_alloc(). This causes a type confusion when issuing a container_of() with SOCK_INODE() in sock_init_data() which results in assigning a wrong sk_uid to the `struct sock` in input. On default configuration, the type confused field overlaps with the high 4 bytes of `struct tun_struct __rcu *tun` of `struct tun_file`, NULL at the time of call, which makes the uid of all tun sockets 0, i.e., the root one. Fix the assignment by using sock_init_data_uid(). Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Signed-off-by: Pietro Borrello Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Fixes: CVE-2023-1076 Signed-off-by: Heng Qi Reviewed-by: Xuan Zhuo Link: https://gitee.com/anolis/cloud-kernel/pulls/1821 --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e1ac1c57089f..01758114b458 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3237,7 +3237,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; - sock_init_data(&tfile->socket, &tfile->sk); + sock_init_data_uid(&tfile->socket, &tfile->sk, inode->i_uid); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; -- Gitee From b146d3a41ee817194c3fc630e4ac166c7c84d285 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:43 +0800 Subject: [PATCH 19/58] tap: tap_open(): correctly initialize socket uid ANBZ: #4278 commit 66b2c338adce580dfce2199591e65e2bab889cff upstream. sock_init_data() assumes that the `struct socket` passed in input is contained in a `struct socket_alloc` allocated with sock_alloc(). However, tap_open() passes a `struct socket` embedded in a `struct tap_queue` allocated with sk_alloc(). This causes a type confusion when issuing a container_of() with SOCK_INODE() in sock_init_data() which results in assigning a wrong sk_uid to the `struct sock` in input. On default configuration, the type confused field overlaps with padding bytes between `int vnet_hdr_sz` and `struct tap_dev __rcu *tap` in `struct tap_queue`, which makes the uid of all tap sockets 0, i.e., the root one. Fix the assignment by using sock_init_data_uid(). Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Signed-off-by: Pietro Borrello Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Fixes: CVE-2023-1076 Signed-off-by: Heng Qi Reviewed-by: Xuan Zhuo Link: https://gitee.com/anolis/cloud-kernel/pulls/1821 --- drivers/net/tap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index f0f7cd977667..43012a195b71 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -525,7 +525,7 @@ static int tap_open(struct inode *inode, struct file *file) q->sock.state = SS_CONNECTED; q->sock.file = file; q->sock.ops = &tap_socket_ops; - sock_init_data(&q->sock, &q->sk); + sock_init_data_uid(&q->sock, &q->sk, inode->i_uid); q->sk.sk_write_space = tap_sock_write_space; q->sk.sk_destruct = tap_sock_destruct; q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; -- Gitee From f7e1de5053df504d0e570afaecb1b9b9f211d4b2 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:44 +0800 Subject: [PATCH 20/58] xfs: bulkstat should copy lastip whenever userspace supplies one ANBZ: #1095 commit f16fe3ecde6237256eeacb3cd582217ffe62c647 upstream When userspace passes in a @lastip pointer we should copy the results back, even if the @ocount pointer is NULL. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Ferry Meng Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1822 --- fs/xfs/xfs_ioctl.c | 13 ++++++------- fs/xfs/xfs_ioctl32.c | 13 ++++++------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d7f3c03432e6..b8df9945a9f0 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -800,14 +800,13 @@ xfs_ioc_bulkstat( if (error) return error; - if (bulkreq.ocount != NULL) { - if (copy_to_user(bulkreq.lastip, &inlast, - sizeof(xfs_ino_t))) - return -EFAULT; + if (bulkreq.lastip != NULL && + copy_to_user(bulkreq.lastip, &inlast, sizeof(xfs_ino_t))) + return -EFAULT; - if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) - return -EFAULT; - } + if (bulkreq.ocount != NULL && + copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -EFAULT; return 0; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index b044f7d36782..71a8874a579b 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -313,14 +313,13 @@ xfs_compat_ioc_bulkstat( if (error) return error; - if (bulkreq.ocount != NULL) { - if (copy_to_user(bulkreq.lastip, &inlast, - sizeof(xfs_ino_t))) - return -EFAULT; + if (bulkreq.lastip != NULL && + copy_to_user(bulkreq.lastip, &inlast, sizeof(xfs_ino_t))) + return -EFAULT; - if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) - return -EFAULT; - } + if (bulkreq.ocount != NULL && + copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -EFAULT; return 0; } -- Gitee From c5dfbe42618355531e8c289471a84a67df65b3ef Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:45 +0800 Subject: [PATCH 21/58] ovl: add splice file read write helper ANBZ: #1217 commit 1a980b8cbf0059a5308eea61522f232fd03002e2 upstream Now overlayfs falls back to use default file splice read and write, which is not compatiple with overlayfs, returning EFAULT. xfstests generic/591 can reproduce part of this. Tested this patch with xfstests auto group tests. Signed-off-by: Murphy Zhou Signed-off-by: Miklos Szeredi Signed-off-by: Ferry Meng Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1823 --- fs/overlayfs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index db3dd10632bf..e089d6f40986 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -12,6 +12,9 @@ #include #include #include +#include +#include +#include #include "overlayfs.h" struct ovl_aio_req { @@ -397,6 +400,48 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; } +static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t ret; + struct fd real; + const struct cred *old_cred; + + ret = ovl_real_fdget(in, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(in)->i_sb); + ret = generic_file_splice_read(real.file, ppos, pipe, len, flags); + revert_creds(old_cred); + + ovl_file_accessed(in); + fdput(real); + return ret; +} + +static ssize_t +ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fd real; + const struct cred *old_cred; + ssize_t ret; + + ret = ovl_real_fdget(out, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(out)->i_sb); + ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); + revert_creds(old_cred); + + ovl_file_accessed(out); + fdput(real); + return ret; +} + static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fd real; @@ -750,6 +795,8 @@ const struct file_operations ovl_file_operations = { .fadvise = ovl_fadvise, .unlocked_ioctl = ovl_ioctl, .compat_ioctl = ovl_compat_ioctl, + .splice_read = ovl_splice_read, + .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, .clone_file_range = ovl_clone_file_range, -- Gitee From c60efdd25a5a03b4474862850baf14f9b2b11a9f Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:45 +0800 Subject: [PATCH 22/58] locks: Use inode_is_open_for_write ANBZ: #1216 commit 052b8cfa4070caa53125cd589da0cfe744132a94 upstream Use the aptly named function rather than open coding it. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: Jeff Layton Signed-off-by: Ferry Meng Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1824 --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/locks.c b/fs/locks.c index 552476d6f6bb..d21cfa1305bf 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1646,7 +1646,7 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + if ((arg == F_RDLCK) && inode_is_open_for_write(inode)) return -EAGAIN; if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || -- Gitee From 7e977b7a7c07356c7b35a7f479c8e82c5d19d74e Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:46 +0800 Subject: [PATCH 23/58] locks: eliminate false positive conflicts for write lease ANBZ: #1216 commit 387e3746d01c34457d6a73688acd90428725070b upstream check_conflicting_open() is checking for existing fd's open for read or for write before allowing to take a write lease. The check that was implemented using i_count and d_count is an approximation that has several false positives. For example, overlayfs since v4.19, takes an extra reference on the dentry; An open with O_PATH takes a reference on the dentry although the file cannot be read nor written. Change the implementation to use i_readcount and i_writecount to eliminate the false positive conflicts and allow a write lease to be taken on an overlayfs file. The change of behavior with existing fd's open with O_PATH is symmetric w.r.t. current behavior of lease breakers - an open with O_PATH currently does not break a write lease. This increases the size of struct inode by 4 bytes on 32bit archs when CONFIG_FILE_LOCKING is defined and CONFIG_IMA was not already defined. Signed-off-by: Amir Goldstein Signed-off-by: Jeff Layton Signed-off-by: Ferry Meng Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/1824 --- fs/locks.c | 46 +++++++++++++++++++++++++++++----------------- include/linux/fs.h | 4 ++-- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index d21cfa1305bf..21b05a053f33 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1627,10 +1627,10 @@ int fcntl_getlease(struct file *filp) } /** - * check_conflicting_open - see if the given dentry points to a file that has - * an existing open that would conflict with the - * desired lease. - * @dentry: dentry to check + * check_conflicting_open - see if the given file points to an inode that has + * an existing open that would conflict with the + * desired lease. + * @filp: file to check * @arg: type of lease that we're trying to acquire * @flags: current lock flags * @@ -1638,30 +1638,42 @@ int fcntl_getlease(struct file *filp) * conflict with the lease we're trying to set. */ static int -check_conflicting_open(const struct dentry *dentry, const long arg, int flags) +check_conflicting_open(struct file *filp, const long arg, int flags) { - int ret = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = locks_inode(filp); + int self_wcount = 0, self_rcount = 0; if (flags & FL_LAYOUT) return 0; - if ((arg == F_RDLCK) && inode_is_open_for_write(inode)) - return -EAGAIN; + if (arg == F_RDLCK) + return inode_is_open_for_write(inode) ? -EAGAIN : 0; + else if (arg != F_WRLCK) + return 0; + + /* + * Make sure that only read/write count is from lease requestor. + * Note that this will result in denying write leases when i_writecount + * is negative, which is what we want. (We shouldn't grant write leases + * on files open for execution.) + */ + if (filp->f_mode & FMODE_WRITE) + self_wcount = 1; + else if (filp->f_mode & FMODE_READ) + self_rcount = 1; - if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || - (atomic_read(&inode->i_count) > 1))) - ret = -EAGAIN; + if (atomic_read(&inode->i_writecount) != self_wcount || + atomic_read(&inode->i_readcount) != self_rcount) + return -EAGAIN; - return ret; + return 0; } static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) { struct file_lock *fl, *my_fl = NULL, *lease; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; @@ -1696,7 +1708,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - error = check_conflicting_open(dentry, arg, lease->fl_flags); + error = check_conflicting_open(filp, arg, lease->fl_flags); if (error) goto out; @@ -1753,7 +1765,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * precedes these checks. */ smp_mb(); - error = check_conflicting_open(dentry, arg, lease->fl_flags); + error = check_conflicting_open(filp, arg, lease->fl_flags); if (error) { locks_unlink_lock_ctx(lease); goto out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b7b7f31b215..12ab65298082 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -701,7 +701,7 @@ struct inode { atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; -#ifdef CONFIG_IMA +#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ @@ -2926,7 +2926,7 @@ static inline bool inode_is_open_for_write(const struct inode *inode) return atomic_read(&inode->i_writecount) > 0; } -#ifdef CONFIG_IMA +#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) static inline void i_readcount_dec(struct inode *inode) { BUG_ON(!atomic_read(&inode->i_readcount)); -- Gitee From 31e8bdb362a5335f644a4504d4186dc25866123e Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:47 +0800 Subject: [PATCH 24/58] memstick: r592: Fix UAF bug in r592_remove due to race condition ANBZ: #5685 commit 63264422785021704c39b38f65a78ab9e4a186d7 upstream. In r592_probe, dev->detect_timer was bound with r592_detect_timer. In r592_irq function, the timer function will be invoked by mod_timer. If we remove the module which will call hantro_release to make cleanup, there may be a unfinished work. The possible sequence is as follows, which will cause a typical UAF bug. Fix it by canceling the work before cleanup in r592_remove. CPU0 CPU1 |r592_detect_timer r592_remove | memstick_free_host| put_device; | kfree(host); | | | queue_work | &host->media_checker //use Signed-off-by: Zheng Wang Link: https://lore.kernel.org/r/20230307164338.1246287-1-zyytlz.wz@163.com Signed-off-by: Ulf Hansson Fixes: CVE-2023-35825 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1820 --- drivers/memstick/host/r592.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c index 627d6e62fe31..61b9c97cf0e7 100644 --- a/drivers/memstick/host/r592.c +++ b/drivers/memstick/host/r592.c @@ -827,7 +827,7 @@ static void r592_remove(struct pci_dev *pdev) /* Stop the processing thread. That ensures that we won't take any more requests */ kthread_stop(dev->io_thread); - + del_timer_sync(&dev->detect_timer); r592_enable_device(dev, false); while (!error && dev->req) { -- Gitee From 0e5daacc9d045688c226034055b5f03d10205d12 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:48 +0800 Subject: [PATCH 25/58] drm/msm/dpu: Add check for pstates ANBZ: #5686 commit 93340e10b9c5fc86730d149636e0aa8b47bb5a34 upstream. As kzalloc may fail and return NULL pointer, it should be better to check pstates in order to avoid the NULL pointer dereference. Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support") Signed-off-by: Jiasheng Jiang Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/514160/ Link: https://lore.kernel.org/r/20221206080236.43687-1-jiasheng@iscas.ac.cn Signed-off-by: Dmitry Baryshkov Signed-off-by: Sasha Levin Fixes: CVE-2023-3220 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1818 --- drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c index 4752f08f0884..5852e1d356e1 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c @@ -1477,6 +1477,8 @@ static int dpu_crtc_atomic_check(struct drm_crtc *crtc, } pstates = kzalloc(sizeof(*pstates) * DPU_STAGE_MAX * 4, GFP_KERNEL); + if (!pstates) + return -ENOMEM; dpu_crtc = to_dpu_crtc(crtc); cstate = to_dpu_crtc_state(state); -- Gitee From 41e59f04eeda2a890143029c2edb0aed6459bcc0 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:49 +0800 Subject: [PATCH 26/58] x86/bugs: Flush IBP in ib_prctl_set() ANBZ: #5369 commit a664ec9158eeddd75121d39c9a0758016097fa96 upstream. We missed the window between the TIF flag update and the next reschedule. Signed-off-by: Rodrigo Branco Reviewed-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Cc: Fixes: CVE-2023-0045 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1741 --- arch/x86/kernel/cpu/bugs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index fe960ebd54a5..00dd932d386a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1589,6 +1589,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); task_update_spec_tif(task); + if (task == current) + indirect_branch_prediction_barrier(); break; default: return -ERANGE; -- Gitee From a536012d0a76e2229e223f015635304128c1768d Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:50 +0800 Subject: [PATCH 27/58] drm/vmwgfx: Validate the box size for the snooped cursor ANBZ: #3944 commit 4cf949c7fafe21e085a4ee386bb2dade9067316e upstream. Invalid userspace dma surface copies could potentially overflow the memcpy from the surface to the snooped image leading to crashes. To fix it the dimensions of the copybox have to be validated against the expected size of the snooped cursor. Signed-off-by: Zack Rusin Fixes: 2ac863719e51 ("vmwgfx: Snoop DMA transfers with non-covering sizes") Cc: # v3.2+ Reviewed-by: Michael Banack Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20221026031936.1004280-1-zack@kde.org Signed-off-by: Greg Kroah-Hartman Fixes: CVE-2022-36280 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1736 --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 248d92c85cf6..befef87e0bb8 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -179,7 +179,8 @@ void vmw_kms_cursor_snoop(struct vmw_surface *srf, if (cmd->dma.guest.ptr.offset % PAGE_SIZE || box->x != 0 || box->y != 0 || box->z != 0 || box->srcx != 0 || box->srcy != 0 || box->srcz != 0 || - box->d != 1 || box_count != 1) { + box->d != 1 || box_count != 1 || + box->w > 64 || box->h > 64) { /* TODO handle none page aligned offsets */ /* TODO handle more dst & src != 0 */ /* TODO handle more then one copy */ -- Gitee From 8372d66270354caf740a832fd096fe2943450bf4 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:51 +0800 Subject: [PATCH 28/58] i2c: xgene-slimpro: Fix out-of-bounds bug in xgene_slimpro_i2c_xfer() ANBZ: #4901 commit 92fbb6d1296f81f41f65effd7f5f8c0f74943d15 upstream. The data->block[0] variable comes from user and is a number between 0-255. Without proper check, the variable may be very large to cause an out-of-bounds when performing memcpy in slimpro_i2c_blkwr. Fix this bug by checking the value of writelen. Fixes: f6505fbabc42 ("i2c: add SLIMpro I2C device driver on APM X-Gene platform") Signed-off-by: Wei Chen Cc: stable@vger.kernel.org Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman Fixes: CVE-2023-2194 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1735 --- drivers/i2c/busses/i2c-xgene-slimpro.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/i2c/busses/i2c-xgene-slimpro.c b/drivers/i2c/busses/i2c-xgene-slimpro.c index a7ac746018ad..7a746f413535 100644 --- a/drivers/i2c/busses/i2c-xgene-slimpro.c +++ b/drivers/i2c/busses/i2c-xgene-slimpro.c @@ -321,6 +321,9 @@ static int slimpro_i2c_blkwr(struct slimpro_i2c_dev *ctx, u32 chip, u32 msg[3]; int rc; + if (writelen > I2C_SMBUS_BLOCK_MAX) + return -EINVAL; + memcpy(ctx->dma_buffer, data, writelen); paddr = dma_map_single(ctx->dev, ctx->dma_buffer, writelen, DMA_TO_DEVICE); -- Gitee From df685b2b9d0690d299f3de7404fe53ca26693028 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:52 +0800 Subject: [PATCH 29/58] media: dvb-usb: az6027: fix null-ptr-deref in az6027_i2c_xfer() ANBZ: #4812 commit 0ed554fd769a19ea8464bb83e9ac201002ef74ad upstream. Wei Chen reports a kernel bug as blew: general protection fault, probably for non-canonical address KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] ... Call Trace: __i2c_transfer+0x77e/0x1930 drivers/i2c/i2c-core-base.c:2109 i2c_transfer+0x1d5/0x3d0 drivers/i2c/i2c-core-base.c:2170 i2cdev_ioctl_rdwr+0x393/0x660 drivers/i2c/i2c-dev.c:297 i2cdev_ioctl+0x75d/0x9f0 drivers/i2c/i2c-dev.c:458 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fd834a8bded In az6027_i2c_xfer(), if msg[i].addr is 0x99, a null-ptr-deref will caused when accessing msg[i].buf. For msg[i].len is 0 and msg[i].buf is null. Fix this by checking msg[i].len in az6027_i2c_xfer(). Link: https://lore.kernel.org/lkml/CAO4mrfcPHB5aQJO=mpqV+p8mPLNg-Fok0gw8gZ=zemAfMGTzMg@mail.gmail.com/ Link: https://lore.kernel.org/linux-media/20221120065918.2160782-1-zhongbaisong@huawei.com Fixes: 76f9a820c867 ("V4L/DVB: AZ6027: Initial import of the driver") Reported-by: Wei Chen Signed-off-by: Baisong Zhong Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin Fixes: CVE-2023-28328 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1600 --- drivers/media/usb/dvb-usb/az6027.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/media/usb/dvb-usb/az6027.c b/drivers/media/usb/dvb-usb/az6027.c index 6321b8e30261..555c8ac44881 100644 --- a/drivers/media/usb/dvb-usb/az6027.c +++ b/drivers/media/usb/dvb-usb/az6027.c @@ -977,6 +977,10 @@ static int az6027_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[], int n if (msg[i].addr == 0x99) { req = 0xBE; index = 0; + if (msg[i].len < 1) { + i = -EOPNOTSUPP; + break; + } value = msg[i].buf[0] & 0x00ff; length = 1; az6027_usb_out_op(d, req, value, index, data, length); -- Gitee From 7e626df92e1d33227450f350f4fb6e83be0f4602 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:52 +0800 Subject: [PATCH 30/58] media: uvcvideo: Avoid cyclic entity chains due to malformed USB descriptors ANBZ: #4485 commit 68035c80e129c4cfec659aac4180354530b26527 upstream. Way back in 2017, fuzzing the 4.14-rc2 USB stack with syzkaller kicked up the following WARNING from the UVC chain scanning code: | list_add double add: new=ffff880069084010, prev=ffff880069084010, | next=ffff880067d22298. | ------------[ cut here ]------------ | WARNING: CPU: 1 PID: 1846 at lib/list_debug.c:31 __list_add_valid+0xbd/0xf0 | Modules linked in: | CPU: 1 PID: 1846 Comm: kworker/1:2 Not tainted | 4.14.0-rc2-42613-g1488251d1a98 #238 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 | Workqueue: usb_hub_wq hub_event | task: ffff88006b01ca40 task.stack: ffff880064358000 | RIP: 0010:__list_add_valid+0xbd/0xf0 lib/list_debug.c:29 | RSP: 0018:ffff88006435ddd0 EFLAGS: 00010286 | RAX: 0000000000000058 RBX: ffff880067d22298 RCX: 0000000000000000 | RDX: 0000000000000058 RSI: ffffffff85a58800 RDI: ffffed000c86bbac | RBP: ffff88006435dde8 R08: 1ffff1000c86ba52 R09: 0000000000000000 | R10: 0000000000000002 R11: 0000000000000000 R12: ffff880069084010 | R13: ffff880067d22298 R14: ffff880069084010 R15: ffff880067d222a0 | FS: 0000000000000000(0000) GS:ffff88006c900000(0000) knlGS:0000000000000000 | CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 | CR2: 0000000020004ff2 CR3: 000000006b447000 CR4: 00000000000006e0 | Call Trace: | __list_add ./include/linux/list.h:59 | list_add_tail+0x8c/0x1b0 ./include/linux/list.h:92 | uvc_scan_chain_forward.isra.8+0x373/0x416 | drivers/media/usb/uvc/uvc_driver.c:1471 | uvc_scan_chain drivers/media/usb/uvc/uvc_driver.c:1585 | uvc_scan_device drivers/media/usb/uvc/uvc_driver.c:1769 | uvc_probe+0x77f2/0x8f00 drivers/media/usb/uvc/uvc_driver.c:2104 Looking into the output from usbmon, the interesting part is the following data packet: ffff880069c63e00 30710169 C Ci:1:002:0 0 143 = 09028f00 01030080 00090403 00000e01 00000924 03000103 7c003328 010204db If we drop the lead configuration and interface descriptors, we're left with an output terminal descriptor describing a generic display: /* Output terminal descriptor */ buf[0] 09 buf[1] 24 buf[2] 03 /* UVC_VC_OUTPUT_TERMINAL */ buf[3] 00 /* ID */ buf[4] 01 /* type == 0x0301 (UVC_OTT_DISPLAY) */ buf[5] 03 buf[6] 7c buf[7] 00 /* source ID refers to self! */ buf[8] 33 The problem with this descriptor is that it is self-referential: the source ID of 0 matches itself! This causes the 'struct uvc_entity' representing the display to be added to its chain list twice during 'uvc_scan_chain()': once via 'uvc_scan_chain_entity()' when it is processed directly from the 'dev->entities' list and then again immediately afterwards when trying to follow the source ID in 'uvc_scan_chain_forward()' Add a check before adding an entity to a chain list to ensure that the entity is not already part of a chain. Cc: Fixes: c0efd232929c ("V4L/DVB (8145a): USB Video Class driver") Reported-by: Andrey Konovalov Signed-off-by: Will Deacon Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab Signed-off-by: shi.xue Fixes:CVE-2020-0404 Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1439 --- drivers/media/usb/uvc/uvc_driver.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c index 063e229ead5e..38c73cdbef70 100644 --- a/drivers/media/usb/uvc/uvc_driver.c +++ b/drivers/media/usb/uvc/uvc_driver.c @@ -1482,6 +1482,11 @@ static int uvc_scan_chain_forward(struct uvc_video_chain *chain, break; if (forward == prev) continue; + if (forward->chain.next || forward->chain.prev) { + uvc_trace(UVC_TRACE_DESCR, "Found reference to " + "entity %d already in chain.\n", forward->id); + return -EINVAL; + } switch (UVC_ENTITY_TYPE(forward)) { case UVC_VC_EXTENSION_UNIT: @@ -1563,6 +1568,13 @@ static int uvc_scan_chain_backward(struct uvc_video_chain *chain, return -1; } + if (term->chain.next || term->chain.prev) { + uvc_trace(UVC_TRACE_DESCR, "Found reference to " + "entity %d already in chain.\n", + term->id); + return -EINVAL; + } + if (uvc_trace_param & UVC_TRACE_PROBE) printk(KERN_CONT " %d", term->id); -- Gitee From 2b6d6ece15e1de98aab47675f6f7fd0e98b4acdd Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:53 +0800 Subject: [PATCH 31/58] xirc2ps_cs: Fix use after free bug in xirc2ps_detach ANBZ: #4685 commit e8d20c3ded59a092532513c9bd030d1ea66f5f44 upstream. In xirc2ps_probe, the local->tx_timeout_task was bounded with xirc2ps_tx_timeout_task. When timeout occurs, it will call xirc_tx_timeout->schedule_work to start the work. When we call xirc2ps_detach to remove the driver, there may be a sequence as follows: Stop responding to timeout tasks and complete scheduled tasks before cleanup in xirc2ps_detach, which will fix the problem. CPU0 CPU1 |xirc2ps_tx_timeout_task xirc2ps_detach | free_netdev | kfree(dev); | | | do_reset | //use dev Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Zheng Wang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Fixes: CVE-2023-1670 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1603 --- drivers/net/ethernet/xircom/xirc2ps_cs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c index fd5288ff53b5..e3438cef5f9c 100644 --- a/drivers/net/ethernet/xircom/xirc2ps_cs.c +++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c @@ -503,6 +503,11 @@ static void xirc2ps_detach(struct pcmcia_device *link) { struct net_device *dev = link->priv; + struct local_info *local = netdev_priv(dev); + + netif_carrier_off(dev); + netif_tx_disable(dev); + cancel_work_sync(&local->tx_timeout_task); dev_dbg(&link->dev, "detach\n"); -- Gitee From 857e5460716f049a4d4c1ad0f342895e237aa947 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:54 +0800 Subject: [PATCH 32/58] prlimit: do_prlimit needs to have a speculation check ANBZ: #4933 commit 739790605705ddcf18f21782b9c99ad7d53a8c11 upstream. do_prlimit() adds the user-controlled resource value to a pointer that will subsequently be dereferenced. In order to help prevent this codepath from being used as a spectre "gadget" a barrier needs to be added after checking the range. Reported-by: Jordy Zomer Tested-by: Jordy Zomer Suggested-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Fixes: CVE-2023-0458 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1731 --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sys.c b/kernel/sys.c index 6f7a0c9e83f3..3c990df3498e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1531,6 +1531,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); + if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; -- Gitee From 818ef06844e9943c243c4eb716247d848ba5d743 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:55 +0800 Subject: [PATCH 33/58] nfc: st-nci: Fix use after free bug in ndlc_remove due to race condition ANBZ: #4780 commit 5000fe6c27827a61d8250a7e4a1d26c3298ef4f6 upstream. This bug influences both st_nci_i2c_remove and st_nci_spi_remove. Take st_nci_i2c_remove as an example. In st_nci_i2c_probe, it called ndlc_probe and bound &ndlc->sm_work with llt_ndlc_sm_work. When it calls ndlc_recv or timeout handler, it will finally call schedule_work to start the work. When we call st_nci_i2c_remove to remove the driver, there may be a sequence as follows: Fix it by finishing the work before cleanup in ndlc_remove CPU0 CPU1 |llt_ndlc_sm_work st_nci_i2c_remove | ndlc_remove | st_nci_remove | nci_free_device| kfree(ndev) | //free ndlc->ndev | |llt_ndlc_rcv_queue |nci_recv_frame |//use ndlc->ndev Fixes: 35630df68d60 ("NFC: st21nfcb: Add driver for STMicroelectronics ST21NFCB NFC chip") Signed-off-by: Zheng Wang Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230312160837.2040857-1-zyytlz.wz@163.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Fixes: CVE-2023-1990 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1578 --- drivers/nfc/st-nci/ndlc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c index f26d938d240f..12d73f9dbe9f 100644 --- a/drivers/nfc/st-nci/ndlc.c +++ b/drivers/nfc/st-nci/ndlc.c @@ -297,13 +297,15 @@ EXPORT_SYMBOL(ndlc_probe); void ndlc_remove(struct llt_ndlc *ndlc) { - st_nci_remove(ndlc->ndev); - /* cancel timers */ del_timer_sync(&ndlc->t1_timer); del_timer_sync(&ndlc->t2_timer); ndlc->t2_active = false; ndlc->t1_active = false; + /* cancel work */ + cancel_work_sync(&ndlc->sm_work); + + st_nci_remove(ndlc->ndev); skb_queue_purge(&ndlc->rcv_q); skb_queue_purge(&ndlc->send_q); -- Gitee From ee1c51d6790a40c5dc3a575a77e2a6c917d08e52 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:56 +0800 Subject: [PATCH 34/58] HID: roccat: Fix use-after-free in roccat_read() ANBZ: #5372 commit cacdb14b1c8d3804a3a7d31773bc7569837b71a4 upstream. roccat_report_event() is responsible for registering roccat-related reports in struct roccat_device. int roccat_report_event(int minor, u8 const *data) { struct roccat_device *device; struct roccat_reader *reader; struct roccat_report *report; uint8_t *new_value; device = devices[minor]; new_value = kmemdup(data, device->report_size, GFP_ATOMIC); if (!new_value) return -ENOMEM; report = &device->cbuf[device->cbuf_end]; /* passing NULL is safe */ kfree(report->value); ... The registered report is stored in the struct roccat_device member "struct roccat_report cbuf[ROCCAT_CBUF_SIZE];". If more reports are received than the "ROCCAT_CBUF_SIZE" value, kfree() the saved report from cbuf[0] and allocates a new reprot. Since there is no lock when this kfree() is performed, kfree() can be performed even while reading the saved report. static ssize_t roccat_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct roccat_reader *reader = file->private_data; struct roccat_device *device = reader->device; struct roccat_report *report; ssize_t retval = 0, len; DECLARE_WAITQUEUE(wait, current); mutex_lock(&device->cbuf_lock); ... report = &device->cbuf[reader->cbuf_start]; /* * If report is larger than requested amount of data, rest of report * is lost! */ len = device->report_size > count ? count : device->report_size; if (copy_to_user(buffer, report->value, len)) { retval = -EFAULT; goto exit_unlock; } ... The roccat_read() function receives the device->cbuf report and delivers it to the user through copy_to_user(). If the N+ROCCAT_CBUF_SIZE th report is received while copying of the Nth report->value is in progress, the pointer that copy_to_user() is working on is kfree()ed and UAF read may occur. (race condition) Since the device node of this driver does not set separate permissions, this is not a security vulnerability, but because it is used for requesting screen display of profile or dpi settings, a user using the roccat device can apply udev to this device node or There is a possibility to use it by giving. Signed-off-by: Hyunwoo Kim Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin Fixes: CVE-2022-41850 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1727 --- drivers/hid/hid-roccat.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/hid/hid-roccat.c b/drivers/hid/hid-roccat.c index 5be8de70c651..c9cec00b4e6e 100644 --- a/drivers/hid/hid-roccat.c +++ b/drivers/hid/hid-roccat.c @@ -260,6 +260,8 @@ int roccat_report_event(int minor, u8 const *data) if (!new_value) return -ENOMEM; + mutex_lock(&device->cbuf_lock); + report = &device->cbuf[device->cbuf_end]; /* passing NULL is safe */ @@ -279,6 +281,8 @@ int roccat_report_event(int minor, u8 const *data) reader->cbuf_start = (reader->cbuf_start + 1) % ROCCAT_CBUF_SIZE; } + mutex_unlock(&device->cbuf_lock); + wake_up_interruptible(&device->wait); return 0; } -- Gitee From 738c4f91f6774eaa778371f5aea7b6aa43ac9cb2 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:57 +0800 Subject: [PATCH 35/58] hwmon: (xgene) Fix use after free bug in xgene_hwmon_remove due to race condition ANBZ: #4787 commit cb090e64cf25602b9adaf32d5dfc9c8bec493cd1 upstream. In xgene_hwmon_probe, &ctx->workq is bound with xgene_hwmon_evt_work. Then it will be started. If we remove the driver which will call xgene_hwmon_remove to clean up, there may be unfinished work. The possible sequence is as follows: Fix it by finishing the work before cleanup in xgene_hwmon_remove. CPU0 CPU1 |xgene_hwmon_evt_work xgene_hwmon_remove | kfifo_free(&ctx->async_msg_fifo);| | |kfifo_out_spinlocked |//use &ctx->async_msg_fifo Fixes: 2ca492e22cb7 ("hwmon: (xgene) Fix crash when alarm occurs before driver probe") Signed-off-by: Zheng Wang Link: https://lore.kernel.org/r/20230310084007.1403388-1-zyytlz.wz@163.com Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin Fixes: CVE-2023-1855 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1571 --- drivers/hwmon/xgene-hwmon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c index a3cd91f23267..2dd19a420305 100644 --- a/drivers/hwmon/xgene-hwmon.c +++ b/drivers/hwmon/xgene-hwmon.c @@ -780,6 +780,7 @@ static int xgene_hwmon_remove(struct platform_device *pdev) { struct xgene_hwmon_dev *ctx = platform_get_drvdata(pdev); + cancel_work_sync(&ctx->workq); hwmon_device_unregister(ctx->hwmon_dev); kfifo_free(&ctx->async_msg_fifo); if (acpi_disabled) -- Gitee From 0ea32ec5746e7be537c9b791d1e8da68537ce922 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:58 +0800 Subject: [PATCH 36/58] staging: rtl8712: fix use after free bugs ANBZ: #4597 commit e230a4455ac3e9b112f0367d1b8e255e141afae0 upstream. _Read/Write_MACREG callbacks are NULL so the read/write_macreg_hdl() functions don't do anything except free the "pcmd" pointer. It results in a use after free. Delete them. Fixes: 2865d42c78a9 ("staging: r8712u: Add the new driver to the mainline kernel") Cc: stable Reported-by: Zheng Wang Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/Yw4ASqkYcUhUfoY2@kili Signed-off-by: Greg Kroah-Hartman Fixes: CVE-2022-4095 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1522 --- drivers/staging/rtl8712/rtl8712_cmd.c | 36 --------------------------- 1 file changed, 36 deletions(-) diff --git a/drivers/staging/rtl8712/rtl8712_cmd.c b/drivers/staging/rtl8712/rtl8712_cmd.c index 63bc811681d9..fb092d4ec521 100644 --- a/drivers/staging/rtl8712/rtl8712_cmd.c +++ b/drivers/staging/rtl8712/rtl8712_cmd.c @@ -129,34 +129,6 @@ static void r871x_internal_cmd_hdl(struct _adapter *padapter, u8 *pbuf) kfree(pdrvcmd->pbuf); } -static u8 read_macreg_hdl(struct _adapter *padapter, u8 *pbuf) -{ - void (*pcmd_callback)(struct _adapter *dev, struct cmd_obj *pcmd); - struct cmd_obj *pcmd = (struct cmd_obj *)pbuf; - - /* invoke cmd->callback function */ - pcmd_callback = cmd_callback[pcmd->cmdcode].callback; - if (!pcmd_callback) - r8712_free_cmd_obj(pcmd); - else - pcmd_callback(padapter, pcmd); - return H2C_SUCCESS; -} - -static u8 write_macreg_hdl(struct _adapter *padapter, u8 *pbuf) -{ - void (*pcmd_callback)(struct _adapter *dev, struct cmd_obj *pcmd); - struct cmd_obj *pcmd = (struct cmd_obj *)pbuf; - - /* invoke cmd->callback function */ - pcmd_callback = cmd_callback[pcmd->cmdcode].callback; - if (!pcmd_callback) - r8712_free_cmd_obj(pcmd); - else - pcmd_callback(padapter, pcmd); - return H2C_SUCCESS; -} - static u8 read_bbreg_hdl(struct _adapter *padapter, u8 *pbuf) { struct cmd_obj *pcmd = (struct cmd_obj *)pbuf; @@ -225,14 +197,6 @@ static struct cmd_obj *cmd_hdl_filter(struct _adapter *padapter, pcmd_r = NULL; switch (pcmd->cmdcode) { - case GEN_CMD_CODE(_Read_MACREG): - read_macreg_hdl(padapter, (u8 *)pcmd); - pcmd_r = pcmd; - break; - case GEN_CMD_CODE(_Write_MACREG): - write_macreg_hdl(padapter, (u8 *)pcmd); - pcmd_r = pcmd; - break; case GEN_CMD_CODE(_Read_BBREG): read_bbreg_hdl(padapter, (u8 *)pcmd); break; -- Gitee From 02d00e6394563e4d1722321512e3abe1561d1e57 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:58 +0800 Subject: [PATCH 37/58] HID: check empty report_list in hid_validate_values() ANBZ: #4539 commit b12fece4c64857e5fab4290bf01b2e0317a88456 upstream. Add a check for empty report_list in hid_validate_values(). The missing check causes a type confusion when issuing a list_entry() on an empty report_list. The problem is caused by the assumption that the device must have valid report_list. While this will be true for all normal HID devices, a suitably malicious device can violate the assumption. Fixes: 1b15d2e5b807 ("HID: core: fix validation of report id 0") Signed-off-by: Pietro Borrello Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin Fixes: CVE-2023-1073 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1515 --- drivers/hid/hid-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index b0c8fae7f903..01010a28332e 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -970,8 +970,8 @@ struct hid_report *hid_validate_values(struct hid_device *hid, * Validating on id 0 means we should examine the first * report in the list. */ - report = list_entry( - hid->report_enum[type].report_list.next, + report = list_first_entry_or_null( + &hid->report_enum[type].report_list, struct hid_report, list); } else { report = hid->report_enum[type].report_id_hash[id]; -- Gitee From a00fb34e8f2b345b09ada8f809ebd54a640cd87d Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:44:59 +0800 Subject: [PATCH 38/58] phy: tegra: xusb: Fix return value of tegra_xusb_find_port_node function ANBZ: #4383 commit 045a31b95509c8f25f5f04ec5e0dec5cd09f2c5f upstream. callers of tegra_xusb_find_port_node() function only do NULL checking for the return value. return NULL instead of ERR_PTR(-ENOMEM) to keep consistent. Signed-off-by: Miaoqian Lin Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20211213020507.1458-1-linmq006@gmail.com Signed-off-by: Vinod Koul Fixes: CVE-2023-23000 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1518 --- drivers/phy/tegra/xusb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c index de1b4ebe4de2..9e8fa1834a15 100644 --- a/drivers/phy/tegra/xusb.c +++ b/drivers/phy/tegra/xusb.c @@ -441,7 +441,7 @@ tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type, name = kasprintf(GFP_KERNEL, "%s-%u", type, index); if (!name) { of_node_put(ports); - return ERR_PTR(-ENOMEM); + return NULL; } np = of_get_child_by_name(ports, name); kfree(name); -- Gitee From a3e0333ec992cb8a962ba8b78520d8663ac99938 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:00 +0800 Subject: [PATCH 39/58] power: supply: da9150: Fix use after free bug in da9150_charger_remove due to race condition ANBZ: #4782 commit 06615d11cc78162dfd5116efb71f29eb29502d37 upstream. In da9150_charger_probe, &charger->otg_work is bound with da9150_charger_otg_work. da9150_charger_otg_ncb may be called to start the work. If we remove the module which will call da9150_charger_remove to make cleanup, there may be a unfinished work. The possible sequence is as follows: Fix it by canceling the work before cleanup in the da9150_charger_remove CPU0 CPUc1 |da9150_charger_otg_work da9150_charger_remove | power_supply_unregister | device_unregister | power_supply_dev_release| kfree(psy) | | | power_supply_changed(charger->usb); | //use Fixes: c1a281e34dae ("power: Add support for DA9150 Charger") Signed-off-by: Zheng Wang Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin Fixes: CVE-2023-30772 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1576 --- drivers/power/supply/da9150-charger.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/power/supply/da9150-charger.c b/drivers/power/supply/da9150-charger.c index 60099815296e..b2d38eb32288 100644 --- a/drivers/power/supply/da9150-charger.c +++ b/drivers/power/supply/da9150-charger.c @@ -666,6 +666,7 @@ static int da9150_charger_remove(struct platform_device *pdev) if (!IS_ERR_OR_NULL(charger->usb_phy)) usb_unregister_notifier(charger->usb_phy, &charger->otg_nb); + cancel_work_sync(&charger->otg_work); power_supply_unregister(charger->battery); power_supply_unregister(charger->usb); -- Gitee From e87154da167453f6e673c9e64d1133e0dc4a7819 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:01 +0800 Subject: [PATCH 40/58] Bluetooth: btsdio: fix use after free bug in btsdio_remove due to unfinished work ANBZ: #4777 commit 1e9ac114c4428fdb7ff4635b45d4f46017e8916f upstream. In btsdio_probe, &data->work was bound with btsdio_work.In btsdio_send_frame, it was started by schedule_work. If we call btsdio_remove with an unfinished job, there may be a race condition and cause UAF bug on hdev. Fixes: ddbaf13e3609 ("[Bluetooth] Add generic driver for Bluetooth SDIO devices") Signed-off-by: Zheng Wang Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin Fixes: CVE-2023-1989 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1580 --- drivers/bluetooth/btsdio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c index 20142bc77554..bd55bf7a9914 100644 --- a/drivers/bluetooth/btsdio.c +++ b/drivers/bluetooth/btsdio.c @@ -353,6 +353,7 @@ static void btsdio_remove(struct sdio_func *func) BT_DBG("func %p", func); + cancel_work_sync(&data->work); if (!data) return; -- Gitee From 57145fcf130a2133f67209c9298215f94830ca87 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:02 +0800 Subject: [PATCH 41/58] usb: gadget: rndis: prevent integer overflow in rndis_set_response() ANBZ: #4931 commit 65f3324f4b6fed78b8761c3b74615ecf0ffa81fa upstream. If "BufOffset" is very large the "BufOffset + 8" operation can have an integer overflow. Cc: stable@kernel.org Fixes: 38ea1eac7d88 ("usb: gadget: rndis: check size of RNDIS_MSG_SET command") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20220301080424.GA17208@kili Signed-off-by: Greg Kroah-Hartman Fixes: CVE-2022-20423 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1733 --- drivers/usb/gadget/function/rndis.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c index 6b5848a2c7e9..c2c45208abc5 100644 --- a/drivers/usb/gadget/function/rndis.c +++ b/drivers/usb/gadget/function/rndis.c @@ -640,6 +640,7 @@ static int rndis_set_response(struct rndis_params *params, BufLength = le32_to_cpu(buf->InformationBufferLength); BufOffset = le32_to_cpu(buf->InformationBufferOffset); if ((BufLength > RNDIS_MAX_TOTAL_SIZE) || + (BufOffset > RNDIS_MAX_TOTAL_SIZE) || (BufOffset + 8 >= RNDIS_MAX_TOTAL_SIZE)) return -EINVAL; -- Gitee From 72908e4b71e723a4be182e0e63204ca96c825a0c Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:03 +0800 Subject: [PATCH 42/58] drm/i915/gvt: fix double free bug in split_2MB_gtt_entry ANBZ: #4285 commit 4a61648af68f5ba4884f0e3b494ee1cabc4b6620 upstream. If intel_gvt_dma_map_guest_page failed, it will call ppgtt_invalidate_spt, which will finally free the spt. But the caller function ppgtt_populate_spt_by_guest_entry does not notice that, it will free spt again in its error path. Fix this by canceling the mapping of DMA address and freeing sub_spt. Besides, leave the handle of spt destroy to caller function instead of callee function when error occurs. Fixes: b901b252b6cf ("drm/i915/gvt: Add 2M huge gtt support") Signed-off-by: Zheng Wang Reviewed-by: Zhenyu Wang Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20221229165641.1192455-1-zyytlz.wz@163.com Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman Fixes: CVE-2022-3707 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1521 --- drivers/gpu/drm/i915/gvt/gtt.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 40b32b4d1d98..afbc648befec 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -1155,10 +1155,8 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu, for_each_shadow_entry(sub_spt, &sub_se, sub_index) { ret = intel_gvt_hypervisor_dma_map_guest_page(vgpu, start_gfn + sub_index, PAGE_SIZE, &dma_addr); - if (ret) { - ppgtt_invalidate_spt(spt); - return ret; - } + if (ret) + goto err; sub_se.val64 = se->val64; /* Copy the PAT field from PDE. */ @@ -1177,6 +1175,17 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu, ops->set_pfn(se, sub_spt->shadow_page.mfn); ppgtt_set_shadow_entry(spt, se, index); return 0; +err: + /* Cancel the existing addess mappings of DMA addr. */ + for_each_present_shadow_entry(sub_spt, &sub_se, sub_index) { + gvt_vdbg_mm("invalidate 4K entry\n"); + ppgtt_invalidate_pte(sub_spt, &sub_se); + } + /* Release the new allocated spt. */ + trace_spt_change(sub_spt->vgpu->id, "release", sub_spt, + sub_spt->guest_page.gfn, sub_spt->shadow_page.type); + ppgtt_free_spt(sub_spt); + return ret; } static int split_64KB_gtt_entry(struct intel_vgpu *vgpu, -- Gitee From 0a668f41264ce4c186d2c5428e3604757a292a26 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:04 +0800 Subject: [PATCH 43/58] kernel/relay.c: fix read_pos error when multiple readers ANBZ: #5684 commit 341a7213e5c1ce274cc0f02270054905800ea660 upstream. When reading, read_pos should start with bytes_consumed, not file->f_pos. Because when there is more than one reader, the read_pos corresponding to file->f_pos may have been consumed, which will cause the data that has been consumed to be read and the bytes_consumed update error. Signed-off-by: Pengcheng Yang Signed-off-by: Andrew Morton Reviewed-by: Jens Axboe Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: Al Viro e Link: http://lkml.kernel.org/r/1579691175-28949-1-git-send-email-yangpc@wangsu.com Signed-off-by: Linus Torvalds Fixes: CVE-2023-3268 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1836 --- kernel/relay.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index 13c19f39e31e..e79c89429ff8 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -996,14 +996,14 @@ static void relay_file_read_consume(struct rchan_buf *buf, /* * relay_file_read_avail - boolean, are there unconsumed bytes available? */ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; size_t consumed = buf->subbufs_consumed; - relay_file_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, 0, 0); consumed = buf->subbufs_consumed; @@ -1064,23 +1064,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position * @buf: relay channel buffer * - * If the @read_pos is in the middle of padding, return the + * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. */ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; + size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; padding_start = (read_subbuf + 1) * subbuf_size - padding; @@ -1136,10 +1133,10 @@ static ssize_t relay_file_read(struct file *filp, do { void *from; - if (!relay_file_read_avail(buf, *ppos)) + if (!relay_file_read_avail(buf)) break; - read_start = relay_file_read_start_pos(*ppos, buf); + read_start = relay_file_read_start_pos(buf); avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) break; -- Gitee From deb1fea78a839b9e43f014a8d3865b1067fff9f7 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:05 +0800 Subject: [PATCH 44/58] relayfs: fix out-of-bounds access in relay_file_read ANBZ: #5684 commit 43ec16f1450f4936025a9bdf1a273affdb9732c1 upstream. There is a crash in relay_file_read, as the var from point to the end of last subbuf. The oops looks something like: pc : __arch_copy_to_user+0x180/0x310 lr : relay_file_read+0x20c/0x2c8 Call trace: __arch_copy_to_user+0x180/0x310 full_proxy_read+0x68/0x98 vfs_read+0xb0/0x1d0 ksys_read+0x6c/0xf0 __arm64_sys_read+0x20/0x28 el0_svc_common.constprop.3+0x84/0x108 do_el0_svc+0x74/0x90 el0_svc+0x1c/0x28 el0_sync_handler+0x88/0xb0 el0_sync+0x148/0x180 We get the condition by analyzing the vmcore: 1). The last produced byte and last consumed byte both at the end of the last subbuf 2). A softirq calls function(e.g __blk_add_trace) to write relay buffer occurs when an program is calling relay_file_read_avail(). relay_file_read relay_file_read_avail relay_file_read_consume(buf, 0, 0); //interrupted by softirq who will write subbuf .... return 1; //read_start point to the end of the last subbuf read_start = relay_file_read_start_pos //avail is equal to subsize avail = relay_file_read_subbuf_avail //from points to an invalid memory address from = buf->start + read_start //system is crashed copy_to_user(buffer, from, avail) Link: https://lkml.kernel.org/r/20230419040203.37676-1-zhang.zhengming@h3c.com Fixes: 8d62fdebdaf9 ("relay file read: start-pos fix") Signed-off-by: Zhang Zhengming Reviewed-by: Zhao Lei Reviewed-by: Zhou Kete Reviewed-by: Pengcheng Yang Cc: Jens Axboe Cc: Signed-off-by: Andrew Morton Fixes: CVE-2023-3268 Signed-off-by: Xiao Long Reviewed-by: Xunlei Pang Link: https://gitee.com/anolis/cloud-kernel/pulls/1836 --- kernel/relay.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/relay.c b/kernel/relay.c index e79c89429ff8..b8703edb50dd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1076,7 +1076,8 @@ static size_t relay_file_read_start_pos(struct rchan_buf *buf) size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; - size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; + size_t read_pos = (consumed * subbuf_size + buf->bytes_consumed) + % (n_subbufs * subbuf_size); read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; -- Gitee From 983a3ff6ea009e66f89f2a96af6a8d9b519ba89d Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:05 +0800 Subject: [PATCH 45/58] bpf/verifier: display non-spill stack slot types in print_verifier_state ANBZ: #5530 commit 8efea21d333d21e1f9177579ffdc69556314f603 upstream If a stack slot does not hold a spilled register (STACK_SPILL), then each of its eight bytes could potentially have a different slot_type. This information can be important for debugging, and previously we either did not print anything for the stack slot, or just printed fp-X=0 in the case where its first byte was STACK_ZERO. Instead, print eight characters with either 0 (STACK_ZERO), m (STACK_MISC) or ? (STACK_INVALID) for any stack slot which is neither STACK_SPILL nor entirely STACK_INVALID. Signed-off-by: Edward Cree Signed-off-by: Alexei Starovoitov Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- kernel/bpf/verifier.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b74115452c6..5ef094796925 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -263,6 +263,13 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static char slot_type_char[] = { + [STACK_INVALID] = '?', + [STACK_SPILL] = 'r', + [STACK_MISC] = 'm', + [STACK_ZERO] = '0', +}; + static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { @@ -349,15 +356,26 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) { - verbose(env, " fp%d", - (-i - 1) * BPF_REG_SIZE); - print_liveness(env, state->stack[i].spilled_ptr.live); + char types_buf[BPF_REG_SIZE + 1]; + bool valid = false; + int j; + + for (j = 0; j < BPF_REG_SIZE; j++) { + if (state->stack[i].slot_type[j] != STACK_INVALID) + valid = true; + types_buf[j] = slot_type_char[ + state->stack[i].slot_type[j]]; + } + types_buf[BPF_REG_SIZE] = 0; + if (!valid) + continue; + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + if (state->stack[i].slot_type[0] == STACK_SPILL) verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); - } - if (state->stack[i].slot_type[0] == STACK_ZERO) - verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); + else + verbose(env, "=%s", types_buf); } verbose(env, "\n"); } -- Gitee From 9ff6b18220a7351c67a00750f5892eb22aeeda72 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:07 +0800 Subject: [PATCH 46/58] tools/bpf: move bpf/lib netlink related functions into a new file ANBZ: #5530 commit f7010770fbac47b1fc9fb723b1d2019eb23c04f2 upstream There are no functionality change for this patch. In the subsequent patches, more netlink related library functions will be added and a separate file is better than cluttering bpf.c. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/bpf.c | 129 ------------------------------- tools/lib/bpf/netlink.c | 165 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 130 deletions(-) create mode 100644 tools/lib/bpf/netlink.c diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 6eb9bacd1948..7bc31c905018 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o +libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 482025b72839..eba3ad604244 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -28,16 +28,8 @@ #include #include "bpf.h" #include "libbpf.h" -#include "nlattr.h" -#include -#include -#include #include -#ifndef SOL_NETLINK -#define SOL_NETLINK 270 -#endif - /* * When building perf, unistd.h is overridden. __NR_bpf is * required to be defined explicitly. @@ -521,127 +513,6 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd) return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr)); } -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) -{ - struct sockaddr_nl sa; - int sock, seq = 0, len, ret = -1; - char buf[4096]; - struct nlattr *nla, *nla_xdp; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; - struct nlmsghdr *nh; - struct nlmsgerr *err; - socklen_t addrlen; - int one = 1; - - memset(&sa, 0, sizeof(sa)); - sa.nl_family = AF_NETLINK; - - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (sock < 0) { - return -errno; - } - - if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, - &one, sizeof(one)) < 0) { - fprintf(stderr, "Netlink error reporting not supported\n"); - } - - if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - ret = -errno; - goto cleanup; - } - - addrlen = sizeof(sa); - if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { - ret = -errno; - goto cleanup; - } - - if (addrlen != sizeof(sa)) { - ret = -LIBBPF_ERRNO__INTERNAL; - goto cleanup; - } - - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - req.nh.nlmsg_type = RTM_SETLINK; - req.nh.nlmsg_pid = 0; - req.nh.nlmsg_seq = ++seq; - req.ifinfo.ifi_family = AF_UNSPEC; - req.ifinfo.ifi_index = ifindex; - - /* started nested attribute for XDP */ - nla = (struct nlattr *)(((char *)&req) - + NLMSG_ALIGN(req.nh.nlmsg_len)); - nla->nla_type = NLA_F_NESTED | IFLA_XDP; - nla->nla_len = NLA_HDRLEN; - - /* add XDP fd */ - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = IFLA_XDP_FD; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); - memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); - nla->nla_len += nla_xdp->nla_len; - - /* if user passed in any flags, add those too */ - if (flags) { - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = IFLA_XDP_FLAGS; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); - memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); - nla->nla_len += nla_xdp->nla_len; - } - - req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); - - if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { - ret = -errno; - goto cleanup; - } - - len = recv(sock, buf, sizeof(buf), 0); - if (len < 0) { - ret = -errno; - goto cleanup; - } - - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); - nh = NLMSG_NEXT(nh, len)) { - if (nh->nlmsg_pid != sa.nl_pid) { - ret = -LIBBPF_ERRNO__WRNGPID; - goto cleanup; - } - if (nh->nlmsg_seq != seq) { - ret = -LIBBPF_ERRNO__INVSEQ; - goto cleanup; - } - switch (nh->nlmsg_type) { - case NLMSG_ERROR: - err = (struct nlmsgerr *)NLMSG_DATA(nh); - if (!err->error) - continue; - ret = err->error; - nla_dump_errormsg(nh); - goto cleanup; - case NLMSG_DONE: - break; - default: - break; - } - } - - ret = 0; - -cleanup: - close(sock); - return ret; -} - int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, bool do_log) { diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c new file mode 100644 index 000000000000..ccaa991fe9d8 --- /dev/null +++ b/tools/lib/bpf/netlink.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf.h" +#include "libbpf.h" +#include "nlattr.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +static int bpf_netlink_open(__u32 *nl_pid) +{ + struct sockaddr_nl sa; + socklen_t addrlen; + int one = 1, ret; + int sock; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) + return -errno; + + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + &one, sizeof(one)) < 0) { + fprintf(stderr, "Netlink error reporting not supported\n"); + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { + ret = -errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + ret = -LIBBPF_ERRNO__INTERNAL; + goto cleanup; + } + + *nl_pid = sa.nl_pid; + return sock; + +cleanup: + close(sock); + return ret; +} + +static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq) +{ + struct nlmsgerr *err; + struct nlmsghdr *nh; + char buf[4096]; + int len, ret; + + while (1) { + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + ret = -errno; + goto done; + } + + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != nl_pid) { + ret = -LIBBPF_ERRNO__WRNGPID; + goto done; + } + if (nh->nlmsg_seq != seq) { + ret = -LIBBPF_ERRNO__INVSEQ; + goto done; + } + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + nla_dump_errormsg(nh); + goto done; + case NLMSG_DONE: + return 0; + default: + break; + } + } + } + ret = 0; +done: + return ret; +} + +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) +{ + int sock, seq = 0, ret; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + __u32 nl_pid; + + sock = bpf_netlink_open(&nl_pid); + if (sock < 0) + return sock; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + + /* started nested attribute for XDP */ + nla = (struct nlattr *)(((char *)&req) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | IFLA_XDP; + nla->nla_len = NLA_HDRLEN; + + /* add XDP fd */ + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FD; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); + memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); + nla->nla_len += nla_xdp->nla_len; + + /* if user passed in any flags, add those too */ + if (flags) { + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FLAGS; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); + memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); + nla->nla_len += nla_xdp->nla_len; + } + + req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + + if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { + ret = -errno; + goto cleanup; + } + ret = bpf_netlink_recv(sock, nl_pid, seq); + +cleanup: + close(sock); + return ret; +} -- Gitee From c2c5e66ef5535197e3180fc2bd9932dbd60cb5c9 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:08 +0800 Subject: [PATCH 47/58] tools/bpf: add more netlink functionalities in lib/bpf ANBZ: #5530 commit 36f1678d9e0b5d2e0236046d9659e0348b4719a8 upstream This patch added a few netlink attribute parsing functions and the netlink API functions to query networking links, tc classes, tc qdiscs and tc filters. For example, the following API is to get networking links: int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg, void *cookie); Note that when the API is called, the user also provided a callback function with the following signature: int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); The "cookie" is the parameter the user passed to the API and will be available for the callback function. The "msg" is the information about the result, e.g., ifinfomsg or tcmsg. The "tb" is the parsed netlink attributes. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- tools/lib/bpf/libbpf.h | 16 ++++ tools/lib/bpf/libbpf_errno.c | 1 + tools/lib/bpf/netlink.c | 165 ++++++++++++++++++++++++++++++++++- tools/lib/bpf/nlattr.c | 33 ++++--- tools/lib/bpf/nlattr.h | 38 ++++++++ 5 files changed, 238 insertions(+), 15 deletions(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 96c55fac54c3..e3b00e23e181 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -46,6 +46,7 @@ enum libbpf_errno { LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */ LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */ LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */ + LIBBPF_ERRNO__NLPARSE, /* netlink parsing error */ __LIBBPF_ERRNO__END, }; @@ -297,4 +298,19 @@ int bpf_perf_event_read_simple(void *mem, unsigned long size, unsigned long page_size, void **buf, size_t *buf_len, bpf_perf_event_print_t fn, void *priv); + +struct nlmsghdr; +struct nlattr; +typedef int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); +typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t, + void *cookie); +int bpf_netlink_open(unsigned int *nl_pid); +int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg, + void *cookie); +int nl_get_class(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_class_nlmsg, void *cookie); +int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_qdisc_nlmsg, void *cookie); +int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, + dump_nlmsg_t dump_filter_nlmsg, void *cookie); #endif diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c index d2d17226d9d6..451283328fa9 100644 --- a/tools/lib/bpf/libbpf_errno.c +++ b/tools/lib/bpf/libbpf_errno.c @@ -43,6 +43,7 @@ static const char *libbpf_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type", [ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message", [ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence", + [ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing", }; int libbpf_strerror(int err, char *buf, size_t size) diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index ccaa991fe9d8..469e068dd0c5 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -18,7 +18,7 @@ #define SOL_NETLINK 270 #endif -static int bpf_netlink_open(__u32 *nl_pid) +int bpf_netlink_open(__u32 *nl_pid) { struct sockaddr_nl sa; socklen_t addrlen; @@ -61,7 +61,9 @@ static int bpf_netlink_open(__u32 *nl_pid) return ret; } -static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq) +static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq, + __dump_nlmsg_t _fn, dump_nlmsg_t fn, + void *cookie) { struct nlmsgerr *err; struct nlmsghdr *nh; @@ -98,6 +100,11 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq) default: break; } + if (_fn) { + ret = _fn(nh, fn, cookie); + if (ret) + return ret; + } } } ret = 0; @@ -157,9 +164,161 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) ret = -errno; goto cleanup; } - ret = bpf_netlink_recv(sock, nl_pid, seq); + ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL); cleanup: close(sock); return ret; } + +static int __dump_link_nlmsg(struct nlmsghdr *nlh, dump_nlmsg_t dump_link_nlmsg, + void *cookie) +{ + struct nlattr *tb[IFLA_MAX + 1], *attr; + struct ifinfomsg *ifi = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); + attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi))); + if (nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_link_nlmsg(cookie, ifi, tb); +} + +int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg, + void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlh.nlmsg_type = RTM_GETLINK, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .ifm.ifi_family = AF_PACKET, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg, + dump_link_nlmsg, cookie); +} + +static int __dump_class_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_class_nlmsg, void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_class_nlmsg(cookie, t, tb); +} + +int nl_get_class(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_class_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETTCLASS, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_class_nlmsg, + dump_class_nlmsg, cookie); +} + +static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_qdisc_nlmsg, void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_qdisc_nlmsg(cookie, t, tb); +} + +int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_qdisc_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETQDISC, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_qdisc_nlmsg, + dump_qdisc_nlmsg, cookie); +} + +static int __dump_filter_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_filter_nlmsg, void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_filter_nlmsg(cookie, t, tb); +} + +int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, + dump_nlmsg_t dump_filter_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETTFILTER, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + .t.tcm_parent = handle, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_filter_nlmsg, + dump_filter_nlmsg, cookie); +} diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c index 4719434278b2..49f514119bdb 100644 --- a/tools/lib/bpf/nlattr.c +++ b/tools/lib/bpf/nlattr.c @@ -26,11 +26,6 @@ static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_FLAG] = 0, }; -static int nla_len(const struct nlattr *nla) -{ - return nla->nla_len - NLA_HDRLEN; -} - static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) { int totlen = NLA_ALIGN(nla->nla_len); @@ -46,11 +41,6 @@ static int nla_ok(const struct nlattr *nla, int remaining) nla->nla_len <= remaining; } -static void *nla_data(const struct nlattr *nla) -{ - return (char *) nla + NLA_HDRLEN; -} - static int nla_type(const struct nlattr *nla) { return nla->nla_type & NLA_TYPE_MASK; @@ -114,8 +104,8 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) * @see nla_validate * @return 0 on success or a negative error code. */ -static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, - struct nla_policy *policy) +int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + struct nla_policy *policy) { struct nlattr *nla; int rem, err; @@ -146,6 +136,25 @@ static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int return err; } +/** + * Create attribute index based on nested attribute + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg nla Nested Attribute. + * @arg policy Attribute validation policy. + * + * Feeds the stream of attributes nested into the specified attribute + * to nla_parse(). + * + * @see nla_parse + * @return 0 on success or a negative error code. + */ +int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + struct nla_policy *policy) +{ + return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy); +} + /* dump netlink extended ack error message */ int nla_dump_errormsg(struct nlmsghdr *nlh) { diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h index 931a71f68f93..a6e2396bce7c 100644 --- a/tools/lib/bpf/nlattr.h +++ b/tools/lib/bpf/nlattr.h @@ -67,6 +67,44 @@ struct nla_policy { nla_ok(pos, rem); \ pos = nla_next(pos, &(rem))) +/** + * nla_data - head of payload + * @nla: netlink attribute + */ +static inline void *nla_data(const struct nlattr *nla) +{ + return (char *) nla + NLA_HDRLEN; +} + +static inline uint8_t nla_getattr_u8(const struct nlattr *nla) +{ + return *(uint8_t *)nla_data(nla); +} + +static inline uint32_t nla_getattr_u32(const struct nlattr *nla) +{ + return *(uint32_t *)nla_data(nla); +} + +static inline const char *nla_getattr_str(const struct nlattr *nla) +{ + return (const char *)nla_data(nla); +} + +/** + * nla_len - length of payload + * @nla: netlink attribute + */ +static inline int nla_len(const struct nlattr *nla) +{ + return nla->nla_len - NLA_HDRLEN; +} + +int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + struct nla_policy *policy); +int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + struct nla_policy *policy); + int nla_dump_errormsg(struct nlmsghdr *nlh); #endif /* __NLATTR_H */ -- Gitee From 37c3d5ff798df8e2809412109e6ba279bf265781 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:09 +0800 Subject: [PATCH 48/58] tools/bpf: bpftool: add net support ANBZ: #5530 commit f6f3bac08ff9855d803081a353a1fafaa8845739 upstream Add "bpftool net" support. Networking devices are enumerated to dump device index/name associated with xdp progs. For each networking device, tc classes and qdiscs are enumerated in order to check their bpf filters. In addition, root handle and clsact ingress/egress are also checked for bpf filters. Not all filter information is printed out. Only ifindex, kind, filter name, prog_id and tag are printed out, which are good enough to show attachment information. If the filter action is a bpf action, its bpf program id, bpf name and tag will be printed out as well. For example, $ ./bpftool net xdp [ ifindex 2 devname eth0 prog_id 198 ] tc_filters [ ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb] prog_id 111727 tag d08fe3b4319bc2fd act [] ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp prog_id 130246 tag 3f265c7f26db62c9 act [] ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact] prog_id 111726 tag 99a197826974c876 ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp prog_id 108619 tag dc4630674fd72dcc act [] ifindex 2 kind qdisc_clsact_egress name fbflow_egress prog_id 130245 tag 72d2d830d6888d2c ] $ ./bpftool -jp net [{ "xdp": [{ "ifindex": 2, "devname": "eth0", "prog_id": 198 } ], "tc_filters": [{ "ifindex": 2, "kind": "qdisc_htb", "name": "prefix_matcher.o:[cls_prefix_matcher_htb]", "prog_id": 111727, "tag": "d08fe3b4319bc2fd", "act": [] },{ "ifindex": 2, "kind": "qdisc_clsact_ingress", "name": "fbflow_icmp", "prog_id": 130246, "tag": "3f265c7f26db62c9", "act": [] },{ "ifindex": 2, "kind": "qdisc_clsact_egress", "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]", "prog_id": 111726, "tag": "99a197826974c876" },{ "ifindex": 2, "kind": "qdisc_clsact_egress", "name": "cls_fg_dscp", "prog_id": 108619, "tag": "dc4630674fd72dcc", "act": [] },{ "ifindex": 2, "kind": "qdisc_clsact_egress", "name": "fbflow_egress", "prog_id": 130245, "tag": "72d2d830d6888d2c" } ] } ] Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- .../bpf/bpftool/Documentation/bpftool-net.rst | 133 ++++++++++ tools/bpf/bpftool/Documentation/bpftool.rst | 6 +- tools/bpf/bpftool/bash-completion/bpftool | 17 +- tools/bpf/bpftool/main.c | 3 +- tools/bpf/bpftool/main.h | 7 + tools/bpf/bpftool/net.c | 233 ++++++++++++++++++ tools/bpf/bpftool/netlink_dumper.c | 181 ++++++++++++++ tools/bpf/bpftool/netlink_dumper.h | 103 ++++++++ 8 files changed, 676 insertions(+), 7 deletions(-) create mode 100644 tools/bpf/bpftool/Documentation/bpftool-net.rst create mode 100644 tools/bpf/bpftool/net.c create mode 100644 tools/bpf/bpftool/netlink_dumper.c create mode 100644 tools/bpf/bpftool/netlink_dumper.h diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst new file mode 100644 index 000000000000..48a61837a264 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -0,0 +1,133 @@ +================ +bpftool-net +================ +------------------------------------------------------------------------------- +tool for inspection of netdev/tc related bpf prog attachments +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **net** *COMMAND* + + *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] } + + *COMMANDS* := + { **show** | **list** } [ **dev** name ] | **help** + +NET COMMANDS +============ + +| **bpftool** **net { show | list } [ dev name ]** +| **bpftool** **net help** + +DESCRIPTION +=========== + **bpftool net { show | list } [ dev name ]** + List all networking device driver and tc attachment in the system. + + Output will start with all xdp program attachment, followed by + all tc class/qdisc bpf program attachments. Both xdp programs and + tc programs are ordered based on ifindex number. If multiple bpf + programs attached to the same networking device through **tc filter**, + the order will be first all bpf programs attached to tc classes, then + all bpf programs attached to non clsact qdiscs, and finally all + bpf programs attached to root and clsact qdisc. + + **bpftool net help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -v, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + +EXAMPLES +======== + +| **# bpftool net** + +:: + + xdp [ + ifindex 2 devname eth0 prog_id 198 + ] + tc_filters [ + ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb] + prog_id 111727 tag d08fe3b4319bc2fd act [] + ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp + prog_id 130246 tag 3f265c7f26db62c9 act [] + ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact] + prog_id 111726 tag 99a197826974c876 + ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp + prog_id 108619 tag dc4630674fd72dcc act [] + ifindex 2 kind qdisc_clsact_egress name fbflow_egress + prog_id 130245 tag 72d2d830d6888d2c + ] + +| +| **# bpftool -jp net** + +:: + + [{ + "xdp": [{ + "ifindex": 2, + "devname": "eth0", + "prog_id": 198 + } + ], + "tc_filters": [{ + "ifindex": 2, + "kind": "qdisc_htb", + "name": "prefix_matcher.o:[cls_prefix_matcher_htb]", + "prog_id": 111727, + "tag": "d08fe3b4319bc2fd", + "act": [] + },{ + "ifindex": 2, + "kind": "qdisc_clsact_ingress", + "name": "fbflow_icmp", + "prog_id": 130246, + "tag": "3f265c7f26db62c9", + "act": [] + },{ + "ifindex": 2, + "kind": "qdisc_clsact_egress", + "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]", + "prog_id": 111726, + "tag": "99a197826974c876" + },{ + "ifindex": 2, + "kind": "qdisc_clsact_egress", + "name": "cls_fg_dscp", + "prog_id": 108619, + "tag": "dc4630674fd72dcc", + "act": [] + },{ + "ifindex": 2, + "kind": "qdisc_clsact_egress", + "name": "fbflow_egress", + "prog_id": 130245, + "tag": "72d2d830d6888d2c" + } + ] + } + ] + + +SEE ALSO +======== + **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index b6f5d560460d..8dda77daeda9 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -16,7 +16,7 @@ SYNOPSIS **bpftool** **version** - *OBJECT* := { **map** | **program** | **cgroup** | **perf** } + *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** } *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } | { **-j** | **--json** } [{ **-p** | **--pretty** }] } @@ -32,6 +32,8 @@ SYNOPSIS *PERF-COMMANDS* := { **show** | **list** | **help** } + *NET-COMMANDS* := { **show** | **list** | **help** } + DESCRIPTION =========== *bpftool* allows for inspection and simple modification of BPF objects @@ -58,4 +60,4 @@ OPTIONS SEE ALSO ======== **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), **bpftool-net**\ (8) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index c2b6b2176f3b..ec577af2e5f0 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -494,10 +494,10 @@ _bpftool() _filedir return 0 ;; - tree) - _filedir - return 0 - ;; + tree) + _filedir + return 0 + ;; attach|detach) local ATTACH_TYPES='ingress egress sock_create sock_ops \ device bind4 bind6 post_bind4 post_bind6 connect4 \ @@ -552,6 +552,15 @@ _bpftool() ;; esac ;; + net) + case $command in + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help \ + show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index d15a62be6cf0..79dc3f193547 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -85,7 +85,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf }\n" + " OBJECT := { prog | map | cgroup | perf | net }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -215,6 +215,7 @@ static const struct cmd cmds[] = { { "map", do_map }, { "cgroup", do_cgroup }, { "perf", do_perf }, + { "net", do_net }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 057a227bdb9f..0a9d2779635c 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -136,6 +136,7 @@ int do_map(int argc, char **arg); int do_event_pipe(int argc, char **argv); int do_cgroup(int argc, char **arg); int do_perf(int argc, char **arg); +int do_net(int argc, char **arg); int prog_parse_fd(int *argc, char ***argv); int map_parse_fd(int *argc, char ***argv); @@ -165,4 +166,10 @@ struct btf_dumper { */ int btf_dumper_type(const struct btf_dumper *d, __u32 type_id, const void *data); + +struct nlattr; +struct ifinfomsg; +struct tcmsg; +int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb); +int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind); #endif diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c new file mode 100644 index 000000000000..77dd73dd9ade --- /dev/null +++ b/tools/bpf/bpftool/net.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "main.h" +#include "netlink_dumper.h" + +struct bpf_netdev_t { + int *ifindex_array; + int used_len; + int array_len; + int filter_idx; +}; + +struct tc_kind_handle { + char kind[64]; + int handle; +}; + +struct bpf_tcinfo_t { + struct tc_kind_handle *handle_array; + int used_len; + int array_len; + bool is_qdisc; +}; + +static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + struct bpf_netdev_t *netinfo = cookie; + struct ifinfomsg *ifinfo = msg; + + if (netinfo->filter_idx > 0 && netinfo->filter_idx != ifinfo->ifi_index) + return 0; + + if (netinfo->used_len == netinfo->array_len) { + netinfo->ifindex_array = realloc(netinfo->ifindex_array, + (netinfo->array_len + 16) * sizeof(int)); + netinfo->array_len += 16; + } + netinfo->ifindex_array[netinfo->used_len++] = ifinfo->ifi_index; + + return do_xdp_dump(ifinfo, tb); +} + +static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + struct bpf_tcinfo_t *tcinfo = cookie; + struct tcmsg *info = msg; + + if (tcinfo->is_qdisc) { + /* skip clsact qdisc */ + if (tb[TCA_KIND] && + strcmp(nla_data(tb[TCA_KIND]), "clsact") == 0) + return 0; + if (info->tcm_handle == 0) + return 0; + } + + if (tcinfo->used_len == tcinfo->array_len) { + tcinfo->handle_array = realloc(tcinfo->handle_array, + (tcinfo->array_len + 16) * sizeof(struct tc_kind_handle)); + tcinfo->array_len += 16; + } + tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle; + snprintf(tcinfo->handle_array[tcinfo->used_len].kind, + sizeof(tcinfo->handle_array[tcinfo->used_len].kind), + "%s_%s", + tcinfo->is_qdisc ? "qdisc" : "class", + tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown"); + tcinfo->used_len++; + + return 0; +} + +static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + const char *kind = cookie; + + return do_filter_dump((struct tcmsg *)msg, tb, kind); +} + +static int show_dev_tc_bpf(int sock, unsigned int nl_pid, int ifindex) +{ + struct bpf_tcinfo_t tcinfo; + int i, handle, ret; + + tcinfo.handle_array = NULL; + tcinfo.used_len = 0; + tcinfo.array_len = 0; + + tcinfo.is_qdisc = false; + ret = nl_get_class(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg, + &tcinfo); + if (ret) + return ret; + + tcinfo.is_qdisc = true; + ret = nl_get_qdisc(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg, + &tcinfo); + if (ret) + return ret; + + for (i = 0; i < tcinfo.used_len; i++) { + ret = nl_get_filter(sock, nl_pid, ifindex, + tcinfo.handle_array[i].handle, + dump_filter_nlmsg, + tcinfo.handle_array[i].kind); + if (ret) + return ret; + } + + /* root, ingress and egress handle */ + handle = TC_H_ROOT; + ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, + "root"); + if (ret) + return ret; + + handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); + ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, + "qdisc_clsact_ingress"); + if (ret) + return ret; + + handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS); + ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, + "qdisc_clsact_egress"); + if (ret) + return ret; + + return 0; +} + +static int do_show(int argc, char **argv) +{ + int i, sock, ret, filter_idx = -1; + struct bpf_netdev_t dev_array; + unsigned int nl_pid; + char err_buf[256]; + + if (argc == 2) { + if (strcmp(argv[0], "dev") != 0) + usage(); + filter_idx = if_nametoindex(argv[1]); + if (filter_idx == 0) { + fprintf(stderr, "invalid dev name %s\n", argv[1]); + return -1; + } + } else if (argc != 0) { + usage(); + } + + sock = bpf_netlink_open(&nl_pid); + if (sock < 0) { + fprintf(stderr, "failed to open netlink sock\n"); + return -1; + } + + dev_array.ifindex_array = NULL; + dev_array.used_len = 0; + dev_array.array_len = 0; + dev_array.filter_idx = filter_idx; + + if (json_output) + jsonw_start_array(json_wtr); + NET_START_OBJECT; + NET_START_ARRAY("xdp", "\n"); + ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array); + NET_END_ARRAY("\n"); + + if (!ret) { + NET_START_ARRAY("tc_filters", "\n"); + for (i = 0; i < dev_array.used_len; i++) { + ret = show_dev_tc_bpf(sock, nl_pid, + dev_array.ifindex_array[i]); + if (ret) + break; + } + NET_END_ARRAY("\n"); + } + NET_END_OBJECT; + if (json_output) + jsonw_end_array(json_wtr); + + if (ret) { + if (json_output) + jsonw_null(json_wtr); + libbpf_strerror(ret, err_buf, sizeof(err_buf)); + fprintf(stderr, "Error: %s\n", err_buf); + } + free(dev_array.ifindex_array); + close(sock); + return ret; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %s %s { show | list } [dev ]\n" + " %s %s help\n", + bin_name, argv[-2], bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { 0 } +}; + +int do_net(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c new file mode 100644 index 000000000000..e12494fd1d2e --- /dev/null +++ b/tools/bpf/bpftool/netlink_dumper.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook + +#include +#include +#include +#include +#include + +#include +#include "main.h" +#include "netlink_dumper.h" + +static void xdp_dump_prog_id(struct nlattr **tb, int attr, + const char *type) +{ + if (!tb[attr]) + return; + + NET_DUMP_UINT(type, nla_getattr_u32(tb[attr])) +} + +static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex, + const char *name) +{ + struct nlattr *tb[IFLA_XDP_MAX + 1]; + unsigned char mode; + + if (nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0) + return -1; + + if (!tb[IFLA_XDP_ATTACHED]) + return 0; + + mode = nla_getattr_u8(tb[IFLA_XDP_ATTACHED]); + if (mode == XDP_ATTACHED_NONE) + return 0; + + NET_START_OBJECT; + NET_DUMP_UINT("ifindex", ifindex); + + if (name) + NET_DUMP_STR("devname", name); + + if (tb[IFLA_XDP_PROG_ID]) + NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[IFLA_XDP_PROG_ID])); + + if (mode == XDP_ATTACHED_MULTI) { + xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic_prog_id"); + xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "drv_prog_id"); + xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload_prog_id"); + } + + NET_END_OBJECT_FINAL; + return 0; +} + +int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb) +{ + if (!tb[IFLA_XDP]) + return 0; + + return do_xdp_dump_one(tb[IFLA_XDP], ifinfo->ifi_index, + nla_getattr_str(tb[IFLA_IFNAME])); +} + +static char *hexstring_n2a(const unsigned char *str, int len, + char *buf, int blen) +{ + char *ptr = buf; + int i; + + for (i = 0; i < len; i++) { + if (blen < 3) + break; + sprintf(ptr, "%02x", str[i]); + ptr += 2; + blen -= 2; + } + return buf; +} + +static int do_bpf_dump_one_act(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; + char buf[256]; + + if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (!tb[TCA_ACT_BPF_PARMS]) + return -LIBBPF_ERRNO__NLPARSE; + + NET_START_OBJECT_NESTED2; + if (tb[TCA_ACT_BPF_NAME]) + NET_DUMP_STR("name", nla_getattr_str(tb[TCA_ACT_BPF_NAME])); + if (tb[TCA_ACT_BPF_ID]) + NET_DUMP_UINT("bpf_id", nla_getattr_u32(tb[TCA_ACT_BPF_ID])); + if (tb[TCA_ACT_BPF_TAG]) + NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_ACT_BPF_TAG]), + nla_len(tb[TCA_ACT_BPF_TAG]), + buf, sizeof(buf))); + NET_END_OBJECT_NESTED; + return 0; +} + +static int do_dump_one_act(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_MAX + 1]; + + if (!attr) + return 0; + + if (nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (tb[TCA_ACT_KIND] && strcmp(nla_data(tb[TCA_ACT_KIND]), "bpf") == 0) + return do_bpf_dump_one_act(tb[TCA_ACT_OPTIONS]); + + return 0; +} + +static int do_bpf_act_dump(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + int act, ret; + + if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + NET_START_ARRAY("act", ""); + for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) { + ret = do_dump_one_act(tb[act]); + if (ret) + break; + } + NET_END_ARRAY(" "); + + return ret; +} + +static int do_bpf_filter_dump(struct nlattr *attr) +{ + struct nlattr *tb[TCA_BPF_MAX + 1]; + char buf[256]; + int ret; + + if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (tb[TCA_BPF_NAME]) + NET_DUMP_STR("name", nla_getattr_str(tb[TCA_BPF_NAME])); + if (tb[TCA_BPF_ID]) + NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[TCA_BPF_ID])); + if (tb[TCA_BPF_TAG]) + NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_BPF_TAG]), + nla_len(tb[TCA_BPF_TAG]), + buf, sizeof(buf))); + if (tb[TCA_BPF_ACT]) { + ret = do_bpf_act_dump(tb[TCA_BPF_ACT]); + if (ret) + return ret; + } + + return 0; +} + +int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind) +{ + int ret = 0; + + if (tb[TCA_OPTIONS] && strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) { + NET_START_OBJECT; + NET_DUMP_UINT("ifindex", info->tcm_ifindex); + NET_DUMP_STR("kind", kind); + ret = do_bpf_filter_dump(tb[TCA_OPTIONS]); + NET_END_OBJECT_FINAL; + } + + return ret; +} diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h new file mode 100644 index 000000000000..552d8851ac06 --- /dev/null +++ b/tools/bpf/bpftool/netlink_dumper.h @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook + +#ifndef _NETLINK_DUMPER_H_ +#define _NETLINK_DUMPER_H_ + +#define NET_START_OBJECT \ +{ \ + if (json_output) \ + jsonw_start_object(json_wtr); \ +} + +#define NET_START_OBJECT_NESTED(name) \ +{ \ + if (json_output) { \ + jsonw_name(json_wtr, name); \ + jsonw_start_object(json_wtr); \ + } else { \ + fprintf(stderr, "%s {", name); \ + } \ +} + +#define NET_START_OBJECT_NESTED2 \ +{ \ + if (json_output) \ + jsonw_start_object(json_wtr); \ + else \ + fprintf(stderr, "{"); \ +} + +#define NET_END_OBJECT_NESTED \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ + else \ + fprintf(stderr, "}"); \ +} + +#define NET_END_OBJECT \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ +} + +#define NET_END_OBJECT_FINAL \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ + else \ + fprintf(stderr, "\n"); \ +} + +#define NET_START_ARRAY(name, newline) \ +{ \ + if (json_output) { \ + jsonw_name(json_wtr, name); \ + jsonw_start_array(json_wtr); \ + } else { \ + fprintf(stderr, "%s [%s", name, newline);\ + } \ +} + +#define NET_END_ARRAY(endstr) \ +{ \ + if (json_output) \ + jsonw_end_array(json_wtr); \ + else \ + fprintf(stderr, "]%s", endstr); \ +} + +#define NET_DUMP_UINT(name, val) \ +{ \ + if (json_output) \ + jsonw_uint_field(json_wtr, name, val); \ + else \ + fprintf(stderr, "%s %d ", name, val); \ +} + +#define NET_DUMP_LLUINT(name, val) \ +{ \ + if (json_output) \ + jsonw_lluint_field(json_wtr, name, val);\ + else \ + fprintf(stderr, "%s %lld ", name, val); \ +} + +#define NET_DUMP_STR(name, str) \ +{ \ + if (json_output) \ + jsonw_string_field(json_wtr, name, str);\ + else \ + fprintf(stderr, "%s %s ", name, str); \ +} + +#define NET_DUMP_STR_ONLY(str) \ +{ \ + if (json_output) \ + jsonw_string(json_wtr, str); \ + else \ + fprintf(stderr, "%s ", str); \ +} + +#endif -- Gitee From 95289f76909d22e4ce52fca681b812ce4349e287 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:10 +0800 Subject: [PATCH 49/58] tools/bpf: fix a netlink recv issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 9d0b3c1f1451d1b9a33de3c70ae3d50ccd77db1a upstream Commit f7010770fbac ("tools/bpf: move bpf/lib netlink related functions into a new file") introduced a while loop for the netlink recv path. This while loop is needed since the buffer in recv syscall may not be enough to hold all the information and in such cases multiple recv calls are needed. There is a bug introduced by the above commit as the while loop may block on recv syscall if there is no more messages are expected. The netlink message header flag NLM_F_MULTI is used to indicate that more messages are expected and this patch fixed the bug by doing further recv syscall only if multipart message is expected. The patch added another fix regarding to message length of 0. When netlink recv returns message length of 0, there will be no more messages for returning data so the while loop can end. Fixes: f7010770fbac ("tools/bpf: move bpf/lib netlink related functions into a new file") Reported-by: Björn Töpel Tested-by: Björn Töpel Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- tools/lib/bpf/netlink.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 469e068dd0c5..fde1d7bf8199 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -65,18 +65,23 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq, __dump_nlmsg_t _fn, dump_nlmsg_t fn, void *cookie) { + bool multipart = true; struct nlmsgerr *err; struct nlmsghdr *nh; char buf[4096]; int len, ret; - while (1) { + while (multipart) { + multipart = false; len = recv(sock, buf, sizeof(buf), 0); if (len < 0) { ret = -errno; goto done; } + if (len == 0) + break; + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); nh = NLMSG_NEXT(nh, len)) { if (nh->nlmsg_pid != nl_pid) { @@ -87,6 +92,8 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq, ret = -LIBBPF_ERRNO__INVSEQ; goto done; } + if (nh->nlmsg_flags & NLM_F_MULTI) + multipart = true; switch (nh->nlmsg_type) { case NLMSG_ERROR: err = (struct nlmsgerr *)NLMSG_DATA(nh); -- Gitee From 59394459ea70204704a998994184452da8af68ad Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:11 +0800 Subject: [PATCH 50/58] tools/bpf: bpftool: improve output format for bpftool net ANBZ: #5530 commit 7900efc19214e326913dc0f0e8ded24adc0018f2 upstream This is a followup patch for Commit f6f3bac08ff9 ("tools/bpf: bpftool: add net support"). Some improvements are made for the bpftool net output. Specially, plain output is more concise such that per attachment should nicely fit in one line. Compared to previous output, the prog tag is removed since it can be easily obtained with program id. Similar to xdp attachments, the device name is added to tc attachments. The bpf program attached through shared block mechanism is supported as well. $ ip link add dev v1 type veth peer name v2 $ tc qdisc add dev v1 ingress_block 10 egress_block 20 clsact $ tc qdisc add dev v2 ingress_block 10 egress_block 20 clsact $ tc filter add block 10 protocol ip prio 25 bpf obj bpf_shared.o sec ingress flowid 1:1 $ tc filter add block 20 protocol ip prio 30 bpf obj bpf_cyclic.o sec classifier flowid 1:1 $ bpftool net xdp: tc: v2(7) clsact/ingress bpf_shared.o:[ingress] id 23 v2(7) clsact/egress bpf_cyclic.o:[classifier] id 24 v1(8) clsact/ingress bpf_shared.o:[ingress] id 23 v1(8) clsact/egress bpf_cyclic.o:[classifier] id 24 The documentation and "bpftool net help" are updated to make it clear that current implementation only supports xdp and tc attachments. For programs attached to cgroups, "bpftool cgroup" can be used to dump attachments. For other programs e.g. sk_{filter,skb,msg,reuseport} and lwt/seg6, iproute2 tools should be used. The new output: $ bpftool net xdp: eth0(2) driver id 198 tc: eth0(2) clsact/ingress fbflow_icmp id 335 act [{icmp_action id 336}] eth0(2) clsact/egress fbflow_egress id 334 $ bpftool -jp net [{ "xdp": [{ "devname": "eth0", "ifindex": 2, "mode": "driver", "id": 198 } ], "tc": [{ "devname": "eth0", "ifindex": 2, "kind": "clsact/ingress", "name": "fbflow_icmp", "id": 335, "act": [{ "name": "icmp_action", "id": 336 } ] },{ "devname": "eth0", "ifindex": 2, "kind": "clsact/egress", "name": "fbflow_egress", "id": 334 } ] } ] Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- .../bpf/bpftool/Documentation/bpftool-net.rst | 78 +++++++------ tools/bpf/bpftool/main.h | 3 +- tools/bpf/bpftool/net.c | 103 ++++++++++++------ tools/bpf/bpftool/netlink_dumper.c | 85 +++++++-------- tools/bpf/bpftool/netlink_dumper.h | 22 ++-- 5 files changed, 161 insertions(+), 130 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 48a61837a264..408ec30d8872 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -26,9 +26,20 @@ NET COMMANDS DESCRIPTION =========== **bpftool net { show | list } [ dev name ]** - List all networking device driver and tc attachment in the system. - - Output will start with all xdp program attachment, followed by + List bpf program attachments in the kernel networking subsystem. + + Currently, only device driver xdp attachments and tc filter + classification/action attachments are implemented, i.e., for + program types **BPF_PROG_TYPE_SCHED_CLS**, + **BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**. + For programs attached to a particular cgroup, e.g., + **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, + **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + users can use **bpftool cgroup** to dump cgroup attachments. + For sk_{filter, skb, msg, reuseport} and lwt/seg6 + bpf programs, users should consult other tools, e.g., iproute2. + + The current output will start with all xdp program attachments, followed by all tc class/qdisc bpf program attachments. Both xdp programs and tc programs are ordered based on ifindex number. If multiple bpf programs attached to the same networking device through **tc filter**, @@ -61,21 +72,15 @@ EXAMPLES :: - xdp [ - ifindex 2 devname eth0 prog_id 198 - ] - tc_filters [ - ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb] - prog_id 111727 tag d08fe3b4319bc2fd act [] - ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp - prog_id 130246 tag 3f265c7f26db62c9 act [] - ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact] - prog_id 111726 tag 99a197826974c876 - ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp - prog_id 108619 tag dc4630674fd72dcc act [] - ifindex 2 kind qdisc_clsact_egress name fbflow_egress - prog_id 130245 tag 72d2d830d6888d2c - ] + xdp: + eth0(2) driver id 198 + + tc: + eth0(2) htb name prefix_matcher.o:[cls_prefix_matcher_htb] id 111727 act [] + eth0(2) clsact/ingress fbflow_icmp id 130246 act [] + eth0(2) clsact/egress prefix_matcher.o:[cls_prefix_matcher_clsact] id 111726 + eth0(2) clsact/egress cls_fg_dscp id 108619 act [] + eth0(2) clsact/egress fbflow_egress id 130245 | | **# bpftool -jp net** @@ -84,44 +89,45 @@ EXAMPLES [{ "xdp": [{ - "ifindex": 2, "devname": "eth0", - "prog_id": 198 + "ifindex": 2, + "mode": "driver", + "id": 198 } ], - "tc_filters": [{ + "tc": [{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_htb", + "kind": "htb", "name": "prefix_matcher.o:[cls_prefix_matcher_htb]", - "prog_id": 111727, - "tag": "d08fe3b4319bc2fd", + "id": 111727, "act": [] },{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_clsact_ingress", + "kind": "clsact/ingress", "name": "fbflow_icmp", - "prog_id": 130246, - "tag": "3f265c7f26db62c9", + "id": 130246, "act": [] },{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_clsact_egress", + "kind": "clsact/egress", "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]", - "prog_id": 111726, - "tag": "99a197826974c876" + "id": 111726, },{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_clsact_egress", + "kind": "clsact/egress", "name": "cls_fg_dscp", - "prog_id": 108619, - "tag": "dc4630674fd72dcc", + "id": 108619, "act": [] },{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_clsact_egress", + "kind": "clsact/egress", "name": "fbflow_egress", - "prog_id": 130245, - "tag": "72d2d830d6888d2c" + "id": 130245, } ] } diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 0a9d2779635c..38413a16ab92 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -171,5 +171,6 @@ struct nlattr; struct ifinfomsg; struct tcmsg; int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb); -int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind); +int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind, + const char *devname, int ifindex); #endif diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 77dd73dd9ade..ed205ee57655 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -2,6 +2,7 @@ // Copyright (C) 2018 Facebook #define _GNU_SOURCE +#include #include #include #include @@ -17,8 +18,13 @@ #include "main.h" #include "netlink_dumper.h" +struct ip_devname_ifindex { + char devname[64]; + int ifindex; +}; + struct bpf_netdev_t { - int *ifindex_array; + struct ip_devname_ifindex *devices; int used_len; int array_len; int filter_idx; @@ -36,6 +42,12 @@ struct bpf_tcinfo_t { bool is_qdisc; }; +struct bpf_filter_t { + const char *kind; + const char *devname; + int ifindex; +}; + static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) { struct bpf_netdev_t *netinfo = cookie; @@ -45,11 +57,20 @@ static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) return 0; if (netinfo->used_len == netinfo->array_len) { - netinfo->ifindex_array = realloc(netinfo->ifindex_array, - (netinfo->array_len + 16) * sizeof(int)); + netinfo->devices = realloc(netinfo->devices, + (netinfo->array_len + 16) * + sizeof(struct ip_devname_ifindex)); + if (!netinfo->devices) + return -ENOMEM; + netinfo->array_len += 16; } - netinfo->ifindex_array[netinfo->used_len++] = ifinfo->ifi_index; + netinfo->devices[netinfo->used_len].ifindex = ifinfo->ifi_index; + snprintf(netinfo->devices[netinfo->used_len].devname, + sizeof(netinfo->devices[netinfo->used_len].devname), + "%s", + tb[IFLA_IFNAME] ? nla_getattr_str(tb[IFLA_IFNAME]) : ""); + netinfo->used_len++; return do_xdp_dump(ifinfo, tb); } @@ -71,13 +92,15 @@ static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb) if (tcinfo->used_len == tcinfo->array_len) { tcinfo->handle_array = realloc(tcinfo->handle_array, (tcinfo->array_len + 16) * sizeof(struct tc_kind_handle)); + if (!tcinfo->handle_array) + return -ENOMEM; + tcinfo->array_len += 16; } tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle; snprintf(tcinfo->handle_array[tcinfo->used_len].kind, sizeof(tcinfo->handle_array[tcinfo->used_len].kind), - "%s_%s", - tcinfo->is_qdisc ? "qdisc" : "class", + "%s", tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown"); tcinfo->used_len++; @@ -86,60 +109,71 @@ static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb) static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb) { - const char *kind = cookie; + const struct bpf_filter_t *filter_info = cookie; - return do_filter_dump((struct tcmsg *)msg, tb, kind); + return do_filter_dump((struct tcmsg *)msg, tb, filter_info->kind, + filter_info->devname, filter_info->ifindex); } -static int show_dev_tc_bpf(int sock, unsigned int nl_pid, int ifindex) +static int show_dev_tc_bpf(int sock, unsigned int nl_pid, + struct ip_devname_ifindex *dev) { + struct bpf_filter_t filter_info; struct bpf_tcinfo_t tcinfo; - int i, handle, ret; + int i, handle, ret = 0; tcinfo.handle_array = NULL; tcinfo.used_len = 0; tcinfo.array_len = 0; tcinfo.is_qdisc = false; - ret = nl_get_class(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg, + ret = nl_get_class(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg, &tcinfo); if (ret) - return ret; + goto out; tcinfo.is_qdisc = true; - ret = nl_get_qdisc(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg, + ret = nl_get_qdisc(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg, &tcinfo); if (ret) - return ret; + goto out; + filter_info.devname = dev->devname; + filter_info.ifindex = dev->ifindex; for (i = 0; i < tcinfo.used_len; i++) { - ret = nl_get_filter(sock, nl_pid, ifindex, + filter_info.kind = tcinfo.handle_array[i].kind; + ret = nl_get_filter(sock, nl_pid, dev->ifindex, tcinfo.handle_array[i].handle, dump_filter_nlmsg, - tcinfo.handle_array[i].kind); + &filter_info); if (ret) - return ret; + goto out; } /* root, ingress and egress handle */ handle = TC_H_ROOT; - ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, - "root"); + filter_info.kind = "root"; + ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); if (ret) - return ret; + goto out; handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); - ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, - "qdisc_clsact_ingress"); + filter_info.kind = "clsact/ingress"; + ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); if (ret) - return ret; + goto out; handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS); - ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, - "qdisc_clsact_egress"); + filter_info.kind = "clsact/egress"; + ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); if (ret) - return ret; + goto out; +out: + free(tcinfo.handle_array); return 0; } @@ -168,7 +202,7 @@ static int do_show(int argc, char **argv) return -1; } - dev_array.ifindex_array = NULL; + dev_array.devices = NULL; dev_array.used_len = 0; dev_array.array_len = 0; dev_array.filter_idx = filter_idx; @@ -176,15 +210,15 @@ static int do_show(int argc, char **argv) if (json_output) jsonw_start_array(json_wtr); NET_START_OBJECT; - NET_START_ARRAY("xdp", "\n"); + NET_START_ARRAY("xdp", "%s:\n"); ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array); NET_END_ARRAY("\n"); if (!ret) { - NET_START_ARRAY("tc_filters", "\n"); + NET_START_ARRAY("tc", "%s:\n"); for (i = 0; i < dev_array.used_len; i++) { ret = show_dev_tc_bpf(sock, nl_pid, - dev_array.ifindex_array[i]); + &dev_array.devices[i]); if (ret) break; } @@ -200,7 +234,7 @@ static int do_show(int argc, char **argv) libbpf_strerror(ret, err_buf, sizeof(err_buf)); fprintf(stderr, "Error: %s\n", err_buf); } - free(dev_array.ifindex_array); + free(dev_array.devices); close(sock); return ret; } @@ -214,7 +248,12 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %s %s { show | list } [dev ]\n" - " %s %s help\n", + " %s %s help\n" + "Note: Only xdp and tc attachments are supported now.\n" + " For progs attached to cgroups, use \"bpftool cgroup\"\n" + " to dump program attachments. For program types\n" + " sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n" + " consult iproute2.\n", bin_name, argv[-2], bin_name, argv[-2]); return 0; diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c index e12494fd1d2e..6f5e9cc6836c 100644 --- a/tools/bpf/bpftool/netlink_dumper.c +++ b/tools/bpf/bpftool/netlink_dumper.c @@ -12,12 +12,18 @@ #include "netlink_dumper.h" static void xdp_dump_prog_id(struct nlattr **tb, int attr, - const char *type) + const char *mode, + bool new_json_object) { if (!tb[attr]) return; - NET_DUMP_UINT(type, nla_getattr_u32(tb[attr])) + if (new_json_object) + NET_START_OBJECT + NET_DUMP_STR("mode", " %s", mode); + NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[attr])) + if (new_json_object) + NET_END_OBJECT } static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex, @@ -37,18 +43,26 @@ static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex, return 0; NET_START_OBJECT; - NET_DUMP_UINT("ifindex", ifindex); - if (name) - NET_DUMP_STR("devname", name); - - if (tb[IFLA_XDP_PROG_ID]) - NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[IFLA_XDP_PROG_ID])); + NET_DUMP_STR("devname", "%s", name); + NET_DUMP_UINT("ifindex", "(%d)", ifindex); if (mode == XDP_ATTACHED_MULTI) { - xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic_prog_id"); - xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "drv_prog_id"); - xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload_prog_id"); + if (json_output) { + jsonw_name(json_wtr, "multi_attachments"); + jsonw_start_array(json_wtr); + } + xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic", true); + xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "driver", true); + xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload", true); + if (json_output) + jsonw_end_array(json_wtr); + } else if (mode == XDP_ATTACHED_DRV) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "driver", false); + } else if (mode == XDP_ATTACHED_SKB) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "generic", false); + } else if (mode == XDP_ATTACHED_HW) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "offload", false); } NET_END_OBJECT_FINAL; @@ -64,26 +78,9 @@ int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb) nla_getattr_str(tb[IFLA_IFNAME])); } -static char *hexstring_n2a(const unsigned char *str, int len, - char *buf, int blen) -{ - char *ptr = buf; - int i; - - for (i = 0; i < len; i++) { - if (blen < 3) - break; - sprintf(ptr, "%02x", str[i]); - ptr += 2; - blen -= 2; - } - return buf; -} - static int do_bpf_dump_one_act(struct nlattr *attr) { struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; - char buf[256]; if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0) return -LIBBPF_ERRNO__NLPARSE; @@ -93,13 +90,11 @@ static int do_bpf_dump_one_act(struct nlattr *attr) NET_START_OBJECT_NESTED2; if (tb[TCA_ACT_BPF_NAME]) - NET_DUMP_STR("name", nla_getattr_str(tb[TCA_ACT_BPF_NAME])); + NET_DUMP_STR("name", "%s", + nla_getattr_str(tb[TCA_ACT_BPF_NAME])); if (tb[TCA_ACT_BPF_ID]) - NET_DUMP_UINT("bpf_id", nla_getattr_u32(tb[TCA_ACT_BPF_ID])); - if (tb[TCA_ACT_BPF_TAG]) - NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_ACT_BPF_TAG]), - nla_len(tb[TCA_ACT_BPF_TAG]), - buf, sizeof(buf))); + NET_DUMP_UINT("id", " id %u", + nla_getattr_u32(tb[TCA_ACT_BPF_ID])); NET_END_OBJECT_NESTED; return 0; } @@ -128,13 +123,13 @@ static int do_bpf_act_dump(struct nlattr *attr) if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0) return -LIBBPF_ERRNO__NLPARSE; - NET_START_ARRAY("act", ""); + NET_START_ARRAY("act", " %s ["); for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) { ret = do_dump_one_act(tb[act]); if (ret) break; } - NET_END_ARRAY(" "); + NET_END_ARRAY("] "); return ret; } @@ -142,20 +137,15 @@ static int do_bpf_act_dump(struct nlattr *attr) static int do_bpf_filter_dump(struct nlattr *attr) { struct nlattr *tb[TCA_BPF_MAX + 1]; - char buf[256]; int ret; if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0) return -LIBBPF_ERRNO__NLPARSE; if (tb[TCA_BPF_NAME]) - NET_DUMP_STR("name", nla_getattr_str(tb[TCA_BPF_NAME])); + NET_DUMP_STR("name", " %s", nla_getattr_str(tb[TCA_BPF_NAME])); if (tb[TCA_BPF_ID]) - NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[TCA_BPF_ID])); - if (tb[TCA_BPF_TAG]) - NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_BPF_TAG]), - nla_len(tb[TCA_BPF_TAG]), - buf, sizeof(buf))); + NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[TCA_BPF_ID])); if (tb[TCA_BPF_ACT]) { ret = do_bpf_act_dump(tb[TCA_BPF_ACT]); if (ret) @@ -165,14 +155,17 @@ static int do_bpf_filter_dump(struct nlattr *attr) return 0; } -int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind) +int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind, + const char *devname, int ifindex) { int ret = 0; if (tb[TCA_OPTIONS] && strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) { NET_START_OBJECT; - NET_DUMP_UINT("ifindex", info->tcm_ifindex); - NET_DUMP_STR("kind", kind); + if (devname[0] != '\0') + NET_DUMP_STR("devname", "%s", devname); + NET_DUMP_UINT("ifindex", "(%u)", ifindex); + NET_DUMP_STR("kind", " %s", kind); ret = do_bpf_filter_dump(tb[TCA_OPTIONS]); NET_END_OBJECT_FINAL; } diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h index 552d8851ac06..0788cfbbed0e 100644 --- a/tools/bpf/bpftool/netlink_dumper.h +++ b/tools/bpf/bpftool/netlink_dumper.h @@ -50,13 +50,13 @@ fprintf(stderr, "\n"); \ } -#define NET_START_ARRAY(name, newline) \ +#define NET_START_ARRAY(name, fmt_str) \ { \ if (json_output) { \ jsonw_name(json_wtr, name); \ jsonw_start_array(json_wtr); \ } else { \ - fprintf(stderr, "%s [%s", name, newline);\ + fprintf(stderr, fmt_str, name); \ } \ } @@ -65,31 +65,23 @@ if (json_output) \ jsonw_end_array(json_wtr); \ else \ - fprintf(stderr, "]%s", endstr); \ + fprintf(stderr, "%s", endstr); \ } -#define NET_DUMP_UINT(name, val) \ +#define NET_DUMP_UINT(name, fmt_str, val) \ { \ if (json_output) \ jsonw_uint_field(json_wtr, name, val); \ else \ - fprintf(stderr, "%s %d ", name, val); \ + fprintf(stderr, fmt_str, val); \ } -#define NET_DUMP_LLUINT(name, val) \ -{ \ - if (json_output) \ - jsonw_lluint_field(json_wtr, name, val);\ - else \ - fprintf(stderr, "%s %lld ", name, val); \ -} - -#define NET_DUMP_STR(name, str) \ +#define NET_DUMP_STR(name, fmt_str, str) \ { \ if (json_output) \ jsonw_string_field(json_wtr, name, str);\ else \ - fprintf(stderr, "%s %s ", name, str); \ + fprintf(stderr, fmt_str, str); \ } #define NET_DUMP_STR_ONLY(str) \ -- Gitee From 445e3ab88e458b9d3f55725ccbd64b372cf5220e Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:12 +0800 Subject: [PATCH 51/58] bpftool: Fix bpftool net output ANBZ: #5530 commit 53d6eb08e9f185834231d5c32499b10310cce3aa upstream Print `bpftool net` output to stdout instead of stderr. Only errors should be printed to stderr. Regular output should go to stdout and this is what all other subcommands of bpftool do, including --json and --pretty formats of `bpftool net` itself. Fixes: commit f6f3bac08ff9 ("tools/bpf: bpftool: add net support") Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- tools/bpf/bpftool/netlink_dumper.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h index 0788cfbbed0e..e3516b586a34 100644 --- a/tools/bpf/bpftool/netlink_dumper.h +++ b/tools/bpf/bpftool/netlink_dumper.h @@ -16,7 +16,7 @@ jsonw_name(json_wtr, name); \ jsonw_start_object(json_wtr); \ } else { \ - fprintf(stderr, "%s {", name); \ + fprintf(stdout, "%s {", name); \ } \ } @@ -25,7 +25,7 @@ if (json_output) \ jsonw_start_object(json_wtr); \ else \ - fprintf(stderr, "{"); \ + fprintf(stdout, "{"); \ } #define NET_END_OBJECT_NESTED \ @@ -33,7 +33,7 @@ if (json_output) \ jsonw_end_object(json_wtr); \ else \ - fprintf(stderr, "}"); \ + fprintf(stdout, "}"); \ } #define NET_END_OBJECT \ @@ -47,7 +47,7 @@ if (json_output) \ jsonw_end_object(json_wtr); \ else \ - fprintf(stderr, "\n"); \ + fprintf(stdout, "\n"); \ } #define NET_START_ARRAY(name, fmt_str) \ @@ -56,7 +56,7 @@ jsonw_name(json_wtr, name); \ jsonw_start_array(json_wtr); \ } else { \ - fprintf(stderr, fmt_str, name); \ + fprintf(stdout, fmt_str, name); \ } \ } @@ -65,7 +65,7 @@ if (json_output) \ jsonw_end_array(json_wtr); \ else \ - fprintf(stderr, "%s", endstr); \ + fprintf(stdout, "%s", endstr); \ } #define NET_DUMP_UINT(name, fmt_str, val) \ @@ -73,7 +73,7 @@ if (json_output) \ jsonw_uint_field(json_wtr, name, val); \ else \ - fprintf(stderr, fmt_str, val); \ + fprintf(stdout, fmt_str, val); \ } #define NET_DUMP_STR(name, fmt_str, str) \ @@ -81,7 +81,7 @@ if (json_output) \ jsonw_string_field(json_wtr, name, str);\ else \ - fprintf(stderr, fmt_str, str); \ + fprintf(stdout, fmt_str, str); \ } #define NET_DUMP_STR_ONLY(str) \ @@ -89,7 +89,7 @@ if (json_output) \ jsonw_string(json_wtr, str); \ else \ - fprintf(stderr, "%s ", str); \ + fprintf(stdout, "%s ", str); \ } #endif -- Gitee From f23b5d8363a27b4896bee1b11ec7712107b34306 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:13 +0800 Subject: [PATCH 52/58] tools/bpftool: copy a few net uapi headers to tools directory ANBZ: #5530 commit 49a249c387268882e8393dd1fafc51e3b21cb7a2 upstream Commit f6f3bac08ff9 ("tools/bpf: bpftool: add net support") added certain networking support to bpftool. The implementation relies on a relatively recent uapi header file linux/tc_act/tc_bpf.h on the host which contains the marco definition of TCA_ACT_BPF_ID. Unfortunately, this is not the case for all distributions. See the email message below where rhel-7.2 does not have an up-to-date linux/tc_act/tc_bpf.h. https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1799211.html Further investigation found that linux/pkt_cls.h is also needed for macro TCA_BPF_TAG. This patch fixed the issue by copying linux/tc_act/tc_bpf.h and linux/pkt_cls.h from kernel include/uapi directory to tools/include/uapi directory so building the bpftool does not depend on host system for these files. Fixes: f6f3bac08ff9 ("tools/bpf: bpftool: add net support") Reported-by: kernel test robot Cc: Li Zhijian Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- tools/include/uapi/linux/pkt_cls.h | 612 +++++++++++++++++++++++ tools/include/uapi/linux/tc_act/tc_bpf.h | 37 ++ 2 files changed, 649 insertions(+) create mode 100644 tools/include/uapi/linux/pkt_cls.h create mode 100644 tools/include/uapi/linux/tc_act/tc_bpf.h diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h new file mode 100644 index 000000000000..682cda46aa1b --- /dev/null +++ b/tools/include/uapi/linux/pkt_cls.h @@ -0,0 +1,612 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_CLS_H +#define __LINUX_PKT_CLS_H + +#include +#include + +#define TC_COOKIE_MAX_SIZE 16 + +/* Action attributes */ +enum { + TCA_ACT_UNSPEC, + TCA_ACT_KIND, + TCA_ACT_OPTIONS, + TCA_ACT_INDEX, + TCA_ACT_STATS, + TCA_ACT_PAD, + TCA_ACT_COOKIE, + __TCA_ACT_MAX +}; + +#define TCA_ACT_MAX __TCA_ACT_MAX +#define TCA_OLD_COMPAT (TCA_ACT_MAX+1) +#define TCA_ACT_MAX_PRIO 32 +#define TCA_ACT_BIND 1 +#define TCA_ACT_NOBIND 0 +#define TCA_ACT_UNBIND 1 +#define TCA_ACT_NOUNBIND 0 +#define TCA_ACT_REPLACE 1 +#define TCA_ACT_NOREPLACE 0 + +#define TC_ACT_UNSPEC (-1) +#define TC_ACT_OK 0 +#define TC_ACT_RECLASSIFY 1 +#define TC_ACT_SHOT 2 +#define TC_ACT_PIPE 3 +#define TC_ACT_STOLEN 4 +#define TC_ACT_QUEUED 5 +#define TC_ACT_REPEAT 6 +#define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 /* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ +#define TC_ACT_VALUE_MAX TC_ACT_TRAP + +/* There is a special kind of actions called "extended actions", + * which need a value parameter. These have a local opcode located in + * the highest nibble, starting from 1. The rest of the bits + * are used to carry the value. These two parts together make + * a combined opcode. + */ +#define __TC_ACT_EXT_SHIFT 28 +#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) +#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) +#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK)) +#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode) + +#define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) +#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN + +/* Action type identifiers*/ +enum { + TCA_ID_UNSPEC=0, + TCA_ID_POLICE=1, + /* other actions go here */ + __TCA_ID_MAX=255 +}; + +#define TCA_ID_MAX __TCA_ID_MAX + +struct tc_police { + __u32 index; + int action; +#define TC_POLICE_UNSPEC TC_ACT_UNSPEC +#define TC_POLICE_OK TC_ACT_OK +#define TC_POLICE_RECLASSIFY TC_ACT_RECLASSIFY +#define TC_POLICE_SHOT TC_ACT_SHOT +#define TC_POLICE_PIPE TC_ACT_PIPE + + __u32 limit; + __u32 burst; + __u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; + int refcnt; + int bindcnt; + __u32 capab; +}; + +struct tcf_t { + __u64 install; + __u64 lastuse; + __u64 expires; + __u64 firstuse; +}; + +struct tc_cnt { + int refcnt; + int bindcnt; +}; + +#define tc_gen \ + __u32 index; \ + __u32 capab; \ + int action; \ + int refcnt; \ + int bindcnt + +enum { + TCA_POLICE_UNSPEC, + TCA_POLICE_TBF, + TCA_POLICE_RATE, + TCA_POLICE_PEAKRATE, + TCA_POLICE_AVRATE, + TCA_POLICE_RESULT, + TCA_POLICE_TM, + TCA_POLICE_PAD, + __TCA_POLICE_MAX +#define TCA_POLICE_RESULT TCA_POLICE_RESULT +}; + +#define TCA_POLICE_MAX (__TCA_POLICE_MAX - 1) + +/* tca flags definitions */ +#define TCA_CLS_FLAGS_SKIP_HW (1 << 0) /* don't offload filter to HW */ +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) /* don't use filter in SW */ +#define TCA_CLS_FLAGS_IN_HW (1 << 2) /* filter is offloaded to HW */ +#define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */ +#define TCA_CLS_FLAGS_VERBOSE (1 << 4) /* verbose logging */ + +/* U32 filters */ + +#define TC_U32_HTID(h) ((h)&0xFFF00000) +#define TC_U32_USERHTID(h) (TC_U32_HTID(h)>>20) +#define TC_U32_HASH(h) (((h)>>12)&0xFF) +#define TC_U32_NODE(h) ((h)&0xFFF) +#define TC_U32_KEY(h) ((h)&0xFFFFF) +#define TC_U32_UNSPEC 0 +#define TC_U32_ROOT (0xFFF00000) + +enum { + TCA_U32_UNSPEC, + TCA_U32_CLASSID, + TCA_U32_HASH, + TCA_U32_LINK, + TCA_U32_DIVISOR, + TCA_U32_SEL, + TCA_U32_POLICE, + TCA_U32_ACT, + TCA_U32_INDEV, + TCA_U32_PCNT, + TCA_U32_MARK, + TCA_U32_FLAGS, + TCA_U32_PAD, + __TCA_U32_MAX +}; + +#define TCA_U32_MAX (__TCA_U32_MAX - 1) + +struct tc_u32_key { + __be32 mask; + __be32 val; + int off; + int offmask; +}; + +struct tc_u32_sel { + unsigned char flags; + unsigned char offshift; + unsigned char nkeys; + + __be16 offmask; + __u16 off; + short offoff; + + short hoff; + __be32 hmask; + struct tc_u32_key keys[0]; +}; + +struct tc_u32_mark { + __u32 val; + __u32 mask; + __u32 success; +}; + +struct tc_u32_pcnt { + __u64 rcnt; + __u64 rhit; + __u64 kcnts[0]; +}; + +/* Flags */ + +#define TC_U32_TERMINAL 1 +#define TC_U32_OFFSET 2 +#define TC_U32_VAROFFSET 4 +#define TC_U32_EAT 8 + +#define TC_U32_MAXDEPTH 8 + + +/* RSVP filter */ + +enum { + TCA_RSVP_UNSPEC, + TCA_RSVP_CLASSID, + TCA_RSVP_DST, + TCA_RSVP_SRC, + TCA_RSVP_PINFO, + TCA_RSVP_POLICE, + TCA_RSVP_ACT, + __TCA_RSVP_MAX +}; + +#define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 ) + +struct tc_rsvp_gpi { + __u32 key; + __u32 mask; + int offset; +}; + +struct tc_rsvp_pinfo { + struct tc_rsvp_gpi dpi; + struct tc_rsvp_gpi spi; + __u8 protocol; + __u8 tunnelid; + __u8 tunnelhdr; + __u8 pad; +}; + +/* ROUTE filter */ + +enum { + TCA_ROUTE4_UNSPEC, + TCA_ROUTE4_CLASSID, + TCA_ROUTE4_TO, + TCA_ROUTE4_FROM, + TCA_ROUTE4_IIF, + TCA_ROUTE4_POLICE, + TCA_ROUTE4_ACT, + __TCA_ROUTE4_MAX +}; + +#define TCA_ROUTE4_MAX (__TCA_ROUTE4_MAX - 1) + + +/* FW filter */ + +enum { + TCA_FW_UNSPEC, + TCA_FW_CLASSID, + TCA_FW_POLICE, + TCA_FW_INDEV, /* used by CONFIG_NET_CLS_IND */ + TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */ + TCA_FW_MASK, + __TCA_FW_MAX +}; + +#define TCA_FW_MAX (__TCA_FW_MAX - 1) + +/* TC index filter */ + +enum { + TCA_TCINDEX_UNSPEC, + TCA_TCINDEX_HASH, + TCA_TCINDEX_MASK, + TCA_TCINDEX_SHIFT, + TCA_TCINDEX_FALL_THROUGH, + TCA_TCINDEX_CLASSID, + TCA_TCINDEX_POLICE, + TCA_TCINDEX_ACT, + __TCA_TCINDEX_MAX +}; + +#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1) + +/* Flow filter */ + +enum { + FLOW_KEY_SRC, + FLOW_KEY_DST, + FLOW_KEY_PROTO, + FLOW_KEY_PROTO_SRC, + FLOW_KEY_PROTO_DST, + FLOW_KEY_IIF, + FLOW_KEY_PRIORITY, + FLOW_KEY_MARK, + FLOW_KEY_NFCT, + FLOW_KEY_NFCT_SRC, + FLOW_KEY_NFCT_DST, + FLOW_KEY_NFCT_PROTO_SRC, + FLOW_KEY_NFCT_PROTO_DST, + FLOW_KEY_RTCLASSID, + FLOW_KEY_SKUID, + FLOW_KEY_SKGID, + FLOW_KEY_VLAN_TAG, + FLOW_KEY_RXHASH, + __FLOW_KEY_MAX, +}; + +#define FLOW_KEY_MAX (__FLOW_KEY_MAX - 1) + +enum { + FLOW_MODE_MAP, + FLOW_MODE_HASH, +}; + +enum { + TCA_FLOW_UNSPEC, + TCA_FLOW_KEYS, + TCA_FLOW_MODE, + TCA_FLOW_BASECLASS, + TCA_FLOW_RSHIFT, + TCA_FLOW_ADDEND, + TCA_FLOW_MASK, + TCA_FLOW_XOR, + TCA_FLOW_DIVISOR, + TCA_FLOW_ACT, + TCA_FLOW_POLICE, + TCA_FLOW_EMATCHES, + TCA_FLOW_PERTURB, + __TCA_FLOW_MAX +}; + +#define TCA_FLOW_MAX (__TCA_FLOW_MAX - 1) + +/* Basic filter */ + +enum { + TCA_BASIC_UNSPEC, + TCA_BASIC_CLASSID, + TCA_BASIC_EMATCHES, + TCA_BASIC_ACT, + TCA_BASIC_POLICE, + __TCA_BASIC_MAX +}; + +#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1) + + +/* Cgroup classifier */ + +enum { + TCA_CGROUP_UNSPEC, + TCA_CGROUP_ACT, + TCA_CGROUP_POLICE, + TCA_CGROUP_EMATCHES, + __TCA_CGROUP_MAX, +}; + +#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1) + +/* BPF classifier */ + +#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0) + +enum { + TCA_BPF_UNSPEC, + TCA_BPF_ACT, + TCA_BPF_POLICE, + TCA_BPF_CLASSID, + TCA_BPF_OPS_LEN, + TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, + TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, + TCA_BPF_TAG, + TCA_BPF_ID, + __TCA_BPF_MAX, +}; + +#define TCA_BPF_MAX (__TCA_BPF_MAX - 1) + +/* Flower classifier */ + +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ + + TCA_FLOWER_FLAGS, + TCA_FLOWER_KEY_VLAN_ID, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + + TCA_FLOWER_KEY_TCP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST_MASK, /* be16 */ + + TCA_FLOWER_KEY_SCTP_SRC, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST, /* be16 */ + + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ + + TCA_FLOWER_KEY_FLAGS, /* be32 */ + TCA_FLOWER_KEY_FLAGS_MASK, /* be32 */ + + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ + + TCA_FLOWER_KEY_ARP_SIP, /* be32 */ + TCA_FLOWER_KEY_ARP_SIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_OP, /* u8 */ + TCA_FLOWER_KEY_ARP_OP_MASK, /* u8 */ + TCA_FLOWER_KEY_ARP_SHA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_SHA_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA_MASK, /* ETH_ALEN */ + + TCA_FLOWER_KEY_MPLS_TTL, /* u8 - 8 bits */ + TCA_FLOWER_KEY_MPLS_BOS, /* u8 - 1 bit */ + TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ + TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + + TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ + TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + + TCA_FLOWER_KEY_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_CVLAN_ID, /* be16 */ + TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_ENC_OPTS, + TCA_FLOWER_KEY_ENC_OPTS_MASK, + + TCA_FLOWER_IN_HW_COUNT, + + __TCA_FLOWER_MAX, +}; + +#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, + TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ + * attributes + */ + __TCA_FLOWER_KEY_ENC_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, /* u16 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) + +enum { + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), +}; + +/* Match-all classifier */ + +enum { + TCA_MATCHALL_UNSPEC, + TCA_MATCHALL_CLASSID, + TCA_MATCHALL_ACT, + TCA_MATCHALL_FLAGS, + __TCA_MATCHALL_MAX, +}; + +#define TCA_MATCHALL_MAX (__TCA_MATCHALL_MAX - 1) + +/* Extended Matches */ + +struct tcf_ematch_tree_hdr { + __u16 nmatches; + __u16 progid; +}; + +enum { + TCA_EMATCH_TREE_UNSPEC, + TCA_EMATCH_TREE_HDR, + TCA_EMATCH_TREE_LIST, + __TCA_EMATCH_TREE_MAX +}; +#define TCA_EMATCH_TREE_MAX (__TCA_EMATCH_TREE_MAX - 1) + +struct tcf_ematch_hdr { + __u16 matchid; + __u16 kind; + __u16 flags; + __u16 pad; /* currently unused */ +}; + +/* 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + * +-----------------------+-+-+---+ + * | Unused |S|I| R | + * +-----------------------+-+-+---+ + * + * R(2) ::= relation to next ematch + * where: 0 0 END (last ematch) + * 0 1 AND + * 1 0 OR + * 1 1 Unused (invalid) + * I(1) ::= invert result + * S(1) ::= simple payload + */ +#define TCF_EM_REL_END 0 +#define TCF_EM_REL_AND (1<<0) +#define TCF_EM_REL_OR (1<<1) +#define TCF_EM_INVERT (1<<2) +#define TCF_EM_SIMPLE (1<<3) + +#define TCF_EM_REL_MASK 3 +#define TCF_EM_REL_VALID(v) (((v) & TCF_EM_REL_MASK) != TCF_EM_REL_MASK) + +enum { + TCF_LAYER_LINK, + TCF_LAYER_NETWORK, + TCF_LAYER_TRANSPORT, + __TCF_LAYER_MAX +}; +#define TCF_LAYER_MAX (__TCF_LAYER_MAX - 1) + +/* Ematch type assignments + * 1..32767 Reserved for ematches inside kernel tree + * 32768..65535 Free to use, not reliable + */ +#define TCF_EM_CONTAINER 0 +#define TCF_EM_CMP 1 +#define TCF_EM_NBYTE 2 +#define TCF_EM_U32 3 +#define TCF_EM_META 4 +#define TCF_EM_TEXT 5 +#define TCF_EM_VLAN 6 +#define TCF_EM_CANID 7 +#define TCF_EM_IPSET 8 +#define TCF_EM_IPT 9 +#define TCF_EM_MAX 9 + +enum { + TCF_EM_PROG_TC +}; + +enum { + TCF_EM_OPND_EQ, + TCF_EM_OPND_GT, + TCF_EM_OPND_LT +}; + +#endif diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h b/tools/include/uapi/linux/tc_act/tc_bpf.h new file mode 100644 index 000000000000..6e89a5df49a4 --- /dev/null +++ b/tools/include/uapi/linux/tc_act/tc_bpf.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Copyright (c) 2015 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __LINUX_TC_BPF_H +#define __LINUX_TC_BPF_H + +#include + +#define TCA_ACT_BPF 13 + +struct tc_act_bpf { + tc_gen; +}; + +enum { + TCA_ACT_BPF_UNSPEC, + TCA_ACT_BPF_TM, + TCA_ACT_BPF_PARMS, + TCA_ACT_BPF_OPS_LEN, + TCA_ACT_BPF_OPS, + TCA_ACT_BPF_FD, + TCA_ACT_BPF_NAME, + TCA_ACT_BPF_PAD, + TCA_ACT_BPF_TAG, + TCA_ACT_BPF_ID, + __TCA_ACT_BPF_MAX, +}; +#define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1) + +#endif -- Gitee From 7a186b3d28f9f307cafe5ccd1c089e83eefabe58 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:14 +0800 Subject: [PATCH 53/58] bpf: pull in pkt_sched.h header for tooling to fix bpftool build ANBZ: #5530 commit ad6dd7a9c47ba587c0aba57f73737764cd31136f upstream Dan reported that bpftool does not compile for him: $ make tools/bpf DESCEND bpf Auto-detecting system features: .. libbfd: [ on ] .. disassembler-four-args: [ OFF ] DESCEND bpftool Auto-detecting system features: .. libbfd: [ on ] .. disassembler-four-args: [ OFF ] CC /opt/linux.git/tools/bpf/bpftool/net.o In file included from /opt/linux.git/tools/include/uapi/linux/pkt_cls.h:6:0, from /opt/linux.git/tools/include/uapi/linux/tc_act/tc_bpf.h:14, from net.c:13: net.c: In function 'show_dev_tc_bpf': net.c:164:21: error: 'TC_H_CLSACT' undeclared (first use in this function) handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); [...] Fix it by importing pkt_sched.h header copy into tooling infrastructure. Fixes: 49a249c38726 ("tools/bpftool: copy a few net uapi headers to tools directory") Fixes: f6f3bac08ff9 ("tools/bpf: bpftool: add net support") Reported-by: Dan Gilson Reference: https://bugzilla.kernel.org/show_bug.cgi?id=202315 Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- tools/include/uapi/linux/pkt_sched.h | 1163 ++++++++++++++++++++++++++ 1 file changed, 1163 insertions(+) create mode 100644 tools/include/uapi/linux/pkt_sched.h diff --git a/tools/include/uapi/linux/pkt_sched.h b/tools/include/uapi/linux/pkt_sched.h new file mode 100644 index 000000000000..25060205bf3f --- /dev/null +++ b/tools/include/uapi/linux/pkt_sched.h @@ -0,0 +1,1163 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_SCHED_H +#define __LINUX_PKT_SCHED_H + +#include + +/* Logical priority bands not depending on specific packet scheduler. + Every scheduler will map them to real traffic classes, if it has + no more precise mechanism to classify packets. + + These numbers have no special meaning, though their coincidence + with obsolete IPv6 values is not occasional :-). New IPv6 drafts + preferred full anarchy inspired by diffserv group. + + Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy + class, actually, as rule it will be handled with more care than + filler or even bulk. + */ + +#define TC_PRIO_BESTEFFORT 0 +#define TC_PRIO_FILLER 1 +#define TC_PRIO_BULK 2 +#define TC_PRIO_INTERACTIVE_BULK 4 +#define TC_PRIO_INTERACTIVE 6 +#define TC_PRIO_CONTROL 7 + +#define TC_PRIO_MAX 15 + +/* Generic queue statistics, available for all the elements. + Particular schedulers may have also their private records. + */ + +struct tc_stats { + __u64 bytes; /* Number of enqueued bytes */ + __u32 packets; /* Number of enqueued packets */ + __u32 drops; /* Packets dropped because of lack of resources */ + __u32 overlimits; /* Number of throttle events when this + * flow goes out of allocated bandwidth */ + __u32 bps; /* Current flow byte rate */ + __u32 pps; /* Current flow packet rate */ + __u32 qlen; + __u32 backlog; +}; + +struct tc_estimator { + signed char interval; + unsigned char ewma_log; +}; + +/* "Handles" + --------- + + All the traffic control objects have 32bit identifiers, or "handles". + + They can be considered as opaque numbers from user API viewpoint, + but actually they always consist of two fields: major and + minor numbers, which are interpreted by kernel specially, + that may be used by applications, though not recommended. + + F.e. qdisc handles always have minor number equal to zero, + classes (or flows) have major equal to parent qdisc major, and + minor uniquely identifying class inside qdisc. + + Macros to manipulate handles: + */ + +#define TC_H_MAJ_MASK (0xFFFF0000U) +#define TC_H_MIN_MASK (0x0000FFFFU) +#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) +#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) +#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) + +#define TC_H_UNSPEC (0U) +#define TC_H_ROOT (0xFFFFFFFFU) +#define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_PRIORITY 0xFFE0U +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U + +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + +struct tc_ratespec { + unsigned char cell_log; + __u8 linklayer; /* lower 4 bits */ + unsigned short overhead; + short cell_align; + unsigned short mpu; + __u32 rate; +}; + +#define TC_RTAB_SIZE 1024 + +struct tc_sizespec { + unsigned char cell_log; + unsigned char size_log; + short cell_align; + int overhead; + unsigned int linklayer; + unsigned int mpu; + unsigned int mtu; + unsigned int tsize; +}; + +enum { + TCA_STAB_UNSPEC, + TCA_STAB_BASE, + TCA_STAB_DATA, + __TCA_STAB_MAX +}; + +#define TCA_STAB_MAX (__TCA_STAB_MAX - 1) + +/* FIFO section */ + +struct tc_fifo_qopt { + __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ +}; + +/* SKBPRIO section */ + +/* + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1). + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able + * to map one to one the DS field of IPV4 and IPV6 headers. + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY. + */ + +#define SKBPRIO_MAX_PRIORITY 64 + +struct tc_skbprio_qopt { + __u32 limit; /* Queue length in packets. */ +}; + +/* PRIO section */ + +#define TCQ_PRIO_BANDS 16 +#define TCQ_MIN_PRIO_BANDS 2 + +struct tc_prio_qopt { + int bands; /* Number of bands */ + __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ +}; + +/* MULTIQ section */ + +struct tc_multiq_qopt { + __u16 bands; /* Number of bands */ + __u16 max_bands; /* Maximum number of queues */ +}; + +/* PLUG section */ + +#define TCQ_PLUG_BUFFER 0 +#define TCQ_PLUG_RELEASE_ONE 1 +#define TCQ_PLUG_RELEASE_INDEFINITE 2 +#define TCQ_PLUG_LIMIT 3 + +struct tc_plug_qopt { + /* TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ + int action; + __u32 limit; +}; + +/* TBF section */ + +struct tc_tbf_qopt { + struct tc_ratespec rate; + struct tc_ratespec peakrate; + __u32 limit; + __u32 buffer; + __u32 mtu; +}; + +enum { + TCA_TBF_UNSPEC, + TCA_TBF_PARMS, + TCA_TBF_RTAB, + TCA_TBF_PTAB, + TCA_TBF_RATE64, + TCA_TBF_PRATE64, + TCA_TBF_BURST, + TCA_TBF_PBURST, + TCA_TBF_PAD, + __TCA_TBF_MAX, +}; + +#define TCA_TBF_MAX (__TCA_TBF_MAX - 1) + + +/* TEQL section */ + +/* TEQL does not require any parameters */ + +/* SFQ section */ + +struct tc_sfq_qopt { + unsigned quantum; /* Bytes per round allocated to flow */ + int perturb_period; /* Period of hash perturbation */ + __u32 limit; /* Maximal packets in queue */ + unsigned divisor; /* Hash divisor */ + unsigned flows; /* Maximal number of flows */ +}; + +struct tc_sfqred_stats { + __u32 prob_drop; /* Early drops, below max threshold */ + __u32 forced_drop; /* Early drops, after max threshold */ + __u32 prob_mark; /* Marked packets, below max threshold */ + __u32 forced_mark; /* Marked packets, after max threshold */ + __u32 prob_mark_head; /* Marked packets, below max threshold */ + __u32 forced_mark_head;/* Marked packets, after max threshold */ +}; + +struct tc_sfq_qopt_v1 { + struct tc_sfq_qopt v0; + unsigned int depth; /* max number of packets per flow */ + unsigned int headdrop; +/* SFQRED parameters */ + __u32 limit; /* HARD maximal flow queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; + __u32 max_P; /* probability, high resolution */ +/* SFQRED stats */ + struct tc_sfqred_stats stats; +}; + + +struct tc_sfq_xstats { + __s32 allot; +}; + +/* RED section */ + +enum { + TCA_RED_UNSPEC, + TCA_RED_PARMS, + TCA_RED_STAB, + TCA_RED_MAX_P, + __TCA_RED_MAX, +}; + +#define TCA_RED_MAX (__TCA_RED_MAX - 1) + +struct tc_red_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; +#define TC_RED_ECN 1 +#define TC_RED_HARDDROP 2 +#define TC_RED_ADAPTATIVE 4 +}; + +struct tc_red_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ +}; + +/* GRED section */ + +#define MAX_DPs 16 + +enum { + TCA_GRED_UNSPEC, + TCA_GRED_PARMS, + TCA_GRED_STAB, + TCA_GRED_DPS, + TCA_GRED_MAX_P, + TCA_GRED_LIMIT, + TCA_GRED_VQ_LIST, /* nested TCA_GRED_VQ_ENTRY */ + __TCA_GRED_MAX, +}; + +#define TCA_GRED_MAX (__TCA_GRED_MAX - 1) + +enum { + TCA_GRED_VQ_ENTRY_UNSPEC, + TCA_GRED_VQ_ENTRY, /* nested TCA_GRED_VQ_* */ + __TCA_GRED_VQ_ENTRY_MAX, +}; +#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1) + +enum { + TCA_GRED_VQ_UNSPEC, + TCA_GRED_VQ_PAD, + TCA_GRED_VQ_DP, /* u32 */ + TCA_GRED_VQ_STAT_BYTES, /* u64 */ + TCA_GRED_VQ_STAT_PACKETS, /* u32 */ + TCA_GRED_VQ_STAT_BACKLOG, /* u32 */ + TCA_GRED_VQ_STAT_PROB_DROP, /* u32 */ + TCA_GRED_VQ_STAT_PROB_MARK, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_DROP, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_MARK, /* u32 */ + TCA_GRED_VQ_STAT_PDROP, /* u32 */ + TCA_GRED_VQ_STAT_OTHER, /* u32 */ + TCA_GRED_VQ_FLAGS, /* u32 */ + __TCA_GRED_VQ_MAX +}; + +#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1) + +struct tc_gred_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + __u32 DP; /* up to 2^32 DPs */ + __u32 backlog; + __u32 qave; + __u32 forced; + __u32 early; + __u32 other; + __u32 pdrop; + __u8 Wlog; /* log(W) */ + __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ + __u8 Scell_log; /* cell size for idle damping */ + __u8 prio; /* prio of this VQ */ + __u32 packets; + __u32 bytesin; +}; + +/* gred setup */ +struct tc_gred_sopt { + __u32 DPs; + __u32 def_DP; + __u8 grio; + __u8 flags; + __u16 pad1; +}; + +/* CHOKe section */ + +enum { + TCA_CHOKE_UNSPEC, + TCA_CHOKE_PARMS, + TCA_CHOKE_STAB, + TCA_CHOKE_MAX_P, + __TCA_CHOKE_MAX, +}; + +#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1) + +struct tc_choke_qopt { + __u32 limit; /* Hard queue length (packets) */ + __u32 qth_min; /* Min average threshold (packets) */ + __u32 qth_max; /* Max average threshold (packets) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; /* see RED flags */ +}; + +struct tc_choke_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ + __u32 matched; /* Drops due to flow match */ +}; + +/* HTB section */ +#define TC_HTB_NUMPRIO 8 +#define TC_HTB_MAXDEPTH 8 +#define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */ + +struct tc_htb_opt { + struct tc_ratespec rate; + struct tc_ratespec ceil; + __u32 buffer; + __u32 cbuffer; + __u32 quantum; + __u32 level; /* out only */ + __u32 prio; +}; +struct tc_htb_glob { + __u32 version; /* to match HTB/TC */ + __u32 rate2quantum; /* bps->quantum divisor */ + __u32 defcls; /* default class number */ + __u32 debug; /* debug flags */ + + /* stats */ + __u32 direct_pkts; /* count of non shaped packets */ +}; +enum { + TCA_HTB_UNSPEC, + TCA_HTB_PARMS, + TCA_HTB_INIT, + TCA_HTB_CTAB, + TCA_HTB_RTAB, + TCA_HTB_DIRECT_QLEN, + TCA_HTB_RATE64, + TCA_HTB_CEIL64, + TCA_HTB_PAD, + __TCA_HTB_MAX, +}; + +#define TCA_HTB_MAX (__TCA_HTB_MAX - 1) + +struct tc_htb_xstats { + __u32 lends; + __u32 borrows; + __u32 giants; /* unused since 'Make HTB scheduler work with TSO.' */ + __s32 tokens; + __s32 ctokens; +}; + +/* HFSC section */ + +struct tc_hfsc_qopt { + __u16 defcls; /* default class */ +}; + +struct tc_service_curve { + __u32 m1; /* slope of the first segment in bps */ + __u32 d; /* x-projection of the first segment in us */ + __u32 m2; /* slope of the second segment in bps */ +}; + +struct tc_hfsc_stats { + __u64 work; /* total work done */ + __u64 rtwork; /* work done by real-time criteria */ + __u32 period; /* current period */ + __u32 level; /* class level in hierarchy */ +}; + +enum { + TCA_HFSC_UNSPEC, + TCA_HFSC_RSC, + TCA_HFSC_FSC, + TCA_HFSC_USC, + __TCA_HFSC_MAX, +}; + +#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) + + +/* CBQ section */ + +#define TC_CBQ_MAXPRIO 8 +#define TC_CBQ_MAXLEVEL 8 +#define TC_CBQ_DEF_EWMA 5 + +struct tc_cbq_lssopt { + unsigned char change; + unsigned char flags; +#define TCF_CBQ_LSS_BOUNDED 1 +#define TCF_CBQ_LSS_ISOLATED 2 + unsigned char ewma_log; + unsigned char level; +#define TCF_CBQ_LSS_FLAGS 1 +#define TCF_CBQ_LSS_EWMA 2 +#define TCF_CBQ_LSS_MAXIDLE 4 +#define TCF_CBQ_LSS_MINIDLE 8 +#define TCF_CBQ_LSS_OFFTIME 0x10 +#define TCF_CBQ_LSS_AVPKT 0x20 + __u32 maxidle; + __u32 minidle; + __u32 offtime; + __u32 avpkt; +}; + +struct tc_cbq_wrropt { + unsigned char flags; + unsigned char priority; + unsigned char cpriority; + unsigned char __reserved; + __u32 allot; + __u32 weight; +}; + +struct tc_cbq_ovl { + unsigned char strategy; +#define TC_CBQ_OVL_CLASSIC 0 +#define TC_CBQ_OVL_DELAY 1 +#define TC_CBQ_OVL_LOWPRIO 2 +#define TC_CBQ_OVL_DROP 3 +#define TC_CBQ_OVL_RCLASSIC 4 + unsigned char priority2; + __u16 pad; + __u32 penalty; +}; + +struct tc_cbq_police { + unsigned char police; + unsigned char __res1; + unsigned short __res2; +}; + +struct tc_cbq_fopt { + __u32 split; + __u32 defmap; + __u32 defchange; +}; + +struct tc_cbq_xstats { + __u32 borrows; + __u32 overactions; + __s32 avgidle; + __s32 undertime; +}; + +enum { + TCA_CBQ_UNSPEC, + TCA_CBQ_LSSOPT, + TCA_CBQ_WRROPT, + TCA_CBQ_FOPT, + TCA_CBQ_OVL_STRATEGY, + TCA_CBQ_RATE, + TCA_CBQ_RTAB, + TCA_CBQ_POLICE, + __TCA_CBQ_MAX, +}; + +#define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) + +/* dsmark section */ + +enum { + TCA_DSMARK_UNSPEC, + TCA_DSMARK_INDICES, + TCA_DSMARK_DEFAULT_INDEX, + TCA_DSMARK_SET_TC_INDEX, + TCA_DSMARK_MASK, + TCA_DSMARK_VALUE, + __TCA_DSMARK_MAX, +}; + +#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1) + +/* ATM section */ + +enum { + TCA_ATM_UNSPEC, + TCA_ATM_FD, /* file/socket descriptor */ + TCA_ATM_PTR, /* pointer to descriptor - later */ + TCA_ATM_HDR, /* LL header */ + TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */ + TCA_ATM_ADDR, /* PVC address (for output only) */ + TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */ + __TCA_ATM_MAX, +}; + +#define TCA_ATM_MAX (__TCA_ATM_MAX - 1) + +/* Network emulator */ + +enum { + TCA_NETEM_UNSPEC, + TCA_NETEM_CORR, + TCA_NETEM_DELAY_DIST, + TCA_NETEM_REORDER, + TCA_NETEM_CORRUPT, + TCA_NETEM_LOSS, + TCA_NETEM_RATE, + TCA_NETEM_ECN, + TCA_NETEM_RATE64, + TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, + TCA_NETEM_SLOT_DIST, + __TCA_NETEM_MAX, +}; + +#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1) + +struct tc_netem_qopt { + __u32 latency; /* added delay (us) */ + __u32 limit; /* fifo limit (packets) */ + __u32 loss; /* random packet loss (0=none ~0=100%) */ + __u32 gap; /* re-ordering gap (0 for none) */ + __u32 duplicate; /* random packet dup (0=none ~0=100%) */ + __u32 jitter; /* random jitter in latency (us) */ +}; + +struct tc_netem_corr { + __u32 delay_corr; /* delay correlation */ + __u32 loss_corr; /* packet loss correlation */ + __u32 dup_corr; /* duplicate correlation */ +}; + +struct tc_netem_reorder { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_corrupt { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_rate { + __u32 rate; /* byte/s */ + __s32 packet_overhead; + __u32 cell_size; + __s32 cell_overhead; +}; + +struct tc_netem_slot { + __s64 min_delay; /* nsec */ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; + __s64 dist_delay; /* nsec */ + __s64 dist_jitter; /* nsec */ +}; + +enum { + NETEM_LOSS_UNSPEC, + NETEM_LOSS_GI, /* General Intuitive - 4 state model */ + NETEM_LOSS_GE, /* Gilbert Elliot models */ + __NETEM_LOSS_MAX +}; +#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) + +/* State transition probabilities for 4 state model */ +struct tc_netem_gimodel { + __u32 p13; + __u32 p31; + __u32 p32; + __u32 p14; + __u32 p23; +}; + +/* Gilbert-Elliot models */ +struct tc_netem_gemodel { + __u32 p; + __u32 r; + __u32 h; + __u32 k1; +}; + +#define NETEM_DIST_SCALE 8192 +#define NETEM_DIST_MAX 16384 + +/* DRR */ + +enum { + TCA_DRR_UNSPEC, + TCA_DRR_QUANTUM, + __TCA_DRR_MAX +}; + +#define TCA_DRR_MAX (__TCA_DRR_MAX - 1) + +struct tc_drr_stats { + __u32 deficit; +}; + +/* MQPRIO */ +#define TC_QOPT_BITMASK 15 +#define TC_QOPT_MAX_QUEUE 16 + +enum { + TC_MQPRIO_HW_OFFLOAD_NONE, /* no offload requested */ + TC_MQPRIO_HW_OFFLOAD_TCS, /* offload TCs, no queue counts */ + __TC_MQPRIO_HW_OFFLOAD_MAX +}; + +#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) + +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) + +struct tc_mqprio_qopt { + __u8 num_tc; + __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; + __u8 hw; + __u16 count[TC_QOPT_MAX_QUEUE]; + __u16 offset[TC_QOPT_MAX_QUEUE]; +}; + +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + +/* SFB */ + +enum { + TCA_SFB_UNSPEC, + TCA_SFB_PARMS, + __TCA_SFB_MAX, +}; + +#define TCA_SFB_MAX (__TCA_SFB_MAX - 1) + +/* + * Note: increment, decrement are Q0.16 fixed-point values. + */ +struct tc_sfb_qopt { + __u32 rehash_interval; /* delay between hash move, in ms */ + __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ + __u32 max; /* max len of qlen_min */ + __u32 bin_size; /* maximum queue length per bin */ + __u32 increment; /* probability increment, (d1 in Blue) */ + __u32 decrement; /* probability decrement, (d2 in Blue) */ + __u32 limit; /* max SFB queue length */ + __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ + __u32 penalty_burst; +}; + +struct tc_sfb_xstats { + __u32 earlydrop; + __u32 penaltydrop; + __u32 bucketdrop; + __u32 queuedrop; + __u32 childdrop; /* drops in child qdisc */ + __u32 marked; + __u32 maxqlen; + __u32 maxprob; + __u32 avgprob; +}; + +#define SFB_MAX_PROB 0xFFFF + +/* QFQ */ +enum { + TCA_QFQ_UNSPEC, + TCA_QFQ_WEIGHT, + TCA_QFQ_LMAX, + __TCA_QFQ_MAX +}; + +#define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1) + +struct tc_qfq_stats { + __u32 weight; + __u32 lmax; +}; + +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_TARGET, + TCA_CODEL_LIMIT, + TCA_CODEL_INTERVAL, + TCA_CODEL_ECN, + TCA_CODEL_CE_THRESHOLD, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) + +struct tc_codel_xstats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 count; /* how many drops we've done since the last time we + * entered dropping state + */ + __u32 lastcount; /* count at entry to dropping state */ + __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ + __s32 drop_next; /* time to drop next packet */ + __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ + __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ + __u32 dropping; /* are we in dropping state ? */ + __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ +}; + +/* FQ_CODEL */ + +enum { + TCA_FQ_CODEL_UNSPEC, + TCA_FQ_CODEL_TARGET, + TCA_FQ_CODEL_LIMIT, + TCA_FQ_CODEL_INTERVAL, + TCA_FQ_CODEL_ECN, + TCA_FQ_CODEL_FLOWS, + TCA_FQ_CODEL_QUANTUM, + TCA_FQ_CODEL_CE_THRESHOLD, + TCA_FQ_CODEL_DROP_BATCH_SIZE, + TCA_FQ_CODEL_MEMORY_LIMIT, + __TCA_FQ_CODEL_MAX +}; + +#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) + +enum { + TCA_FQ_CODEL_XSTATS_QDISC, + TCA_FQ_CODEL_XSTATS_CLASS, +}; + +struct tc_fq_codel_qd_stats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 drop_overlimit; /* number of time max qdisc + * packet limit was hit + */ + __u32 ecn_mark; /* number of packets we ECN marked + * instead of being dropped + */ + __u32 new_flow_count; /* number of time packets + * created a 'new flow' + */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 ce_mark; /* packets above ce_threshold */ + __u32 memory_usage; /* in bytes */ + __u32 drop_overmemory; +}; + +struct tc_fq_codel_cl_stats { + __s32 deficit; + __u32 ldelay; /* in-queue delay seen by most recently + * dequeued packet + */ + __u32 count; + __u32 lastcount; + __u32 dropping; + __s32 drop_next; +}; + +struct tc_fq_codel_xstats { + __u32 type; + union { + struct tc_fq_codel_qd_stats qdisc_stats; + struct tc_fq_codel_cl_stats class_stats; + }; +}; + +/* FQ */ + +enum { + TCA_FQ_UNSPEC, + + TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ + + TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ + + TCA_FQ_QUANTUM, /* RR quantum */ + + TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ + + TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ + + TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */ + + TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ + + TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ + + TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */ + + TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + + TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + + __TCA_FQ_MAX +}; + +#define TCA_FQ_MAX (__TCA_FQ_MAX - 1) + +struct tc_fq_qd_stats { + __u64 gc_flows; + __u64 highprio_packets; + __u64 tcp_retrans; + __u64 throttled; + __u64 flows_plimit; + __u64 pkts_too_long; + __u64 allocation_errors; + __s64 time_next_delayed_flow; + __u32 flows; + __u32 inactive_flows; + __u32 throttled_flows; + __u32 unthrottle_latency_ns; + __u64 ce_mark; /* packets above ce_threshold */ +}; + +/* Heavy-Hitter Filter */ + +enum { + TCA_HHF_UNSPEC, + TCA_HHF_BACKLOG_LIMIT, + TCA_HHF_QUANTUM, + TCA_HHF_HH_FLOWS_LIMIT, + TCA_HHF_RESET_TIMEOUT, + TCA_HHF_ADMIT_BYTES, + TCA_HHF_EVICT_TIMEOUT, + TCA_HHF_NON_HH_WEIGHT, + __TCA_HHF_MAX +}; + +#define TCA_HHF_MAX (__TCA_HHF_MAX - 1) + +struct tc_hhf_xstats { + __u32 drop_overlimit; /* number of times max qdisc packet limit + * was hit + */ + __u32 hh_overlimit; /* number of times max heavy-hitters was hit */ + __u32 hh_tot_count; /* number of captured heavy-hitters so far */ + __u32 hh_cur_count; /* number of current heavy-hitters */ +}; + +/* PIE */ +enum { + TCA_PIE_UNSPEC, + TCA_PIE_TARGET, + TCA_PIE_LIMIT, + TCA_PIE_TUPDATE, + TCA_PIE_ALPHA, + TCA_PIE_BETA, + TCA_PIE_ECN, + TCA_PIE_BYTEMODE, + __TCA_PIE_MAX +}; +#define TCA_PIE_MAX (__TCA_PIE_MAX - 1) + +struct tc_pie_xstats { + __u32 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space in queue */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ +}; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + + +/* CAKE */ +enum { + TCA_CAKE_UNSPEC, + TCA_CAKE_PAD, + TCA_CAKE_BASE_RATE64, + TCA_CAKE_DIFFSERV_MODE, + TCA_CAKE_ATM, + TCA_CAKE_FLOW_MODE, + TCA_CAKE_OVERHEAD, + TCA_CAKE_RTT, + TCA_CAKE_TARGET, + TCA_CAKE_AUTORATE, + TCA_CAKE_MEMORY, + TCA_CAKE_NAT, + TCA_CAKE_RAW, + TCA_CAKE_WASH, + TCA_CAKE_MPU, + TCA_CAKE_INGRESS, + TCA_CAKE_ACK_FILTER, + TCA_CAKE_SPLIT_GSO, + __TCA_CAKE_MAX +}; +#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) + +enum { + __TCA_CAKE_STATS_INVALID, + TCA_CAKE_STATS_PAD, + TCA_CAKE_STATS_CAPACITY_ESTIMATE64, + TCA_CAKE_STATS_MEMORY_LIMIT, + TCA_CAKE_STATS_MEMORY_USED, + TCA_CAKE_STATS_AVG_NETOFF, + TCA_CAKE_STATS_MIN_NETLEN, + TCA_CAKE_STATS_MAX_NETLEN, + TCA_CAKE_STATS_MIN_ADJLEN, + TCA_CAKE_STATS_MAX_ADJLEN, + TCA_CAKE_STATS_TIN_STATS, + TCA_CAKE_STATS_DEFICIT, + TCA_CAKE_STATS_COBALT_COUNT, + TCA_CAKE_STATS_DROPPING, + TCA_CAKE_STATS_DROP_NEXT_US, + TCA_CAKE_STATS_P_DROP, + TCA_CAKE_STATS_BLUE_TIMER_US, + __TCA_CAKE_STATS_MAX +}; +#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) + +enum { + __TCA_CAKE_TIN_STATS_INVALID, + TCA_CAKE_TIN_STATS_PAD, + TCA_CAKE_TIN_STATS_SENT_PACKETS, + TCA_CAKE_TIN_STATS_SENT_BYTES64, + TCA_CAKE_TIN_STATS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, + TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, + TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, + TCA_CAKE_TIN_STATS_BACKLOG_BYTES, + TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, + TCA_CAKE_TIN_STATS_TARGET_US, + TCA_CAKE_TIN_STATS_INTERVAL_US, + TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, + TCA_CAKE_TIN_STATS_WAY_MISSES, + TCA_CAKE_TIN_STATS_WAY_COLLISIONS, + TCA_CAKE_TIN_STATS_PEAK_DELAY_US, + TCA_CAKE_TIN_STATS_AVG_DELAY_US, + TCA_CAKE_TIN_STATS_BASE_DELAY_US, + TCA_CAKE_TIN_STATS_SPARSE_FLOWS, + TCA_CAKE_TIN_STATS_BULK_FLOWS, + TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, + TCA_CAKE_TIN_STATS_MAX_SKBLEN, + TCA_CAKE_TIN_STATS_FLOW_QUANTUM, + __TCA_CAKE_TIN_STATS_MAX +}; +#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) +#define TC_CAKE_MAX_TINS (8) + +enum { + CAKE_FLOW_NONE = 0, + CAKE_FLOW_SRC_IP, + CAKE_FLOW_DST_IP, + CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ + CAKE_FLOW_FLOWS, + CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ + CAKE_FLOW_MAX, +}; + +enum { + CAKE_DIFFSERV_DIFFSERV3 = 0, + CAKE_DIFFSERV_DIFFSERV4, + CAKE_DIFFSERV_DIFFSERV8, + CAKE_DIFFSERV_BESTEFFORT, + CAKE_DIFFSERV_PRECEDENCE, + CAKE_DIFFSERV_MAX +}; + +enum { + CAKE_ACK_NONE = 0, + CAKE_ACK_FILTER, + CAKE_ACK_AGGRESSIVE, + CAKE_ACK_MAX +}; + +enum { + CAKE_ATM_NONE = 0, + CAKE_ATM_ATM, + CAKE_ATM_PTM, + CAKE_ATM_MAX +}; + + +/* TAPRIO */ +enum { + TC_TAPRIO_CMD_SET_GATES = 0x00, + TC_TAPRIO_CMD_SET_AND_HOLD = 0x01, + TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02, +}; + +enum { + TCA_TAPRIO_SCHED_ENTRY_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */ + TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */ + __TCA_TAPRIO_SCHED_ENTRY_MAX, +}; +#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1) + +/* The format for schedule entry list is: + * [TCA_TAPRIO_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_SCHED_ENTRY] + * [TCA_TAPRIO_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] + */ +enum { + TCA_TAPRIO_SCHED_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY, + __TCA_TAPRIO_SCHED_MAX, +}; + +#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) + +enum { + TCA_TAPRIO_ATTR_UNSPEC, + TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */ + TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ + TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ + TCA_TAPRIO_PAD, + __TCA_TAPRIO_ATTR_MAX, +}; + +#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + +#endif -- Gitee From 313f0df54730f12e1b2bb67f26584d09a75c8771 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:15 +0800 Subject: [PATCH 54/58] bpftool: Fix compilation failure for net.o with older glibc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 6f02b540d7597f357bc6ee711346761045d4e108 upstream For older glibc ~2.17, #include'ing both linux/if.h and net/if.h fails due to complaints about redefinition of interface flags: CC net.o In file included from net.c:13:0: /usr/include/linux/if.h:71:2: error: redeclaration of enumerator ‘IFF_UP’ IFF_UP = 1<<0, /* sysfs */ ^ /usr/include/net/if.h:44:5: note: previous definition of ‘IFF_UP’ was here IFF_UP = 0x1, /* Interface is up. */ The issue was fixed in kernel headers in [1], but since compilation of net.c picks up system headers the problem can recur. Dropping #include resolves the issue and it is not needed for compilation anyhow. [1] https://lore.kernel.org/netdev/1461512707-23058-1-git-send-email-mikko.rapeli__34748.27880641$1462831734$gmane$org@iki.fi/ Fixes: f6f3bac08ff9 ("tools/bpf: bpftool: add net support") Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/1609948746-15369-1-git-send-email-alan.maguire@oracle.com Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- tools/bpf/bpftool/net.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index ed205ee57655..63b47b8e5e41 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include -- Gitee From 11222a3701d7b820435ada8a06cb21fe7fb2989b Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:15 +0800 Subject: [PATCH 55/58] bpf: add bpffs pretty print for program array map ANBZ: #5530 commit a7c19db38d62fc1ce797dba19936e9f81cf2b9fb upstream Added bpffs pretty print for program array map. For a particular array index, if the program array points to a valid program, the ": " will be printed out like 0: 6 which means bpf program with id "6" is installed at index "0". Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- kernel/bpf/arraymap.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0c17aab3ce5f..1b2cccff5983 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -529,6 +529,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map) fd_array_map_delete_elem(map, &i); } +static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void **elem, *ptr; + u32 prog_id; + + rcu_read_lock(); + + elem = array_map_lookup_elem(map, key); + if (elem) { + ptr = READ_ONCE(*elem); + if (ptr) { + seq_printf(m, "%u: ", *(u32 *)key); + prog_id = prog_fd_array_sys_lookup_elem(ptr); + btf_type_seq_show(map->btf, map->btf_value_type_id, + &prog_id, m); + seq_puts(m, "\n"); + } + } + + rcu_read_unlock(); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -540,7 +563,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, - .map_check_btf = map_check_no_btf, + .map_seq_show_elem = prog_array_map_seq_show_elem, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, -- Gitee From 53cae1fca0d64309a3a0a0ca467bc286b39c0a36 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:16 +0800 Subject: [PATCH 56/58] tools/bpf: bpftool: support prog array map and map of maps ANBZ: #5530 commit ad3338d2508cd7752accdd39881deced1ec2b8a1 upstream Currently, prog array map and map of maps are not supported in bpftool. This patch added the support. Different from other map types, for prog array map and map of maps, the key returned bpf_get_next_key() may not point to a valid value. So for these two map types, no error will be printed out when such a scenario happens. The following is the plain and json dump if btf is not available: $ ./bpftool map dump id 10 key: 08 00 00 00 value: 5c 01 00 00 Found 1 element $ ./bpftool -jp map dump id 10 [{ "key": ["0x08","0x00","0x00","0x00" ], "value": ["0x5c","0x01","0x00","0x00" ] }] If the BTF is available, the dump looks below: $ ./bpftool map dump id 2 [{ "key": 0, "value": 7 } ] $ ./bpftool -jp map dump id 2 [{ "key": ["0x00","0x00","0x00","0x00" ], "value": ["0x07","0x00","0x00","0x00" ], "formatted": { "key": 0, "value": 7 } }] Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: zhaoxuezhi Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1831 --- tools/bpf/bpftool/map.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index ec73d83d0d31..f41e2907a336 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -660,12 +660,6 @@ static int do_dump(int argc, char **argv) if (fd < 0) return -1; - if (map_is_map_of_maps(info.type) || map_is_map_of_progs(info.type)) { - p_err("Dumping maps of maps and program maps not supported"); - close(fd); - return -1; - } - key = malloc(info.key_size); value = alloc_value(&info); if (!key || !value) { @@ -719,7 +713,9 @@ static int do_dump(int argc, char **argv) } else { print_entry_plain(&info, key, value); } - } else { + num_elems++; + } else if (!map_is_map_of_maps(info.type) && + !map_is_map_of_progs(info.type)) { if (json_output) { jsonw_name(json_wtr, "key"); print_hex_data_json(key, info.key_size); @@ -736,7 +732,6 @@ static int do_dump(int argc, char **argv) } prev_key = key; - num_elems++; } if (json_output) -- Gitee From 48e49bbfb870fb2f99ea427b8eba26bcd9d1011a Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:17 +0800 Subject: [PATCH 57/58] sched/rt: pick_next_rt_entity(): check list_entry ANBZ: #4281 commit 7c4a5b89a0b5a57a64b601775b296abf77a9fe97 upstream. Commit 326587b84078 ("sched: fix goto retry in pick_next_task_rt()") removed any path which could make pick_next_rt_entity() return NULL. However, BUG_ON(!rt_se) in _pick_next_task_rt() (the only caller of pick_next_rt_entity()) still checks the error condition, which can never happen, since list_entry() never returns NULL. Remove the BUG_ON check, and instead emit a warning in the only possible error condition here: the queue being empty which should never happen. Fixes: 326587b84078 ("sched: fix goto retry in pick_next_task_rt()") Signed-off-by: Pietro Borrello Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20230128-list-entry-null-check-sched-v3-1-b1a71bd1ac6b@diag.uniroma1.it Signed-off-by: Erwei Deng [Fixes conflicts] Fixes: CVE-2023-1077 Signed-off-by: hengqi.hq@alibaba-inc.com Signed-off-by: Xiao Long Reviewed-by: Cruz Zhao Reviewed-by: Artie Ding Link: https://gitee.com/anolis/cloud-kernel/pulls/1459 --- kernel/sched/rt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f5f522d5151a..53e294313ab7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1583,6 +1583,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; + if (SCHED_WARN_ON(list_empty(queue))) + return NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); return next; @@ -1596,7 +1598,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) do { rt_se = pick_next_rt_entity(rq, rt_rq); - BUG_ON(!rt_se); + if (unlikely(!rt_se)) + return NULL; rt_rq = group_rt_rq(rt_se); } while (rt_rq); -- Gitee From e095f9d69eeecad128f7534fba1050b9fd49b953 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Tue, 11 Jul 2023 10:45:18 +0800 Subject: [PATCH 58/58] netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE ANBZ: #5703 commit 1240eb93f0616b21c675416516ff3d74798fdc97 upstream. [Fixes conflicts] Fixes: CVE-2023-3117 Signed-off-by: Heng Qi Signed-off-by: Xiao Long Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1864 --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3676317e8880..bf19b384f940 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2730,7 +2730,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return 0; err2: - nf_tables_rule_release(&ctx, rule); + nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_RELEASE); + nf_tables_rule_destroy(&ctx, rule); err1: for (i = 0; i < n; i++) { if (info[i].ops) { -- Gitee