From 7450cfada3594bdb15cb4e316f2dc58f5b6b109e Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:39:58 +0800
Subject: [PATCH 01/58] rds: rds_rm_zerocopy_callback() use list_first_entry()

ANBZ: #4540

commit f753a68980cf4b59a80fe677619da2b1804f526d upstream.

rds_rm_zerocopy_callback() uses list_entry() on the head of a list
causing a type confusion.
Use list_first_entry() to actually access the first element of the
rs_zcookie_queue list.

Fixes: 9426bbc6de99 ("rds: use list structure to track information for zerocopy completion notification")
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pietro Borrello <borrello@diag.uniroma1.it>
Link: https://lore.kernel.org/r/20230202-rds-zerocopy-v3-1-83b0df974f9a@diag.uniroma1.it
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-1078
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1604
---
 net/rds/message.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/rds/message.c b/net/rds/message.c
index 4b00b1152a5f..309b54cc62ae 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -104,9 +104,9 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
 	spin_lock_irqsave(&q->lock, flags);
 	head = &q->zcookie_head;
 	if (!list_empty(head)) {
-		info = list_entry(head, struct rds_msg_zcopy_info,
-				  rs_zcookie_next);
-		if (info && rds_zcookie_add(info, cookie)) {
+		info = list_first_entry(head, struct rds_msg_zcopy_info,
+					rs_zcookie_next);
+		if (rds_zcookie_add(info, cookie)) {
 			spin_unlock_irqrestore(&q->lock, flags);
 			kfree(rds_info_from_znotifier(znotif));
 			/* caller invokes rds_wake_sk_sleep() */
-- 
Gitee


From f9fd06dad1404e060d05d53515b2016cfba23144 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:39:58 +0800
Subject: [PATCH 02/58] selinux: fix race condition when computing ocontext 
 SIDs

ANBZ: #5576

commit cbfcd13be5cb2a07868afe67520ed181956579a7 upstream.

Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
Reviewed-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1773
---
 security/selinux/ss/services.c | 159 ++++++++++++++++++---------------
 1 file changed, 87 insertions(+), 72 deletions(-)

diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index f3def298a90e..12763b6c7d51 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2263,6 +2263,43 @@ size_t security_policydb_len(struct selinux_state *state)
 	return len;
 }
 
+/**
+ * ocontext_to_sid - Helper to safely get sid for an ocontext
+ * @sidtab: SID table
+ * @c: ocontext structure
+ * @index: index of the context entry (0 or 1)
+ * @out_sid: pointer to the resulting SID value
+ *
+ * For all ocontexts except OCON_ISID the SID fields are populated
+ * on-demand when needed. Since updating the SID value is an SMP-sensitive
+ * operation, this helper must be used to do that safely.
+ *
+ * WARNING: This function may return -ESTALE, indicating that the caller
+ * must retry the operation after re-acquiring the policy pointer!
+ */
+static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c,
+			   size_t index, u32 *out_sid)
+{
+	int rc;
+	u32 sid;
+
+	/* Ensure the associated sidtab entry is visible to this thread. */
+	sid = smp_load_acquire(&c->sid[index]);
+	if (!sid) {
+		rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid);
+		if (rc)
+			return rc;
+
+		/*
+		 * Ensure the new sidtab entry is visible to other threads
+		 * when they see the SID.
+		 */
+		smp_store_release(&c->sid[index], sid);
+	}
+	*out_sid = sid;
+	return 0;
+}
+
 /**
  * security_port_sid - Obtain the SID for a port.
  * @protocol: protocol number
@@ -2275,10 +2312,12 @@ int security_port_sid(struct selinux_state *state,
 	struct policydb *policydb;
 	struct sidtab *sidtab;
 	struct ocontext *c;
-	int rc = 0;
+	int rc;
 
 	read_lock(&state->ss->policy_rwlock);
 
+retry:
+	rc = 0;
 	policydb = &state->ss->policydb;
 	sidtab = &state->ss->sidtab;
 
@@ -2292,14 +2331,11 @@ int security_port_sid(struct selinux_state *state,
 	}
 
 	if (c) {
-		if (!c->sid[0]) {
-			rc = sidtab_context_to_sid(sidtab,
-						   &c->context[0],
-						   &c->sid[0]);
-			if (rc)
-				goto out;
-		}
-		*out_sid = c->sid[0];
+		rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+		if (rc == -ESTALE)
+			goto retry;
+		if (rc)
+			goto out;
 	} else {
 		*out_sid = SECINITSID_PORT;
 	}
@@ -2321,10 +2357,12 @@ int security_ib_pkey_sid(struct selinux_state *state,
 	struct policydb *policydb;
 	struct sidtab *sidtab;
 	struct ocontext *c;
-	int rc = 0;
+	int rc;
 
 	read_lock(&state->ss->policy_rwlock);
 
+retry:
+	rc = 0;
 	policydb = &state->ss->policydb;
 	sidtab = &state->ss->sidtab;
 
@@ -2339,14 +2377,11 @@ int security_ib_pkey_sid(struct selinux_state *state,
 	}
 
 	if (c) {
-		if (!c->sid[0]) {
-			rc = sidtab_context_to_sid(sidtab,
-						   &c->context[0],
-						   &c->sid[0]);
-			if (rc)
-				goto out;
-		}
-		*out_sid = c->sid[0];
+		rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+		if (rc == -ESTALE)
+			goto retry;
+		if (rc)
+			goto out;
 	} else
 		*out_sid = SECINITSID_UNLABELED;
 
@@ -2367,10 +2402,12 @@ int security_ib_endport_sid(struct selinux_state *state,
 	struct policydb *policydb;
 	struct sidtab *sidtab;
 	struct ocontext *c;
-	int rc = 0;
+	int rc;
 
 	read_lock(&state->ss->policy_rwlock);
 
+retry:
+	rc = 0;
 	policydb = &state->ss->policydb;
 	sidtab = &state->ss->sidtab;
 
@@ -2386,14 +2423,11 @@ int security_ib_endport_sid(struct selinux_state *state,
 	}
 
 	if (c) {
-		if (!c->sid[0]) {
-			rc = sidtab_context_to_sid(sidtab,
-						   &c->context[0],
-						   &c->sid[0]);
-			if (rc)
-				goto out;
-		}
-		*out_sid = c->sid[0];
+		rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+		if (rc == -ESTALE)
+			goto retry;
+		if (rc)
+			goto out;
 	} else
 		*out_sid = SECINITSID_UNLABELED;
 
@@ -2412,11 +2446,13 @@ int security_netif_sid(struct selinux_state *state,
 {
 	struct policydb *policydb;
 	struct sidtab *sidtab;
-	int rc = 0;
+	int rc;
 	struct ocontext *c;
 
 	read_lock(&state->ss->policy_rwlock);
 
+retry:
+	rc = 0;
 	policydb = &state->ss->policydb;
 	sidtab = &state->ss->sidtab;
 
@@ -2428,19 +2464,11 @@ int security_netif_sid(struct selinux_state *state,
 	}
 
 	if (c) {
-		if (!c->sid[0] || !c->sid[1]) {
-			rc = sidtab_context_to_sid(sidtab,
-						  &c->context[0],
-						  &c->sid[0]);
-			if (rc)
-				goto out;
-			rc = sidtab_context_to_sid(sidtab,
-						   &c->context[1],
-						   &c->sid[1]);
-			if (rc)
-				goto out;
-		}
-		*if_sid = c->sid[0];
+		rc = ocontext_to_sid(sidtab, c, 0, if_sid);
+		if (rc == -ESTALE)
+			goto retry;
+		if (rc)
+			goto out;
 	} else
 		*if_sid = SECINITSID_NETIF;
 
@@ -2482,6 +2510,7 @@ int security_node_sid(struct selinux_state *state,
 
 	read_lock(&state->ss->policy_rwlock);
 
+retry:
 	policydb = &state->ss->policydb;
 	sidtab = &state->ss->sidtab;
 
@@ -2524,14 +2553,11 @@ int security_node_sid(struct selinux_state *state,
 	}
 
 	if (c) {
-		if (!c->sid[0]) {
-			rc = sidtab_context_to_sid(sidtab,
-						   &c->context[0],
-						   &c->sid[0]);
-			if (rc)
-				goto out;
-		}
-		*out_sid = c->sid[0];
+		rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+		if (rc == -ESTALE)
+			goto retry;
+		if (rc)
+			goto out;
 	} else {
 		*out_sid = SECINITSID_NODE;
 	}
@@ -2690,7 +2716,7 @@ static inline int __security_genfs_sid(struct selinux_state *state,
 	u16 sclass;
 	struct genfs *genfs;
 	struct ocontext *c;
-	int rc, cmp = 0;
+	int cmp = 0;
 
 	while (path[0] == '/' && path[1] == '/')
 		path++;
@@ -2704,9 +2730,8 @@ static inline int __security_genfs_sid(struct selinux_state *state,
 			break;
 	}
 
-	rc = -ENOENT;
 	if (!genfs || cmp)
-		goto out;
+		return -ENOENT;
 
 	for (c = genfs->head; c; c = c->next) {
 		len = strlen(c->u.name);
@@ -2715,20 +2740,10 @@ static inline int __security_genfs_sid(struct selinux_state *state,
 			break;
 	}
 
-	rc = -ENOENT;
 	if (!c)
-		goto out;
-
-	if (!c->sid[0]) {
-		rc = sidtab_context_to_sid(sidtab, &c->context[0], &c->sid[0]);
-		if (rc)
-			goto out;
-	}
+		return -ENOENT;
 
-	*sid = c->sid[0];
-	rc = 0;
-out:
-	return rc;
+	return ocontext_to_sid(sidtab, c, 0, sid);
 }
 
 /**
@@ -2763,13 +2778,15 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb)
 {
 	struct policydb *policydb;
 	struct sidtab *sidtab;
-	int rc = 0;
+	int rc;
 	struct ocontext *c;
 	struct superblock_security_struct *sbsec = sb->s_security;
 	const char *fstype = sb->s_type->name;
 
 	read_lock(&state->ss->policy_rwlock);
 
+retry:
+	rc = 0;
 	policydb = &state->ss->policydb;
 	sidtab = &state->ss->sidtab;
 
@@ -2782,13 +2799,11 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb)
 
 	if (c) {
 		sbsec->behavior = c->v.behavior;
-		if (!c->sid[0]) {
-			rc = sidtab_context_to_sid(sidtab, &c->context[0],
-						   &c->sid[0]);
-			if (rc)
-				goto out;
-		}
-		sbsec->sid = c->sid[0];
+		rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid);
+		if (rc == -ESTALE)
+			goto retry;
+		if (rc)
+			goto out;
 	} else {
 		rc = __security_genfs_sid(state, fstype, "/", SECCLASS_DIR,
 					  &sbsec->sid);
-- 
Gitee


From 62fa27cd3b56870ea07a0c5cb1171c5568ccbe1c Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:39:59 +0800
Subject: [PATCH 03/58] fs: hfsplus: fix UAF issue in hfsplus_put_super

ANBZ: #5381

commit 07db5e247ab5858439b14dd7cc1fe538b9efcf32 upstream.

The current hfsplus_put_super first calls hfs_btree_close on
sbi->ext_tree, then invokes iput on sbi->hidden_dir, resulting in an
use-after-free issue in hfsplus_release_folio.

As shown in hfsplus_fill_super, the error handling code also calls iput
before hfs_btree_close.

To fix this error, we move all iput calls before hfsplus_btree_close.

Note that this patch is tested on Syzbot.

Link: https://lkml.kernel.org/r/20230226124948.3175736-1-mudongliangabcd@gmail.com
Reported-by: syzbot+57e3e98f7e3b80f64d56@syzkaller.appspotmail.com
Tested-by: Dongliang Mu <mudongliangabcd@gmail.com>
Signed-off-by: Dongliang Mu <mudongliangabcd@gmail.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Fixes: CVE-2023-2985
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1777
---
 fs/hfsplus/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb4535eba95d..3b1356b10a47 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -294,11 +294,11 @@ static void hfsplus_put_super(struct super_block *sb)
 		hfsplus_sync_fs(sb, 1);
 	}
 
+	iput(sbi->alloc_file);
+	iput(sbi->hidden_dir);
 	hfs_btree_close(sbi->attr_tree);
 	hfs_btree_close(sbi->cat_tree);
 	hfs_btree_close(sbi->ext_tree);
-	iput(sbi->alloc_file);
-	iput(sbi->hidden_dir);
 	kfree(sbi->s_vhdr_buf);
 	kfree(sbi->s_backup_vhdr_buf);
 	unload_nls(sbi->nls);
-- 
Gitee


From 13ff6e1dcb2e111aea2a3b3b75e2f7a41dd58ad3 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:41:48 +0800
Subject: [PATCH 04/58] ntfs: fix use-after-free in ntfs_ucsncmp()

ANBZ: #5368

commit 38c9c22a85aeed28d0831f230136e9cf6fa2ed44 upstream.

Syzkaller reported use-after-free bug as follows:

==================================================================
BUG: KASAN: use-after-free in ntfs_ucsncmp+0x123/0x130
Read of size 2 at addr ffff8880751acee8 by task a.out/879

CPU: 7 PID: 879 Comm: a.out Not tainted 5.19.0-rc4-next-20220630-00001-gcc5218c8bd2c-dirty #7
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x1c0/0x2b0
 print_address_description.constprop.0.cold+0xd4/0x484
 print_report.cold+0x55/0x232
 kasan_report+0xbf/0xf0
 ntfs_ucsncmp+0x123/0x130
 ntfs_are_names_equal.cold+0x2b/0x41
 ntfs_attr_find+0x43b/0xb90
 ntfs_attr_lookup+0x16d/0x1e0
 ntfs_read_locked_attr_inode+0x4aa/0x2360
 ntfs_attr_iget+0x1af/0x220
 ntfs_read_locked_inode+0x246c/0x5120
 ntfs_iget+0x132/0x180
 load_system_files+0x1cc6/0x3480
 ntfs_fill_super+0xa66/0x1cf0
 mount_bdev+0x38d/0x460
 legacy_get_tree+0x10d/0x220
 vfs_get_tree+0x93/0x300
 do_new_mount+0x2da/0x6d0
 path_mount+0x496/0x19d0
 __x64_sys_mount+0x284/0x300
 do_syscall_64+0x3b/0xc0
 entry_SYSCALL_64_after_hwframe+0x46/0xb0
RIP: 0033:0x7f3f2118d9ea
Code: 48 8b 0d a9 f4 0b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 76 f4 0b 00 f7 d8 64 89 01 48
RSP: 002b:00007ffc269deac8 EFLAGS: 00000202 ORIG_RAX: 00000000000000a5
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3f2118d9ea
RDX: 0000000020000000 RSI: 0000000020000100 RDI: 00007ffc269dec00
RBP: 00007ffc269dec80 R08: 00007ffc269deb00 R09: 00007ffc269dec44
R10: 0000000000000000 R11: 0000000000000202 R12: 000055f81ab1d220
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
 </TASK>

The buggy address belongs to the physical page:
page:0000000085430378 refcount:1 mapcount:1 mapping:0000000000000000 index:0x555c6a81d pfn:0x751ac
memcg:ffff888101f7e180
anon flags: 0xfffffc00a0014(uptodate|lru|mappedtodisk|swapbacked|node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc00a0014 ffffea0001bf2988 ffffea0001de2448 ffff88801712e201
raw: 0000000555c6a81d 0000000000000000 0000000100000000 ffff888101f7e180
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8880751acd80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff8880751ace00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>ffff8880751ace80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
                                                          ^
 ffff8880751acf00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff8880751acf80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================

The reason is that struct ATTR_RECORD->name_offset is 6485, end address of
name string is out of bounds.

Fix this by adding sanity check on end address of attribute name string.

[akpm@linux-foundation.org: coding-style cleanups]
[chenxiaosong2@huawei.com: cleanup suggested by Hawkins Jiawei]
  Link: https://lkml.kernel.org/r/20220709064511.3304299-1-chenxiaosong2@huawei.com
Link: https://lkml.kernel.org/r/20220707105329.4020708-1-chenxiaosong2@huawei.com
Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
Signed-off-by: Hawkins Jiawei <yin31149@gmail.com>
Cc: Anton Altaparmakov <anton@tuxera.com>
Cc: ChenXiaoSong <chenxiaosong2@huawei.com>
Cc: Yongqiang Liu <liuyongqiang13@huawei.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Cc: Zhang Xiaoxu <zhangxiaoxu5@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1779
---
 fs/ntfs/attrib.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 44a39a099b54..62b49197e5f6 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -606,8 +606,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
 		a = (ATTR_RECORD*)((u8*)ctx->attr +
 				le32_to_cpu(ctx->attr->length));
 	for (;;	a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
-		if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
-				le32_to_cpu(ctx->mrec->bytes_allocated))
+		u8 *mrec_end = (u8 *)ctx->mrec +
+		               le32_to_cpu(ctx->mrec->bytes_allocated);
+		u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+			       a->name_length * sizeof(ntfschar);
+		if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end ||
+		    name_end > mrec_end)
 			break;
 		ctx->attr = a;
 		if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
-- 
Gitee


From 425755e268c246d7c30268cce192880b8d742998 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:41:49 +0800
Subject: [PATCH 05/58] ntfs: fix out-of-bounds read in ntfs_attr_find()

ANBZ: #5368

commit 36a4d82dddbbd421d2b8e79e1cab68c8126d5075 upstream.

Kernel iterates over ATTR_RECORDs in mft record in ntfs_attr_find().  To
ensure access on these ATTR_RECORDs are within bounds, kernel will do some
checking during iteration.

The problem is that during checking whether ATTR_RECORD's name is within
bounds, kernel will dereferences the ATTR_RECORD name_offset field, before
checking this ATTR_RECORD strcture is within bounds.  This problem may
result out-of-bounds read in ntfs_attr_find(), reported by Syzkaller:

==================================================================
BUG: KASAN: use-after-free in ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597
Read of size 2 at addr ffff88807e352009 by task syz-executor153/3607

[...]
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
 print_address_description mm/kasan/report.c:317 [inline]
 print_report.cold+0x2ba/0x719 mm/kasan/report.c:433
 kasan_report+0xb1/0x1e0 mm/kasan/report.c:495
 ntfs_attr_find+0xc02/0xce0 fs/ntfs/attrib.c:597
 ntfs_attr_lookup+0x1056/0x2070 fs/ntfs/attrib.c:1193
 ntfs_read_inode_mount+0x89a/0x2580 fs/ntfs/inode.c:1845
 ntfs_fill_super+0x1799/0x9320 fs/ntfs/super.c:2854
 mount_bdev+0x34d/0x410 fs/super.c:1400
 legacy_get_tree+0x105/0x220 fs/fs_context.c:610
 vfs_get_tree+0x89/0x2f0 fs/super.c:1530
 do_new_mount fs/namespace.c:3040 [inline]
 path_mount+0x1326/0x1e20 fs/namespace.c:3370
 do_mount fs/namespace.c:3383 [inline]
 __do_sys_mount fs/namespace.c:3591 [inline]
 __se_sys_mount fs/namespace.c:3568 [inline]
 __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
 [...]
 </TASK>

The buggy address belongs to the physical page:
page:ffffea0001f8d400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7e350
head:ffffea0001f8d400 order:3 compound_mapcount:0 compound_pincount:0
flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888011842140
raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
 ffff88807e351f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff88807e351f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff88807e352000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                      ^
 ffff88807e352080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff88807e352100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================

This patch solves it by moving the ATTR_RECORD strcture's bounds checking
earlier, then checking whether ATTR_RECORD's name is within bounds.
What's more, this patch also add some comments to improve its
maintainability.

Link: https://lkml.kernel.org/r/20220831160935.3409-3-yin31149@gmail.com
Link: https://lore.kernel.org/all/1636796c-c85e-7f47-e96f-e074fee3c7d3@huawei.com/
Link: https://groups.google.com/g/syzkaller-bugs/c/t_XdeKPGTR4/m/LECAuIGcBgAJ
Signed-off-by: chenxiaosong (A) <chenxiaosong2@huawei.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Hawkins Jiawei <yin31149@gmail.com>
Reported-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com
Tested-by: syzbot+5f8dcabe4a3b2c51c607@syzkaller.appspotmail.com
Cc: Anton Altaparmakov <anton@tuxera.com>
Cc: syzkaller-bugs <syzkaller-bugs@googlegroups.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fixes: CVE-2023-26607
Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1779
---
 fs/ntfs/attrib.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 62b49197e5f6..122a9b45b350 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -608,11 +608,23 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
 	for (;;	a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
 		u8 *mrec_end = (u8 *)ctx->mrec +
 		               le32_to_cpu(ctx->mrec->bytes_allocated);
-		u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
-			       a->name_length * sizeof(ntfschar);
-		if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end ||
-		    name_end > mrec_end)
+		u8 *name_end;
+
+		/* check whether ATTR_RECORD wrap */
+		if ((u8 *)a < (u8 *)ctx->mrec)
+			break;
+
+		/* check whether Attribute Record Header is within bounds */
+		if ((u8 *)a > mrec_end ||
+		    (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
 			break;
+
+		/* check whether ATTR_RECORD's name is within bounds */
+		name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+			   a->name_length * sizeof(ntfschar);
+		if (name_end > mrec_end)
+			break;
+
 		ctx->attr = a;
 		if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
 				a->type == AT_END))
-- 
Gitee


From 184f3bf1fb6688e0d1552c31b59caf25c333edff Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:41:50 +0800
Subject: [PATCH 06/58] net/sched: Retire tcindex classifier

ANBZ: #4784

commit 8c710f75256bb3cf05ac7b1672c82b92c43f3d28 upstream.

commit 8c710f75256bb3cf05ac7b1672c82b92c43f3d28 upstream.

The tcindex classifier has served us well for about a quarter of a century
but has not been getting much TLC due to lack of known users. Most recently
it has become easy prey to syzkaller. For this reason, we are retiring it.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Qiao Ma <mqaio@linux.alibaba.com>

[Fixes conflicts]
Fixes: CVE-2023-1829
Signed-off-by: None <>
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1771
---
 net/sched/Kconfig       |  11 -
 net/sched/Makefile      |   1 -
 net/sched/cls_tcindex.c | 715 ----------------------------------------
 3 files changed, 727 deletions(-)
 delete mode 100644 net/sched/cls_tcindex.c

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e95741388311..4547022ed7f4 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -458,17 +458,6 @@ config NET_CLS_BASIC
 	  To compile this code as a module, choose M here: the
 	  module will be called cls_basic.
 
-config NET_CLS_TCINDEX
-	tristate "Traffic-Control Index (TCINDEX)"
-	select NET_CLS
-	---help---
-	  Say Y here if you want to be able to classify packets based on
-	  traffic control indices. You will want this feature if you want
-	  to implement Differentiated Services together with DSMARK.
-
-	  To compile this code as a module, choose M here: the
-	  module will be called cls_tcindex.
-
 config NET_CLS_ROUTE4
 	tristate "Routing decision (ROUTE)"
 	depends on INET
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f0403f49edcb..5eed580cdb42 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -62,7 +62,6 @@ obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
 obj-$(CONFIG_NET_CLS_RSVP)	+= cls_rsvp.o
-obj-$(CONFIG_NET_CLS_TCINDEX)	+= cls_tcindex.o
 obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o
 obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o
 obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
deleted file mode 100644
index 7735404da572..000000000000
--- a/net/sched/cls_tcindex.c
+++ /dev/null
@@ -1,715 +0,0 @@
-/*
- * net/sched/cls_tcindex.c	Packet classifier for skb->tc_index
- *
- * Written 1998,1999 by Werner Almesberger, EPFL ICA
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-#include <net/act_api.h>
-#include <net/netlink.h>
-#include <net/pkt_cls.h>
-#include <net/sch_generic.h>
-
-/*
- * Passing parameters to the root seems to be done more awkwardly than really
- * necessary. At least, u32 doesn't seem to use such dirty hacks. To be
- * verified. FIXME.
- */
-
-#define PERFECT_HASH_THRESHOLD	64	/* use perfect hash if not bigger */
-#define DEFAULT_HASH_SIZE	64	/* optimized for diffserv */
-
-
-struct tcindex_filter_result {
-	struct tcf_exts		exts;
-	struct tcf_result	res;
-	struct rcu_work		rwork;
-};
-
-struct tcindex_filter {
-	u16 key;
-	struct tcindex_filter_result result;
-	struct tcindex_filter __rcu *next;
-	struct rcu_work rwork;
-};
-
-
-struct tcindex_data {
-	struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
-	struct tcindex_filter __rcu **h; /* imperfect hash; */
-	struct tcf_proto *tp;
-	u16 mask;		/* AND key with mask */
-	u32 shift;		/* shift ANDed key to the right */
-	u32 hash;		/* hash table size; 0 if undefined */
-	u32 alloc_hash;		/* allocated size */
-	u32 fall_through;	/* 0: only classify if explicit match */
-	struct rcu_work rwork;
-};
-
-static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
-{
-	return tcf_exts_has_actions(&r->exts) || r->res.classid;
-}
-
-static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
-						    u16 key)
-{
-	if (p->perfect) {
-		struct tcindex_filter_result *f = p->perfect + key;
-
-		return tcindex_filter_is_set(f) ? f : NULL;
-	} else if (p->h) {
-		struct tcindex_filter __rcu **fp;
-		struct tcindex_filter *f;
-
-		fp = &p->h[key % p->hash];
-		for (f = rcu_dereference_bh_rtnl(*fp);
-		     f;
-		     fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
-			if (f->key == key)
-				return &f->result;
-	}
-
-	return NULL;
-}
-
-
-static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-			    struct tcf_result *res)
-{
-	struct tcindex_data *p = rcu_dereference_bh(tp->root);
-	struct tcindex_filter_result *f;
-	int key = (skb->tc_index & p->mask) >> p->shift;
-
-	pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
-		 skb, tp, res, p);
-
-	f = tcindex_lookup(p, key);
-	if (!f) {
-		struct Qdisc *q = tcf_block_q(tp->chain->block);
-
-		if (!p->fall_through)
-			return -1;
-		res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key);
-		res->class = 0;
-		pr_debug("alg 0x%x\n", res->classid);
-		return 0;
-	}
-	*res = f->res;
-	pr_debug("map 0x%x\n", res->classid);
-
-	return tcf_exts_exec(skb, &f->exts, res);
-}
-
-
-static void *tcindex_get(struct tcf_proto *tp, u32 handle)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter_result *r;
-
-	pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
-	if (p->perfect && handle >= p->alloc_hash)
-		return NULL;
-	r = tcindex_lookup(p, handle);
-	return r && tcindex_filter_is_set(r) ? r : NULL;
-}
-
-static int tcindex_init(struct tcf_proto *tp)
-{
-	struct tcindex_data *p;
-
-	pr_debug("tcindex_init(tp %p)\n", tp);
-	p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL);
-	if (!p)
-		return -ENOMEM;
-
-	p->mask = 0xffff;
-	p->hash = DEFAULT_HASH_SIZE;
-	p->fall_through = 1;
-
-	rcu_assign_pointer(tp->root, p);
-	return 0;
-}
-
-static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
-{
-	tcf_exts_destroy(&r->exts);
-	tcf_exts_put_net(&r->exts);
-}
-
-static void tcindex_destroy_rexts_work(struct work_struct *work)
-{
-	struct tcindex_filter_result *r;
-
-	r = container_of(to_rcu_work(work),
-			 struct tcindex_filter_result,
-			 rwork);
-	rtnl_lock();
-	__tcindex_destroy_rexts(r);
-	rtnl_unlock();
-}
-
-static void __tcindex_destroy_fexts(struct tcindex_filter *f)
-{
-	tcf_exts_destroy(&f->result.exts);
-	tcf_exts_put_net(&f->result.exts);
-	kfree(f);
-}
-
-static void tcindex_destroy_fexts_work(struct work_struct *work)
-{
-	struct tcindex_filter *f = container_of(to_rcu_work(work),
-						struct tcindex_filter,
-						rwork);
-
-	rtnl_lock();
-	__tcindex_destroy_fexts(f);
-	rtnl_unlock();
-}
-
-static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
-			  struct netlink_ext_ack *extack)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter_result *r = arg;
-	struct tcindex_filter __rcu **walk;
-	struct tcindex_filter *f = NULL;
-
-	pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p);
-	if (p->perfect) {
-		if (!r->res.class)
-			return -ENOENT;
-	} else {
-		int i;
-
-		for (i = 0; i < p->hash; i++) {
-			walk = p->h + i;
-			for (f = rtnl_dereference(*walk); f;
-			     walk = &f->next, f = rtnl_dereference(*walk)) {
-				if (&f->result == r)
-					goto found;
-			}
-		}
-		return -ENOENT;
-
-found:
-		rcu_assign_pointer(*walk, rtnl_dereference(f->next));
-	}
-	tcf_unbind_filter(tp, &r->res);
-	/* all classifiers are required to call tcf_exts_destroy() after rcu
-	 * grace period, since converted-to-rcu actions are relying on that
-	 * in cleanup() callback
-	 */
-	if (f) {
-		if (tcf_exts_get_net(&f->result.exts))
-			tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work);
-		else
-			__tcindex_destroy_fexts(f);
-	} else {
-		if (tcf_exts_get_net(&r->exts))
-			tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
-		else
-			__tcindex_destroy_rexts(r);
-	}
-
-	*last = false;
-	return 0;
-}
-
-static void tcindex_destroy_work(struct work_struct *work)
-{
-	struct tcindex_data *p = container_of(to_rcu_work(work),
-					      struct tcindex_data,
-					      rwork);
-
-	kfree(p->perfect);
-	kfree(p->h);
-	kfree(p);
-}
-
-static inline int
-valid_perfect_hash(struct tcindex_data *p)
-{
-	return  p->hash > (p->mask >> p->shift);
-}
-
-static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
-	[TCA_TCINDEX_HASH]		= { .type = NLA_U32 },
-	[TCA_TCINDEX_MASK]		= { .type = NLA_U16 },
-	[TCA_TCINDEX_SHIFT]		= { .type = NLA_U32 },
-	[TCA_TCINDEX_FALL_THROUGH]	= { .type = NLA_U32 },
-	[TCA_TCINDEX_CLASSID]		= { .type = NLA_U32 },
-};
-
-static int tcindex_filter_result_init(struct tcindex_filter_result *r)
-{
-	memset(r, 0, sizeof(*r));
-	return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
-}
-
-static void tcindex_partial_destroy_work(struct work_struct *work)
-{
-	struct tcindex_data *p = container_of(to_rcu_work(work),
-					      struct tcindex_data,
-					      rwork);
-
-	kfree(p->perfect);
-	kfree(p);
-}
-
-static void tcindex_free_perfect_hash(struct tcindex_data *cp)
-{
-	int i;
-
-	for (i = 0; i < cp->hash; i++)
-		tcf_exts_destroy(&cp->perfect[i].exts);
-	kfree(cp->perfect);
-}
-
-static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
-{
-	int i, err = 0;
-
-	cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result),
-			      GFP_KERNEL);
-	if (!cp->perfect)
-		return -ENOMEM;
-
-	for (i = 0; i < cp->hash; i++) {
-		err = tcf_exts_init(&cp->perfect[i].exts,
-				    TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
-		if (err < 0)
-			goto errout;
-#ifdef CONFIG_NET_CLS_ACT
-		cp->perfect[i].exts.net = net;
-#endif
-	}
-
-	return 0;
-
-errout:
-	tcindex_free_perfect_hash(cp);
-	return err;
-}
-
-static int
-tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
-		  u32 handle, struct tcindex_data *p,
-		  struct tcindex_filter_result *r, struct nlattr **tb,
-		  struct nlattr *est, bool ovr, struct netlink_ext_ack *extack)
-{
-	struct tcindex_filter_result new_filter_result, *old_r = r;
-	struct tcindex_data *cp = NULL, *oldp;
-	struct tcindex_filter *f = NULL; /* make gcc behave */
-	struct tcf_result cr = {};
-	int err, balloc = 0;
-	struct tcf_exts e;
-	bool update_h = false;
-
-	err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
-	if (err < 0)
-		return err;
-	err = tcf_exts_validate(net, tp, tb, est, &e, ovr, extack);
-	if (err < 0)
-		goto errout;
-
-	err = -ENOMEM;
-	/* tcindex_data attributes must look atomic to classifier/lookup so
-	 * allocate new tcindex data and RCU assign it onto root. Keeping
-	 * perfect hash and hash pointers from old data.
-	 */
-	cp = kzalloc(sizeof(*cp), GFP_KERNEL);
-	if (!cp)
-		goto errout;
-
-	cp->mask = p->mask;
-	cp->shift = p->shift;
-	cp->hash = p->hash;
-	cp->alloc_hash = p->alloc_hash;
-	cp->fall_through = p->fall_through;
-	cp->tp = tp;
-
-	if (p->perfect) {
-		int i;
-
-		if (tcindex_alloc_perfect_hash(net, cp) < 0)
-			goto errout;
-		for (i = 0; i < cp->hash; i++)
-			cp->perfect[i].res = p->perfect[i].res;
-		balloc = 1;
-	}
-	cp->h = p->h;
-
-	err = tcindex_filter_result_init(&new_filter_result);
-	if (err < 0)
-		goto errout1;
-	if (old_r)
-		cr = r->res;
-
-	if (tb[TCA_TCINDEX_HASH])
-		cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
-
-	if (tb[TCA_TCINDEX_MASK])
-		cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
-
-	if (tb[TCA_TCINDEX_SHIFT])
-		cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
-
-	err = -EBUSY;
-
-	/* Hash already allocated, make sure that we still meet the
-	 * requirements for the allocated hash.
-	 */
-	if (cp->perfect) {
-		if (!valid_perfect_hash(cp) ||
-		    cp->hash > cp->alloc_hash)
-			goto errout_alloc;
-	} else if (cp->h && cp->hash != cp->alloc_hash) {
-		goto errout_alloc;
-	}
-
-	err = -EINVAL;
-	if (tb[TCA_TCINDEX_FALL_THROUGH])
-		cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
-
-	if (!cp->hash) {
-		/* Hash not specified, use perfect hash if the upper limit
-		 * of the hashing index is below the threshold.
-		 */
-		if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
-			cp->hash = (cp->mask >> cp->shift) + 1;
-		else
-			cp->hash = DEFAULT_HASH_SIZE;
-	}
-
-	if (!cp->perfect && !cp->h)
-		cp->alloc_hash = cp->hash;
-
-	/* Note: this could be as restrictive as if (handle & ~(mask >> shift))
-	 * but then, we'd fail handles that may become valid after some future
-	 * mask change. While this is extremely unlikely to ever matter,
-	 * the check below is safer (and also more backwards-compatible).
-	 */
-	if (cp->perfect || valid_perfect_hash(cp))
-		if (handle >= cp->alloc_hash)
-			goto errout_alloc;
-
-
-	err = -ENOMEM;
-	if (!cp->perfect && !cp->h) {
-		if (valid_perfect_hash(cp)) {
-			if (tcindex_alloc_perfect_hash(net, cp) < 0)
-				goto errout_alloc;
-			balloc = 1;
-		} else {
-			struct tcindex_filter __rcu **hash;
-
-			hash = kcalloc(cp->hash,
-				       sizeof(struct tcindex_filter *),
-				       GFP_KERNEL);
-
-			if (!hash)
-				goto errout_alloc;
-
-			cp->h = hash;
-			balloc = 2;
-		}
-	}
-
-	if (cp->perfect) {
-		r = cp->perfect + handle;
-	} else {
-		/* imperfect area is updated in-place using rcu */
-		update_h = !!tcindex_lookup(cp, handle);
-		r = &new_filter_result;
-	}
-
-	if (r == &new_filter_result) {
-		f = kzalloc(sizeof(*f), GFP_KERNEL);
-		if (!f)
-			goto errout_alloc;
-		f->key = handle;
-		f->next = NULL;
-		err = tcindex_filter_result_init(&f->result);
-		if (err < 0) {
-			kfree(f);
-			goto errout_alloc;
-		}
-	}
-
-	if (tb[TCA_TCINDEX_CLASSID]) {
-		cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
-		tcf_bind_filter(tp, &cr, base);
-	}
-
-	if (old_r && old_r != r) {
-		err = tcindex_filter_result_init(old_r);
-		if (err < 0) {
-			kfree(f);
-			goto errout_alloc;
-		}
-	}
-
-	oldp = p;
-	r->res = cr;
-	tcf_exts_change(&r->exts, &e);
-
-	rcu_assign_pointer(tp->root, cp);
-
-	if (update_h) {
-		struct tcindex_filter __rcu **fp;
-		struct tcindex_filter *cf;
-
-		f->result.res = r->res;
-		tcf_exts_change(&f->result.exts, &r->exts);
-
-		/* imperfect area bucket */
-		fp = cp->h + (handle % cp->hash);
-
-		/* lookup the filter, guaranteed to exist */
-		for (cf = rcu_dereference_bh_rtnl(*fp); cf;
-		     fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp))
-			if (cf->key == handle)
-				break;
-
-		f->next = cf->next;
-
-		cf = rcu_replace_pointer(*fp, f, 1);
-		tcf_exts_get_net(&cf->result.exts);
-		tcf_queue_work(&cf->rwork, tcindex_destroy_fexts_work);
-	} else if (r == &new_filter_result) {
-		struct tcindex_filter *nfp;
-		struct tcindex_filter __rcu **fp;
-
-		f->result.res = r->res;
-		tcf_exts_change(&f->result.exts, &r->exts);
-
-		fp = cp->h + (handle % cp->hash);
-		for (nfp = rtnl_dereference(*fp);
-		     nfp;
-		     fp = &nfp->next, nfp = rtnl_dereference(*fp))
-				; /* nothing */
-
-		rcu_assign_pointer(*fp, f);
-	} else {
-		tcf_exts_destroy(&new_filter_result.exts);
-	}
-
-	if (oldp)
-		tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work);
-	return 0;
-
-errout_alloc:
-	if (balloc == 1)
-		tcindex_free_perfect_hash(cp);
-	else if (balloc == 2)
-		kfree(cp->h);
-errout1:
-	tcf_exts_destroy(&new_filter_result.exts);
-errout:
-	kfree(cp);
-	tcf_exts_destroy(&e);
-	return err;
-}
-
-static int
-tcindex_change(struct net *net, struct sk_buff *in_skb,
-	       struct tcf_proto *tp, unsigned long base, u32 handle,
-	       struct nlattr **tca, void **arg, bool ovr,
-	       struct netlink_ext_ack *extack)
-{
-	struct nlattr *opt = tca[TCA_OPTIONS];
-	struct nlattr *tb[TCA_TCINDEX_MAX + 1];
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter_result *r = *arg;
-	int err;
-
-	pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
-	    "p %p,r %p,*arg %p\n",
-	    tp, handle, tca, arg, opt, p, r, arg ? *arg : NULL);
-
-	if (!opt)
-		return 0;
-
-	err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy, NULL);
-	if (err < 0)
-		return err;
-
-	return tcindex_set_parms(net, tp, base, handle, p, r, tb,
-				 tca[TCA_RATE], ovr, extack);
-}
-
-static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter *f, *next;
-	int i;
-
-	pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
-	if (p->perfect) {
-		for (i = 0; i < p->hash; i++) {
-			if (!p->perfect[i].res.class)
-				continue;
-			if (walker->count >= walker->skip) {
-				if (walker->fn(tp, p->perfect + i, walker) < 0) {
-					walker->stop = 1;
-					return;
-				}
-			}
-			walker->count++;
-		}
-	}
-	if (!p->h)
-		return;
-	for (i = 0; i < p->hash; i++) {
-		for (f = rtnl_dereference(p->h[i]); f; f = next) {
-			next = rtnl_dereference(f->next);
-			if (walker->count >= walker->skip) {
-				if (walker->fn(tp, &f->result, walker) < 0) {
-					walker->stop = 1;
-					return;
-				}
-			}
-			walker->count++;
-		}
-	}
-}
-
-static void tcindex_destroy(struct tcf_proto *tp,
-			    struct netlink_ext_ack *extack)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	int i;
-
-	pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
-
-	if (p->perfect) {
-		for (i = 0; i < p->hash; i++) {
-			struct tcindex_filter_result *r = p->perfect + i;
-
-			tcf_unbind_filter(tp, &r->res);
-			if (tcf_exts_get_net(&r->exts))
-				tcf_queue_work(&r->rwork,
-					       tcindex_destroy_rexts_work);
-			else
-				__tcindex_destroy_rexts(r);
-		}
-	}
-
-	for (i = 0; p->h && i < p->hash; i++) {
-		struct tcindex_filter *f, *next;
-		bool last;
-
-		for (f = rtnl_dereference(p->h[i]); f; f = next) {
-			next = rtnl_dereference(f->next);
-			tcindex_delete(tp, &f->result, &last, NULL);
-		}
-	}
-
-	tcf_queue_work(&p->rwork, tcindex_destroy_work);
-}
-
-
-static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh,
-			struct sk_buff *skb, struct tcmsg *t)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter_result *r = fh;
-	struct nlattr *nest;
-
-	pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n",
-		 tp, fh, skb, t, p, r);
-	pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
-
-	nest = nla_nest_start(skb, TCA_OPTIONS);
-	if (nest == NULL)
-		goto nla_put_failure;
-
-	if (!fh) {
-		t->tcm_handle = ~0; /* whatever ... */
-		if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
-		    nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
-		    nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
-		    nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
-			goto nla_put_failure;
-		nla_nest_end(skb, nest);
-	} else {
-		if (p->perfect) {
-			t->tcm_handle = r - p->perfect;
-		} else {
-			struct tcindex_filter *f;
-			struct tcindex_filter __rcu **fp;
-			int i;
-
-			t->tcm_handle = 0;
-			for (i = 0; !t->tcm_handle && i < p->hash; i++) {
-				fp = &p->h[i];
-				for (f = rtnl_dereference(*fp);
-				     !t->tcm_handle && f;
-				     fp = &f->next, f = rtnl_dereference(*fp)) {
-					if (&f->result == r)
-						t->tcm_handle = f->key;
-				}
-			}
-		}
-		pr_debug("handle = %d\n", t->tcm_handle);
-		if (r->res.class &&
-		    nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
-			goto nla_put_failure;
-
-		if (tcf_exts_dump(skb, &r->exts) < 0)
-			goto nla_put_failure;
-		nla_nest_end(skb, nest);
-
-		if (tcf_exts_dump_stats(skb, &r->exts) < 0)
-			goto nla_put_failure;
-	}
-
-	return skb->len;
-
-nla_put_failure:
-	nla_nest_cancel(skb, nest);
-	return -1;
-}
-
-static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl)
-{
-	struct tcindex_filter_result *r = fh;
-
-	if (r && r->res.classid == classid)
-		r->res.class = cl;
-}
-
-static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
-	.kind		=	"tcindex",
-	.classify	=	tcindex_classify,
-	.init		=	tcindex_init,
-	.destroy	=	tcindex_destroy,
-	.get		=	tcindex_get,
-	.change		=	tcindex_change,
-	.delete		=	tcindex_delete,
-	.walk		=	tcindex_walk,
-	.dump		=	tcindex_dump,
-	.bind_class	=	tcindex_bind_class,
-	.owner		=	THIS_MODULE,
-};
-
-static int __init init_tcindex(void)
-{
-	return register_tcf_proto_ops(&cls_tcindex_ops);
-}
-
-static void __exit exit_tcindex(void)
-{
-	unregister_tcf_proto_ops(&cls_tcindex_ops);
-}
-
-module_init(init_tcindex)
-module_exit(exit_tcindex)
-MODULE_LICENSE("GPL");
-- 
Gitee


From 505750721c37515943dbdc239c9d609533126337 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:41:51 +0800
Subject: [PATCH 07/58] kthread: Switch to cpu_possible_mask

ANBZ: #5581

commit 043eb8e1051143a24811e6f35c276e35ae8247b6 upstream.

Next patch will switch unbound kernel threads mask to
housekeeping_cpumask(), a subset of cpu_possible_mask. So in order to
ease bisection, lets first switch kthreads default affinity from
cpu_all_mask to cpu_possible_mask.

It looks safe to do so as cpu_possible_mask seem to be initialized
at setup_arch() time, way before kthreadd is created.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200527142909.23372-2-frederic@kernel.org
Signed-off-by: Cruz Zhao <CruzZhao@linux.alibaba.com>
Reviewed-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Artie Ding <artie.ding@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1775
---
 kernel/kthread.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index fcb3a1a6e14b..5ddea08da19e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -339,7 +339,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		 * The kernel thread should not inherit these properties.
 		 */
 		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
-		set_cpus_allowed_ptr(task, cpu_all_mask);
+		set_cpus_allowed_ptr(task, cpu_possible_mask);
 	}
 	kfree(create);
 	return task;
@@ -564,7 +564,7 @@ int kthreadd(void *unused)
 	/* Setup a clean context for our children to inherit. */
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
-	set_cpus_allowed_ptr(tsk, cpu_all_mask);
+	set_cpus_allowed_ptr(tsk, cpu_possible_mask);
 	set_mems_allowed(node_states[N_MEMORY]);
 
 	current->flags |= PF_NOFREEZE;
-- 
Gitee


From 311ced57b84347bcbc79996bd1f75fb21995b613 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:41:52 +0800
Subject: [PATCH 08/58] isolcpus: Affine unbound kernel threads to housekeeping
  cpus

ANBZ: #5581

commit 9cc5b8656892a72438ee7deb5e80f5be47643b8b upstream.

This is a kernel enhancement that configures the cpu affinity of kernel
threads via kernel boot option nohz_full=.

When this option is specified, the cpumask is immediately applied upon
kthread launch. This does not affect kernel threads that specify cpu
and node.

This allows CPU isolation (that is not allowing certain threads
to execute on certain CPUs) without using the isolcpus=domain parameter,
making it possible to enable load balancing on such CPUs
during runtime (see kernel-parameters.txt).

Note-1: this is based off on Wind River's patch at
https://github.com/starlingx-staging/stx-integ/blob/master/kernel/kernel-std/centos/patches/affine-compute-kernel-threads.patch

Difference being that this patch is limited to modifying kernel thread
cpumask. Behaviour of other threads can be controlled via cgroups or
sched_setaffinity.

Note-2: Wind River's patch was based off Christoph Lameter's patch at
https://lwn.net/Articles/565932/ with the only difference being
the kernel parameter changed from kthread to kthread_cpus.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200527142909.23372-3-frederic@kernel.org
Signed-off-by: Cruz Zhao <CruzZhao@linux.alibaba.com>
Reviewed-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Artie Ding <artie.ding@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1775
---
 include/linux/sched/isolation.h | 1 +
 kernel/kthread.c                | 6 ++++--
 kernel/sched/isolation.c        | 3 ++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 30c84ba31a52..6d10cbc6d605 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -13,6 +13,7 @@ enum hk_flags {
 	HK_FLAG_TICK		= (1 << 4),
 	HK_FLAG_DOMAIN		= (1 << 5),
 	HK_FLAG_WQ		= (1 << 6),
+	HK_FLAG_KTHREAD		= (1 << 8),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5ddea08da19e..d18066d51bca 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,7 @@
 #include <linux/freezer.h>
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
+#include <linux/sched/isolation.h>
 #include <trace/events/sched.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -339,7 +340,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		 * The kernel thread should not inherit these properties.
 		 */
 		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
-		set_cpus_allowed_ptr(task, cpu_possible_mask);
+		set_cpus_allowed_ptr(task,
+				     housekeeping_cpumask(HK_FLAG_KTHREAD));
 	}
 	kfree(create);
 	return task;
@@ -564,7 +566,7 @@ int kthreadd(void *unused)
 	/* Setup a clean context for our children to inherit. */
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
-	set_cpus_allowed_ptr(tsk, cpu_possible_mask);
+	set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
 	set_mems_allowed(node_states[N_MEMORY]);
 
 	current->flags |= PF_NOFREEZE;
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index da0ac89cc382..0409a879ae5f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -174,7 +174,8 @@ static int __init housekeeping_nohz_full_setup(char *str)
 {
 	unsigned int flags;
 
-	flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+	flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
+		HK_FLAG_MISC | HK_FLAG_KTHREAD;
 
 	return housekeeping_setup(str, flags);
 }
-- 
Gitee


From a7b619149c4a5fd11df70f2de50cd79eddee06b7 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:41:53 +0800
Subject: [PATCH 09/58] net/sched: flower: fix possible OOB write in 
 fl_set_geneve_opt()

ANBZ: #5616

commit 4d56304e5827c8cc8cc18c75343d283af7c4825c upstream.

If we send two TCA_FLOWER_KEY_ENC_OPTS_GENEVE packets and their total
size is 252 bytes(key->enc_opts.len = 252) then
key->enc_opts.len = opt->length = data_len / 4 = 0 when the third
TCA_FLOWER_KEY_ENC_OPTS_GENEVE packet enters fl_set_geneve_opt. This
bypasses the next bounds check and results in an out-of-bounds.

Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options")
Signed-off-by: Hangyu Hua <hbh25y@gmail.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com>
Link: https://lore.kernel.org/r/20230531102805.27090-1-hbh25y@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-35788
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1786
---
 net/sched/cls_flower.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 09b359784629..789517fde758 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -632,6 +632,9 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
 	if (option_len > sizeof(struct geneve_opt))
 		data_len = option_len - sizeof(struct geneve_opt);
 
+	if (key->enc_opts.len > FLOW_DIS_TUN_OPTS_MAX - 4)
+		return -ERANGE;
+
 	opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len];
 	memset(opt, 0xff, option_len);
 	opt->length = data_len / 4;
-- 
Gitee


From 0ddd3ccb1873c60dfc8b49c9dd50c871f9ce978e Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:41:54 +0800
Subject: [PATCH 10/58] anolis: fuse: fix array-index-out-of-bounds in 
 fuse_update_stats()

ANBZ: #5621

The following array-index-out-of-bounds is caught on cuse:

UBSAN: array-index-out-of-bounds in fs/fuse/dev.c:284:46
index 4096 is out of range for type 'atomic64_t [50]'
Call trace:
 dump_backtrace+0x0/0x3b0
 show_stack+0x1c/0x2c
 dump_stack+0x144/0x1b8
 ubsan_epilogue+0xc/0x4c
 __ubsan_handle_out_of_bounds+0x78/0x80
 fuse_update_stats+0x148/0x164 [fuse]
 fuse_request_end+0x174/0x780 [fuse]
 end_requests+0x128/0x1d0 [fuse]
 fuse_abort_conn+0x7a8/0x9f0 [fuse]
 fuse_dev_release+0x220/0x3e0 [fuse]
 cuse_channel_release+0x1d0/0x270 [cuse]
 __fput+0x1ac/0x6b4
 ____fput+0x14/0x20
 task_work_run+0xd8/0x1f0
 do_exit+0x530/0xc9c
 do_group_exit+0xd0/0x240
 __arm64_sys_exit_group+0x40/0x50
 el0_svc_common+0x154/0x520
 do_el0_svc+0xac/0xd4
 el0_svc+0x1c/0x30
 el0_sync_handler+0xa8/0xac
 el0_sync+0x168/0x180

This is because the cuse device node shares the same file_operations,
i.e. fuse_dev_operations, with the fuse device node, and
fuse_update_stats() is called on CUSE_INIT (opcode 4096) message.

Fix this by only allowing fuse_update_stats() on normal fuse messages.

Fixes: a88ac284ae44 ("anolis: fuse: add fusectl interface to stats")
Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1790
---
 fs/fuse/dev.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 893fcf28e308..7dff2da7434d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -411,17 +411,20 @@ static void flush_bg_queue(struct fuse_conn *fc)
 	}
 }
 
-static void fuse_update_stats(struct fuse_conn *fc, int opcode, uint64_t send_time)
+static void fuse_update_stats(struct fuse_conn *fc, struct fuse_req *req)
 {
+	unsigned int opcode = req->in.h.opcode;
 	uint64_t delta_time;
 
-	delta_time = get_time_now_us() - send_time;
+	if (opcode < FUSE_OP_MAX) {
+		delta_time = get_time_now_us() - req->send_time;
 
-	atomic64_add(delta_time, &fc->stats.req_time[FUSE_SUMMARY]);
-	atomic64_add(delta_time, &fc->stats.req_time[opcode]);
+		atomic64_add(delta_time, &fc->stats.req_time[FUSE_SUMMARY]);
+		atomic64_add(delta_time, &fc->stats.req_time[opcode]);
 
-	atomic64_inc(&fc->stats.req_cnts[FUSE_SUMMARY]);
-	atomic64_inc(&fc->stats.req_cnts[opcode]);
+		atomic64_inc(&fc->stats.req_cnts[FUSE_SUMMARY]);
+		atomic64_inc(&fc->stats.req_cnts[opcode]);
+	}
 }
 
 /*
@@ -445,7 +448,7 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
 	WARN_ON(test_bit(FR_PENDING, &req->flags));
 	WARN_ON(test_bit(FR_SENT, &req->flags));
 	if (test_bit(FR_BACKGROUND, &req->flags)) {
-		fuse_update_stats(fc, req->in.h.opcode, req->send_time);
+		fuse_update_stats(fc, req);
 
 		spin_lock(&fc->bg_lock);
 		clear_bit(FR_BACKGROUND, &req->flags);
@@ -563,7 +566,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 		/* Pairs with smp_wmb() in fuse_request_end() */
 		smp_rmb();
 
-		fuse_update_stats(fc, req->in.h.opcode, req->send_time);
+		fuse_update_stats(fc, req);
 	}
 }
 
-- 
Gitee


From 75728666b9b34a6a2f8972bc4bc7905c46c0d18e Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:34 +0800
Subject: [PATCH 11/58] virtio-blk: Don't use MAX_DISCARD_SEGMENTS if 
 max_discard_seg is zero

ANBZ: #5630

commit dacc73ed0b88f1a787ec20385f42ca9dd9eddcd0 upstream

Currently the value of max_discard_segment will be set to
MAX_DISCARD_SEGMENTS (256) with no basis in hardware if device
set 0 to max_discard_seg in configuration space. It's incorrect
since the device might not be able to handle such large descriptors.
To fix it, let's follow max_segments restrictions in this case.

Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support")
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Link: https://lore.kernel.org/r/20220304100058.116-1-xieyongji@bytedance.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Fupan Li <fupan.lfp@antgroup.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1794
---
 drivers/block/virtio_blk.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 91e2487ba6de..cd6d9fed1707 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -976,9 +976,15 @@ static int virtblk_probe(struct virtio_device *vdev)
 
 		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
 			     &v);
+
+		/*
+		 * max_discard_seg == 0 is out of spec but we always
+		 * handled it.
+		 */
+		if (!v)
+			v = sg_elems - 2;
 		blk_queue_max_discard_segments(q,
-					       min_not_zero(v,
-							    MAX_DISCARD_SEGMENTS));
+					       min(v, MAX_DISCARD_SEGMENTS));
 
 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
 	}
-- 
Gitee


From 98dac39f634a9534088fb0265365b44595ef9c30 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:35 +0800
Subject: [PATCH 12/58] virtio_blk: fix the discard_granularity and 
 discard_alignment queue limits

ANBZ: #5630

commit 62952cc5bccd89b76d710de1d0b43244af0f2903 upstream

The discard_alignment queue limit is named a bit misleading means the
offset into the block device at which the discard granularity starts.

On the other hand the discard_sector_alignment from the virtio 1.1 looks
similar to what Linux uses as discard granularity (even if not very well
described):

  "discard_sector_alignment can be used by OS when splitting a request
   based on alignment. "

And at least qemu does set it to the discard granularity.

So stop setting the discard_alignment and use the virtio
discard_sector_alignment to set the discard granularity.

Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20220418045314.360785-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Fupan Li <fupan.lfp@antgroup.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1794
---
 drivers/block/virtio_blk.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cd6d9fed1707..57dc29b09f74 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -964,11 +964,12 @@ static int virtblk_probe(struct virtio_device *vdev)
 		blk_queue_io_opt(q, blk_size * opt_io_size);
 
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
-		q->limits.discard_granularity = blk_size;
-
 		virtio_cread(vdev, struct virtio_blk_config,
 			     discard_sector_alignment, &v);
-		q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0;
+		if (v)
+			q->limits.discard_granularity = v << SECTOR_SHIFT;
+		else
+			q->limits.discard_granularity = blk_size;
 
 		virtio_cread(vdev, struct virtio_blk_config,
 			     max_discard_sectors, &v);
-- 
Gitee


From 57256e903000bbeea871aa0254b0aced4216b2b0 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:36 +0800
Subject: [PATCH 13/58] anolis: virtio-blk: skip BUG_ON() in virtio_queue_rq() 
 for discard command

ANBZ: #5630

Currently we have a BUG_ON() to make sure the number of sg
list does not exceed queue_max_segments() in virtio_queue_rq().
However, the block layer uses queue_max_discard_segments()
instead of queue_max_segments() to limit the sg list for
discard requests. So the BUG_ON() might be triggered if
virtio-blk device reports a larger value for max discard
segment than queue_max_segments().

It is fixed with upstream commit e030759a1ddc ("virtio-blk: Remove
BUG_ON() in virtio_queue_rq()").
In order to reduce the code conflicts introduced by the prerequisite
commit when backporting this fix,
let's simply skip this BUG_ON() check for discard command.

Signed-off-by: Fupan Li <fupan.lfp@antgroup.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1794
---
 drivers/block/virtio_blk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 57dc29b09f74..500d392f0e7e 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -304,7 +304,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	bool unmap = false;
 	u32 type;
 
-	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
+	if (req_op(req) != REQ_OP_DISCARD)
+		BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
 	switch (req_op(req)) {
 	case REQ_OP_READ:
-- 
Gitee


From 8a390bd08410f99a4e3312e9db721e87a8d3d603 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:37 +0800
Subject: [PATCH 14/58] anolis: Revert "anolis: erofs,ovl: enable RCU'd 
 ->get_acl()"

ANBZ: #5670

This reverts commit 5943818023a821fa546e73c0bedc05a34299d910, where the
optimization is enabled only when lowerdirs are upon limited fs type.

Later we will cherry-pick the upstream commit 332f606b32b6 ("ovl: enable
RCU'd ->get_acl()") to re-enable this optimization.

Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1802
---
 fs/erofs/super.c          |  2 --
 fs/overlayfs/inode.c      | 11 +++--------
 fs/overlayfs/ovl_entry.h  |  1 -
 fs/overlayfs/super.c      |  1 -
 fs/posix_acl.c            | 13 +------------
 include/linux/fs.h        |  6 ------
 include/linux/posix_acl.h |  3 +--
 7 files changed, 5 insertions(+), 32 deletions(-)

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 29f1b22826bc..3f014bffc9dd 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -757,8 +757,6 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent)
 	else
 		sb->s_iflags &= ~SB_I_OVL_OPT_CREDS;
 
-	sb->s_iflags |= SB_I_OVL_OPT_ACL_RCU;
-
 #ifdef CONFIG_EROFS_FS_ZIP
 	INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
 #endif
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index b7f9ad289ab7..6c42823251fe 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -13,7 +13,6 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/ratelimit.h>
-#include <linux/namei.h>
 #include "overlayfs.h"
 
 
@@ -450,19 +449,15 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
 struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
 {
 	struct inode *realinode = ovl_inode_real(inode);
-	struct ovl_fs *ofs = inode->i_sb->s_fs_info;
 	const struct cred *old_cred;
 	struct posix_acl *acl;
 
+	if (rcu)
+		return ERR_PTR(-ECHILD);
+
 	if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
 		return NULL;
 
-	if (rcu) {
-		if (!ofs->config.opt_acl_rcu)
-			return ERR_PTR(-ECHILD);
-		return get_cached_acl_rcu(realinode, type);
-	}
-
 	old_cred = ovl_override_creds_opt(inode->i_sb);
 	acl = get_acl(realinode, type);
 	ovl_revert_creds(old_cred);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index ff76cda8e1be..6f8aaf4fdb9b 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -23,7 +23,6 @@ struct ovl_config {
 	bool userxattr;
 	bool ovl_volatile;
 	bool opt_creds;
-	bool opt_acl_rcu;
 };
 
 struct ovl_sb {
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index c30ba99a4ff7..bbac42ea331d 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1914,7 +1914,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_flags |= SB_POSIXACL;
 
 	ofs->config.opt_creds = ovl_config_opt(ofs, SB_I_OVL_OPT_CREDS);
-	ofs->config.opt_acl_rcu = ovl_config_opt(ofs, SB_I_OVL_OPT_ACL_RCU);
 
 	err = -ENOMEM;
 	root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 2c889d2ff030..5078aa71bde8 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -21,7 +21,6 @@
 #include <linux/xattr.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
-#include <linux/namei.h>
 
 static struct posix_acl **acl_by_type(struct inode *inode, int type)
 {
@@ -56,17 +55,7 @@ EXPORT_SYMBOL(get_cached_acl);
 
 struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
 {
-	struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type));
-
-	if (acl == ACL_DONT_CACHE) {
-		struct posix_acl *ret;
-
-		ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU);
-		if (!IS_ERR(ret))
-			acl = ret;
-	}
-
-	return acl;
+	return rcu_dereference(*acl_by_type(inode, type));
 }
 EXPORT_SYMBOL(get_cached_acl_rcu);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 34e15e9dd6f5..0308d24a5c59 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -593,11 +593,6 @@ static inline void mapping_allow_writable(struct address_space *mapping)
 
 struct posix_acl;
 #define ACL_NOT_CACHED ((void *)(-1))
-/*
- * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
- * cache the ACL.  This also means that ->get_acl() can be called in RCU mode
- * with the LOOKUP_RCU flag.
- */
 #define ACL_DONT_CACHE ((void *)(-3))
 
 static inline struct posix_acl *
@@ -1396,7 +1391,6 @@ extern int send_sigurg(struct fown_struct *fown);
 
 /* hint from lowerfs for overlayfs optimizations (e.g. for container scenarios) */
 #define SB_I_OVL_OPT_CREDS	0x40000000 /* bypass [override|revert]_creds */
-#define SB_I_OVL_OPT_ACL_RCU	0x80000000 /* enable RCU'd ACL */
 
 /* Possible states of 'frozen' field */
 enum {
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 0c842d07091a..540595a321a7 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -71,8 +71,6 @@ extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);
 extern struct posix_acl *get_posix_acl(struct inode *, int);
 extern int set_posix_acl(struct inode *, int, struct posix_acl *);
 
-struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
-
 #ifdef CONFIG_FS_POSIX_ACL
 extern int posix_acl_chmod(struct inode *, umode_t);
 extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
@@ -83,6 +81,7 @@ extern int simple_set_acl(struct inode *, struct posix_acl *, int);
 extern int simple_acl_create(struct inode *, struct inode *);
 
 struct posix_acl *get_cached_acl(struct inode *inode, int type);
+struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
 void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
 void forget_cached_acl(struct inode *inode, int type);
 void forget_all_cached_acls(struct inode *inode);
-- 
Gitee


From 8e63db438a5b3bc2676a2ae5028503d1555e6513 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:39 +0800
Subject: [PATCH 15/58] anolis: Revert "anolis: erofs,ovl: bypass 
 [override|revert]_creds for overlayfs"

ANBZ: #5670

This reverts commit ca92036c0712a2bf70289e0a28307359881a5c46.

The current implementation is risky and may cause copy-up failure in
specific scenarios with "permission denied".  As this optimization is
not practically used, revert it for now.

Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1802
---
 fs/erofs/internal.h      |  5 -----
 fs/erofs/super.c         | 25 -------------------------
 fs/overlayfs/inode.c     |  8 ++++----
 fs/overlayfs/overlayfs.h |  2 --
 fs/overlayfs/ovl_entry.h |  1 -
 fs/overlayfs/super.c     | 18 ------------------
 fs/overlayfs/util.c      | 15 ---------------
 include/linux/fs.h       |  3 ---
 8 files changed, 4 insertions(+), 73 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 1ef0dfb3d208..89eb85b94930 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -147,11 +147,6 @@ struct erofs_sb_info {
 /* Mount flags set via mount options or defaults */
 #define EROFS_MOUNT_XATTR_USER		0x00000010
 #define EROFS_MOUNT_POSIX_ACL		0x00000020
-/*
- * Bypass [override_creds|revert_creds] for overlayfs
- * when erofs is mounted as lowerfs.
- */
-#define EROFS_MOUNT_OPT_CREDS		0x80000000
 
 #define clear_opt(sbi, option)	((sbi)->mount_opt &= ~EROFS_MOUNT_##option)
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3f014bffc9dd..e6ba90a0adc4 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -382,8 +382,6 @@ enum {
 	Opt_domain_id,
 	Opt_bootstrap_path,
 	Opt_blob_dir_path,
-	Opt_opt_creds_on,
-	Opt_opt_creds_off,
 	Opt_err
 };
 
@@ -398,8 +396,6 @@ static match_table_t erofs_tokens = {
 	{Opt_domain_id, "domain_id=%s"},
 	{Opt_bootstrap_path, "bootstrap_path=%s"},
 	{Opt_blob_dir_path, "blob_dir_path=%s"},
-	{Opt_opt_creds_on, "opt_creds=on"},
-	{Opt_opt_creds_off, "opt_creds=off"},
 	{Opt_err, NULL}
 };
 
@@ -514,12 +510,6 @@ static int erofs_parse_options(struct super_block *sb, char *options)
 				return -ENOMEM;
 			erofs_dbg("RAFS bootstrap_path %s", sbi->bootstrap_path);
 			break;
-		case Opt_opt_creds_on:
-			set_opt(EROFS_SB(sb), OPT_CREDS);
-			break;
-		case Opt_opt_creds_off:
-			clear_opt(EROFS_SB(sb), OPT_CREDS);
-			break;
 		default:
 			erofs_err(sb, "Unrecognized mount option \"%s\" or missing value", p);
 			return -EINVAL;
@@ -752,11 +742,6 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent)
 	else
 		sb->s_flags &= ~SB_POSIXACL;
 
-	if (test_opt(sbi, OPT_CREDS))
-		sb->s_iflags |= SB_I_OVL_OPT_CREDS;
-	else
-		sb->s_iflags &= ~SB_I_OVL_OPT_CREDS;
-
 #ifdef CONFIG_EROFS_FS_ZIP
 	INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
 #endif
@@ -1037,11 +1022,6 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 	if (sbi->domain_id)
 		seq_printf(seq, ",domain_id=%s", sbi->domain_id);
 #endif
-	if (test_opt(sbi, OPT_CREDS))
-		seq_puts(seq, ",opt_creds=on");
-	else
-		seq_puts(seq, ",opt_creds=off");
-
 	return 0;
 }
 
@@ -1061,11 +1041,6 @@ static int erofs_remount(struct super_block *sb, int *flags, char *data)
 	else
 		sb->s_flags &= ~SB_POSIXACL;
 
-	if (test_opt(sbi, OPT_CREDS))
-		sb->s_iflags |= SB_I_OVL_OPT_CREDS;
-	else
-		sb->s_iflags &= ~SB_I_OVL_OPT_CREDS;
-
 	*flags |= SB_RDONLY;
 	return 0;
 out:
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 6c42823251fe..25c2c3513347 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -299,7 +299,7 @@ int ovl_permission(struct inode *inode, int mask)
 	if (err)
 		return err;
 
-	old_cred = ovl_override_creds_opt(inode->i_sb);
+	old_cred = ovl_override_creds(inode->i_sb);
 	if (!upperinode &&
 	    !special_file(realinode->i_mode) && mask & MAY_WRITE) {
 		mask &= ~(MAY_WRITE | MAY_APPEND);
@@ -307,7 +307,7 @@ int ovl_permission(struct inode *inode, int mask)
 		mask |= MAY_READ;
 	}
 	err = inode_permission(realinode, mask);
-	ovl_revert_creds(old_cred);
+	revert_creds(old_cred);
 
 	return err;
 }
@@ -458,9 +458,9 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
 	if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
 		return NULL;
 
-	old_cred = ovl_override_creds_opt(inode->i_sb);
+	old_cred = ovl_override_creds(inode->i_sb);
 	acl = get_acl(realinode, type);
-	ovl_revert_creds(old_cred);
+	revert_creds(old_cred);
 
 	return acl;
 }
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e932bd59fdad..08ee430e4f64 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -247,8 +247,6 @@ int ovl_want_write(struct dentry *dentry);
 void ovl_drop_write(struct dentry *dentry);
 struct dentry *ovl_workdir(struct dentry *dentry);
 const struct cred *ovl_override_creds(struct super_block *sb);
-const struct cred *ovl_override_creds_opt(struct super_block *sb);
-void ovl_revert_creds(const struct cred *oldcred);
 int ovl_can_decode_fh(struct super_block *sb);
 struct dentry *ovl_indexdir(struct super_block *sb);
 bool ovl_index_all(struct super_block *sb);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 6f8aaf4fdb9b..d9f5a4cd3938 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -22,7 +22,6 @@ struct ovl_config {
 	bool metacopy;
 	bool userxattr;
 	bool ovl_volatile;
-	bool opt_creds;
 };
 
 struct ovl_sb {
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index bbac42ea331d..a79ff8f6afd8 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1774,22 +1774,6 @@ static struct dentry *ovl_get_root(struct super_block *sb,
 	return root;
 }
 
-static bool ovl_config_opt(struct ovl_fs *ofs, unsigned int opt_mask)
-{
-	int i;
-	bool opt = true;
-
-	/*
-	 * Enable the optimization for container scenarios if all lowerfs are
-	 * configured with opt_mask. The optimization is disabled by default.
-	 * ofs->numfs must be at least 2, thus the default "true" won't be
-	 * returned without checking any lowerfs.
-	 */
-	for (i = 1; opt && i < ofs->numfs; i++)
-		opt = ofs->fs[i].sb->s_iflags & opt_mask;
-	return opt;
-}
-
 static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct path upperpath = { };
@@ -1913,8 +1897,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = ofs;
 	sb->s_flags |= SB_POSIXACL;
 
-	ofs->config.opt_creds = ovl_config_opt(ofs, SB_I_OVL_OPT_CREDS);
-
 	err = -ENOMEM;
 	root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
 	if (!root_dentry)
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 138e864c07c2..681a9a47975b 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -43,21 +43,6 @@ const struct cred *ovl_override_creds(struct super_block *sb)
 	return override_creds(ofs->creator_cred);
 }
 
-const struct cred *ovl_override_creds_opt(struct super_block *sb)
-{
-	struct ovl_fs *ofs = sb->s_fs_info;
-
-	if (ofs->config.opt_creds)
-		return NULL;
-	return override_creds(ofs->creator_cred);
-}
-
-void ovl_revert_creds(const struct cred *old_cred)
-{
-	if (old_cred)
-		revert_creds(old_cred);
-}
-
 /*
  * Check if underlying fs supports file handles and try to determine encoding
  * type, in order to deduce maximum inode number used by fs.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0308d24a5c59..6a224d11c53f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1389,9 +1389,6 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
 #define SB_I_UNTRUSTED_MOUNTER		0x00000040
 
-/* hint from lowerfs for overlayfs optimizations (e.g. for container scenarios) */
-#define SB_I_OVL_OPT_CREDS	0x40000000 /* bypass [override|revert]_creds */
-
 /* Possible states of 'frozen' field */
 enum {
 	SB_UNFROZEN = 0,		/* FS is unfrozen */
-- 
Gitee


From 193b0613a695c66b7a76d502ece1d4f05997e2cc Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:40 +0800
Subject: [PATCH 16/58] ovl: enable RCU'd ->get_acl()

ANBZ: #5670

commit 332f606b32b6291a944c8cf23b91f53a6e676525 upstream.

Overlayfs does not cache ACL's (to avoid double caching).  Instead it just
calls the underlying filesystem's i_op->get_acl(), which will return the
cached value, if possible.

In rcu path walk, however, get_cached_acl_rcu() is employed to get the
value from the cache, which will fail on overlayfs resulting in dropping
out of rcu walk mode.  This can result in a big performance hit in certain
situations.

Fix by calling ->get_acl() with rcu=true in case of ACL_DONT_CACHE (which
indicates pass-through)

Reported-by: garyhuang <zjh.20052005@163.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1802
---
 fs/overlayfs/inode.c      |  7 ++++---
 fs/posix_acl.c            | 13 ++++++++++++-
 include/linux/fs.h        |  5 +++++
 include/linux/posix_acl.h |  3 ++-
 4 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 25c2c3513347..477f6d521390 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -13,6 +13,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/ratelimit.h>
+#include <linux/namei.h>
 #include "overlayfs.h"
 
 
@@ -452,12 +453,12 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
 	const struct cred *old_cred;
 	struct posix_acl *acl;
 
-	if (rcu)
-		return ERR_PTR(-ECHILD);
-
 	if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
 		return NULL;
 
+	if (rcu)
+		return get_cached_acl_rcu(realinode, type);
+
 	old_cred = ovl_override_creds(inode->i_sb);
 	acl = get_acl(realinode, type);
 	revert_creds(old_cred);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 5078aa71bde8..2c889d2ff030 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -21,6 +21,7 @@
 #include <linux/xattr.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
+#include <linux/namei.h>
 
 static struct posix_acl **acl_by_type(struct inode *inode, int type)
 {
@@ -55,7 +56,17 @@ EXPORT_SYMBOL(get_cached_acl);
 
 struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
 {
-	return rcu_dereference(*acl_by_type(inode, type));
+	struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type));
+
+	if (acl == ACL_DONT_CACHE) {
+		struct posix_acl *ret;
+
+		ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU);
+		if (!IS_ERR(ret))
+			acl = ret;
+	}
+
+	return acl;
 }
 EXPORT_SYMBOL(get_cached_acl_rcu);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6a224d11c53f..5b7b7f31b215 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -593,6 +593,11 @@ static inline void mapping_allow_writable(struct address_space *mapping)
 
 struct posix_acl;
 #define ACL_NOT_CACHED ((void *)(-1))
+/*
+ * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
+ * cache the ACL.  This also means that ->get_acl() can be called in RCU mode
+ * with the LOOKUP_RCU flag.
+ */
 #define ACL_DONT_CACHE ((void *)(-3))
 
 static inline struct posix_acl *
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 540595a321a7..0c842d07091a 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -71,6 +71,8 @@ extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);
 extern struct posix_acl *get_posix_acl(struct inode *, int);
 extern int set_posix_acl(struct inode *, int, struct posix_acl *);
 
+struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
+
 #ifdef CONFIG_FS_POSIX_ACL
 extern int posix_acl_chmod(struct inode *, umode_t);
 extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
@@ -81,7 +83,6 @@ extern int simple_set_acl(struct inode *, struct posix_acl *, int);
 extern int simple_acl_create(struct inode *, struct inode *);
 
 struct posix_acl *get_cached_acl(struct inode *inode, int type);
-struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
 void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
 void forget_cached_acl(struct inode *inode, int type);
 void forget_all_cached_acls(struct inode *inode);
-- 
Gitee


From 8ca7e8349093d1a094d2a2d514c5d0462982b255 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:41 +0800
Subject: [PATCH 17/58] net: add sock_init_data_uid()

ANBZ: #4278

commit 584f3742890e966d2f0a1f3c418c9ead70b2d99e upstream.

Add sock_init_data_uid() to explicitly initialize the socket uid.
To initialise the socket uid, sock_init_data() assumes a the struct
socket* sock is always embedded in a struct socket_alloc, used to
access the corresponding inode uid. This may not be true.
Examples are sockets created in tun_chr_open() and tap_open().

Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.")
Signed-off-by: Pietro Borrello <borrello@diag.uniroma1.it>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Fixes: CVE-2023-1076
Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1821
---
 include/net/sock.h |  7 ++++++-
 net/core/sock.c    | 15 ++++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 7e39d72d0a90..9382a53c2f6f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1682,7 +1682,12 @@ void sk_common_release(struct sock *sk);
  *	Default socket callbacks and setup code
  */
 
-/* Initialise core socket variables */
+/* Initialise core socket variables using an explicit uid. */
+void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid);
+
+/* Initialise core socket variables.
+ * Assumes struct socket *sock is embedded in a struct socket_alloc.
+ */
 void sock_init_data(struct socket *sock, struct sock *sk);
 
 /*
diff --git a/net/core/sock.c b/net/core/sock.c
index 0b370f5cc23e..d4f24066eb12 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2769,7 +2769,7 @@ void sk_stop_timer(struct sock *sk, struct timer_list* timer)
 }
 EXPORT_SYMBOL(sk_stop_timer);
 
-void sock_init_data(struct socket *sock, struct sock *sk)
+void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
 {
 	sk_init_common(sk);
 	sk->sk_send_head	=	NULL;
@@ -2788,11 +2788,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 		sk->sk_type	=	sock->type;
 		sk->sk_wq	=	sock->wq;
 		sock->sk	=	sk;
-		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
 	} else {
 		sk->sk_wq	=	NULL;
-		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
 	}
+	sk->sk_uid	=	uid;
 
 	rwlock_init(&sk->sk_callback_lock);
 	if (sk->sk_kern_sock)
@@ -2850,6 +2849,16 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	refcount_set(&sk->sk_refcnt, 1);
 	atomic_set(&sk->sk_drops, 0);
 }
+EXPORT_SYMBOL(sock_init_data_uid);
+
+void sock_init_data(struct socket *sock, struct sock *sk)
+{
+	kuid_t uid = sock ?
+		SOCK_INODE(sock)->i_uid :
+		make_kuid(sock_net(sk)->user_ns, 0);
+
+	sock_init_data_uid(sock, sk, uid);
+}
 EXPORT_SYMBOL(sock_init_data);
 
 void lock_sock_nested(struct sock *sk, int subclass)
-- 
Gitee


From b0e0e7d39c512e34889b03e007cb76c41a935954 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:42 +0800
Subject: [PATCH 18/58] tun: tun_chr_open(): correctly initialize socket uid

ANBZ: #4278

commit a096ccca6e503a5c575717ff8a36ace27510ab0a upstream.

sock_init_data() assumes that the `struct socket` passed in input is
contained in a `struct socket_alloc` allocated with sock_alloc().
However, tun_chr_open() passes a `struct socket` embedded in a `struct
tun_file` allocated with sk_alloc().
This causes a type confusion when issuing a container_of() with
SOCK_INODE() in sock_init_data() which results in assigning a wrong
sk_uid to the `struct sock` in input.
On default configuration, the type confused field overlaps with the
high 4 bytes of `struct tun_struct __rcu *tun` of `struct tun_file`,
NULL at the time of call, which makes the uid of all tun sockets 0,
i.e., the root one.
Fix the assignment by using sock_init_data_uid().

Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.")
Signed-off-by: Pietro Borrello <borrello@diag.uniroma1.it>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Fixes: CVE-2023-1076
Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1821
---
 drivers/net/tun.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e1ac1c57089f..01758114b458 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3237,7 +3237,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	tfile->socket.file = file;
 	tfile->socket.ops = &tun_socket_ops;
 
-	sock_init_data(&tfile->socket, &tfile->sk);
+	sock_init_data_uid(&tfile->socket, &tfile->sk, inode->i_uid);
 
 	tfile->sk.sk_write_space = tun_sock_write_space;
 	tfile->sk.sk_sndbuf = INT_MAX;
-- 
Gitee


From b146d3a41ee817194c3fc630e4ac166c7c84d285 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:43 +0800
Subject: [PATCH 19/58] tap: tap_open(): correctly initialize socket uid

ANBZ: #4278

commit 66b2c338adce580dfce2199591e65e2bab889cff upstream.

sock_init_data() assumes that the `struct socket` passed in input is
contained in a `struct socket_alloc` allocated with sock_alloc().
However, tap_open() passes a `struct socket` embedded in a `struct
tap_queue` allocated with sk_alloc().
This causes a type confusion when issuing a container_of() with
SOCK_INODE() in sock_init_data() which results in assigning a wrong
sk_uid to the `struct sock` in input.
On default configuration, the type confused field overlaps with
padding bytes between `int vnet_hdr_sz` and `struct tap_dev __rcu
*tap` in `struct tap_queue`, which makes the uid of all tap sockets 0,
i.e., the root one.
Fix the assignment by using sock_init_data_uid().

Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.")
Signed-off-by: Pietro Borrello <borrello@diag.uniroma1.it>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Fixes: CVE-2023-1076
Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1821
---
 drivers/net/tap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index f0f7cd977667..43012a195b71 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -525,7 +525,7 @@ static int tap_open(struct inode *inode, struct file *file)
 	q->sock.state = SS_CONNECTED;
 	q->sock.file = file;
 	q->sock.ops = &tap_socket_ops;
-	sock_init_data(&q->sock, &q->sk);
+	sock_init_data_uid(&q->sock, &q->sk, inode->i_uid);
 	q->sk.sk_write_space = tap_sock_write_space;
 	q->sk.sk_destruct = tap_sock_destruct;
 	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
-- 
Gitee


From f7e1de5053df504d0e570afaecb1b9b9f211d4b2 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:44 +0800
Subject: [PATCH 20/58] xfs: bulkstat should copy lastip whenever userspace 
 supplies one

ANBZ: #1095

commit f16fe3ecde6237256eeacb3cd582217ffe62c647 upstream

When userspace passes in a @lastip pointer we should copy the results
back, even if the @ocount pointer is NULL.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ferry Meng <mengferry@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1822
---
 fs/xfs/xfs_ioctl.c   | 13 ++++++-------
 fs/xfs/xfs_ioctl32.c | 13 ++++++-------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d7f3c03432e6..b8df9945a9f0 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -800,14 +800,13 @@ xfs_ioc_bulkstat(
 	if (error)
 		return error;
 
-	if (bulkreq.ocount != NULL) {
-		if (copy_to_user(bulkreq.lastip, &inlast,
-						sizeof(xfs_ino_t)))
-			return -EFAULT;
+	if (bulkreq.lastip != NULL &&
+	    copy_to_user(bulkreq.lastip, &inlast, sizeof(xfs_ino_t)))
+		return -EFAULT;
 
-		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-			return -EFAULT;
-	}
+	if (bulkreq.ocount != NULL &&
+	    copy_to_user(bulkreq.ocount, &count, sizeof(count)))
+		return -EFAULT;
 
 	return 0;
 }
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index b044f7d36782..71a8874a579b 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -313,14 +313,13 @@ xfs_compat_ioc_bulkstat(
 	if (error)
 		return error;
 
-	if (bulkreq.ocount != NULL) {
-		if (copy_to_user(bulkreq.lastip, &inlast,
-						sizeof(xfs_ino_t)))
-			return -EFAULT;
+	if (bulkreq.lastip != NULL &&
+	    copy_to_user(bulkreq.lastip, &inlast, sizeof(xfs_ino_t)))
+		return -EFAULT;
 
-		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-			return -EFAULT;
-	}
+	if (bulkreq.ocount != NULL &&
+	    copy_to_user(bulkreq.ocount, &count, sizeof(count)))
+		return -EFAULT;
 
 	return 0;
 }
-- 
Gitee


From c5dfbe42618355531e8c289471a84a67df65b3ef Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:45 +0800
Subject: [PATCH 21/58] ovl: add splice file read write helper

ANBZ: #1217

commit 1a980b8cbf0059a5308eea61522f232fd03002e2 upstream

Now overlayfs falls back to use default file splice read
and write, which is not compatiple with overlayfs, returning
EFAULT. xfstests generic/591 can reproduce part of this.

Tested this patch with xfstests auto group tests.

Signed-off-by: Murphy Zhou <jencce.kernel@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Ferry Meng <mengferry@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1823
---
 fs/overlayfs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index db3dd10632bf..e089d6f40986 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -12,6 +12,9 @@
 #include <linux/xattr.h>
 #include <linux/uio.h>
 #include <linux/uaccess.h>
+#include <linux/splice.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
 #include "overlayfs.h"
 
 struct ovl_aio_req {
@@ -397,6 +400,48 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	return ret;
 }
 
+static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe, size_t len,
+			 unsigned int flags)
+{
+	ssize_t ret;
+	struct fd real;
+	const struct cred *old_cred;
+
+	ret = ovl_real_fdget(in, &real);
+	if (ret)
+		return ret;
+
+	old_cred = ovl_override_creds(file_inode(in)->i_sb);
+	ret = generic_file_splice_read(real.file, ppos, pipe, len, flags);
+	revert_creds(old_cred);
+
+	ovl_file_accessed(in);
+	fdput(real);
+	return ret;
+}
+
+static ssize_t
+ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
+			  loff_t *ppos, size_t len, unsigned int flags)
+{
+	struct fd real;
+	const struct cred *old_cred;
+	ssize_t ret;
+
+	ret = ovl_real_fdget(out, &real);
+	if (ret)
+		return ret;
+
+	old_cred = ovl_override_creds(file_inode(out)->i_sb);
+	ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
+	revert_creds(old_cred);
+
+	ovl_file_accessed(out);
+	fdput(real);
+	return ret;
+}
+
 static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct fd real;
@@ -750,6 +795,8 @@ const struct file_operations ovl_file_operations = {
 	.fadvise	= ovl_fadvise,
 	.unlocked_ioctl	= ovl_ioctl,
 	.compat_ioctl	= ovl_compat_ioctl,
+	.splice_read    = ovl_splice_read,
+	.splice_write   = ovl_splice_write,
 
 	.copy_file_range	= ovl_copy_file_range,
 	.clone_file_range	= ovl_clone_file_range,
-- 
Gitee


From c60efdd25a5a03b4474862850baf14f9b2b11a9f Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:45 +0800
Subject: [PATCH 22/58] locks: Use inode_is_open_for_write

ANBZ: #1216

commit 052b8cfa4070caa53125cd589da0cfe744132a94 upstream

Use the aptly named function rather than open coding it. No functional
changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ferry Meng <mengferry@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1824
---
 fs/locks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/locks.c b/fs/locks.c
index 552476d6f6bb..d21cfa1305bf 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1646,7 +1646,7 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
 	if (flags & FL_LAYOUT)
 		return 0;
 
-	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
+	if ((arg == F_RDLCK) && inode_is_open_for_write(inode))
 		return -EAGAIN;
 
 	if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
-- 
Gitee


From 7e977b7a7c07356c7b35a7f479c8e82c5d19d74e Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:46 +0800
Subject: [PATCH 23/58] locks: eliminate false positive conflicts for write 
 lease

ANBZ: #1216

commit 387e3746d01c34457d6a73688acd90428725070b upstream

check_conflicting_open() is checking for existing fd's open for read or
for write before allowing to take a write lease.  The check that was
implemented using i_count and d_count is an approximation that has
several false positives.  For example, overlayfs since v4.19, takes an
extra reference on the dentry; An open with O_PATH takes a reference on
the dentry although the file cannot be read nor written.

Change the implementation to use i_readcount and i_writecount to
eliminate the false positive conflicts and allow a write lease to be
taken on an overlayfs file.

The change of behavior with existing fd's open with O_PATH is symmetric
w.r.t. current behavior of lease breakers - an open with O_PATH currently
does not break a write lease.

This increases the size of struct inode by 4 bytes on 32bit archs when
CONFIG_FILE_LOCKING is defined and CONFIG_IMA was not already
defined.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ferry Meng <mengferry@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1824
---
 fs/locks.c         | 46 +++++++++++++++++++++++++++++-----------------
 include/linux/fs.h |  4 ++--
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index d21cfa1305bf..21b05a053f33 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1627,10 +1627,10 @@ int fcntl_getlease(struct file *filp)
 }
 
 /**
- * check_conflicting_open - see if the given dentry points to a file that has
- * 			    an existing open that would conflict with the
- * 			    desired lease.
- * @dentry:	dentry to check
+ * check_conflicting_open - see if the given file points to an inode that has
+ *			    an existing open that would conflict with the
+ *			    desired lease.
+ * @filp:	file to check
  * @arg:	type of lease that we're trying to acquire
  * @flags:	current lock flags
  *
@@ -1638,30 +1638,42 @@ int fcntl_getlease(struct file *filp)
  * conflict with the lease we're trying to set.
  */
 static int
-check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
+check_conflicting_open(struct file *filp, const long arg, int flags)
 {
-	int ret = 0;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = locks_inode(filp);
+	int self_wcount = 0, self_rcount = 0;
 
 	if (flags & FL_LAYOUT)
 		return 0;
 
-	if ((arg == F_RDLCK) && inode_is_open_for_write(inode))
-		return -EAGAIN;
+	if (arg == F_RDLCK)
+		return inode_is_open_for_write(inode) ? -EAGAIN : 0;
+	else if (arg != F_WRLCK)
+		return 0;
+
+	/*
+	 * Make sure that only read/write count is from lease requestor.
+	 * Note that this will result in denying write leases when i_writecount
+	 * is negative, which is what we want.  (We shouldn't grant write leases
+	 * on files open for execution.)
+	 */
+	if (filp->f_mode & FMODE_WRITE)
+		self_wcount = 1;
+	else if (filp->f_mode & FMODE_READ)
+		self_rcount = 1;
 
-	if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
-	    (atomic_read(&inode->i_count) > 1)))
-		ret = -EAGAIN;
+	if (atomic_read(&inode->i_writecount) != self_wcount ||
+	    atomic_read(&inode->i_readcount) != self_rcount)
+		return -EAGAIN;
 
-	return ret;
+	return 0;
 }
 
 static int
 generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
 {
 	struct file_lock *fl, *my_fl = NULL, *lease;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = locks_inode(filp);
 	struct file_lock_context *ctx;
 	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
 	int error;
@@ -1696,7 +1708,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
-	error = check_conflicting_open(dentry, arg, lease->fl_flags);
+	error = check_conflicting_open(filp, arg, lease->fl_flags);
 	if (error)
 		goto out;
 
@@ -1753,7 +1765,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 	 * precedes these checks.
 	 */
 	smp_mb();
-	error = check_conflicting_open(dentry, arg, lease->fl_flags);
+	error = check_conflicting_open(filp, arg, lease->fl_flags);
 	if (error) {
 		locks_unlink_lock_ctx(lease);
 		goto out;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5b7b7f31b215..12ab65298082 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -701,7 +701,7 @@ struct inode {
 	atomic_t		i_count;
 	atomic_t		i_dio_count;
 	atomic_t		i_writecount;
-#ifdef CONFIG_IMA
+#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
 	atomic_t		i_readcount; /* struct files open RO */
 #endif
 	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
@@ -2926,7 +2926,7 @@ static inline bool inode_is_open_for_write(const struct inode *inode)
 	return atomic_read(&inode->i_writecount) > 0;
 }
 
-#ifdef CONFIG_IMA
+#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
 static inline void i_readcount_dec(struct inode *inode)
 {
 	BUG_ON(!atomic_read(&inode->i_readcount));
-- 
Gitee


From 31e8bdb362a5335f644a4504d4186dc25866123e Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:47 +0800
Subject: [PATCH 24/58] memstick: r592: Fix UAF bug in r592_remove due to race 
 condition

ANBZ: #5685

commit 63264422785021704c39b38f65a78ab9e4a186d7 upstream.

In r592_probe, dev->detect_timer was bound with r592_detect_timer.
In r592_irq function, the timer function will be invoked by mod_timer.

If we remove the module which will call hantro_release to make cleanup,
there may be a unfinished work. The possible sequence is as follows,
which will cause a typical UAF bug.

Fix it by canceling the work before cleanup in r592_remove.

CPU0                  CPU1

                    |r592_detect_timer
r592_remove         |
  memstick_free_host|
  put_device;       |
  kfree(host);      |
                    |
                    | queue_work
                    |   &host->media_checker //use

Signed-off-by: Zheng Wang <zyytlz.wz@163.com>
Link: https://lore.kernel.org/r/20230307164338.1246287-1-zyytlz.wz@163.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>

Fixes: CVE-2023-35825
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1820
---
 drivers/memstick/host/r592.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c
index 627d6e62fe31..61b9c97cf0e7 100644
--- a/drivers/memstick/host/r592.c
+++ b/drivers/memstick/host/r592.c
@@ -827,7 +827,7 @@ static void r592_remove(struct pci_dev *pdev)
 	/* Stop the processing thread.
 	That ensures that we won't take any more requests */
 	kthread_stop(dev->io_thread);
-
+	del_timer_sync(&dev->detect_timer);
 	r592_enable_device(dev, false);
 
 	while (!error && dev->req) {
-- 
Gitee


From 0e5daacc9d045688c226034055b5f03d10205d12 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:48 +0800
Subject: [PATCH 25/58] drm/msm/dpu: Add check for pstates

ANBZ: #5686

commit 93340e10b9c5fc86730d149636e0aa8b47bb5a34 upstream.

As kzalloc may fail and return NULL pointer,
it should be better to check pstates
in order to avoid the NULL pointer dereference.

Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support")
Signed-off-by: Jiasheng Jiang <jiasheng@iscas.ac.cn>
Reviewed-by: Abhinav Kumar <quic_abhinavk@quicinc.com>
Patchwork: https://patchwork.freedesktop.org/patch/514160/
Link: https://lore.kernel.org/r/20221206080236.43687-1-jiasheng@iscas.ac.cn
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-3220
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1818
---
 drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c
index 4752f08f0884..5852e1d356e1 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c
@@ -1477,6 +1477,8 @@ static int dpu_crtc_atomic_check(struct drm_crtc *crtc,
 	}
 
 	pstates = kzalloc(sizeof(*pstates) * DPU_STAGE_MAX * 4, GFP_KERNEL);
+	if (!pstates)
+		return -ENOMEM;
 
 	dpu_crtc = to_dpu_crtc(crtc);
 	cstate = to_dpu_crtc_state(state);
-- 
Gitee


From 41e59f04eeda2a890143029c2edb0aed6459bcc0 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:49 +0800
Subject: [PATCH 26/58] x86/bugs: Flush IBP in ib_prctl_set()

ANBZ: #5369

commit a664ec9158eeddd75121d39c9a0758016097fa96 upstream.

We missed the window between the TIF flag update and the next reschedule.

Signed-off-by: Rodrigo Branco <bsdaemon@google.com>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: <stable@vger.kernel.org>

Fixes: CVE-2023-0045
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1741
---
 arch/x86/kernel/cpu/bugs.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index fe960ebd54a5..00dd932d386a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1589,6 +1589,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
 		if (ctrl == PR_SPEC_FORCE_DISABLE)
 			task_set_spec_ib_force_disable(task);
 		task_update_spec_tif(task);
+		if (task == current)
+			indirect_branch_prediction_barrier();
 		break;
 	default:
 		return -ERANGE;
-- 
Gitee


From a536012d0a76e2229e223f015635304128c1768d Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:50 +0800
Subject: [PATCH 27/58] drm/vmwgfx: Validate the box size for the snooped 
 cursor

ANBZ: #3944

commit 4cf949c7fafe21e085a4ee386bb2dade9067316e upstream.

Invalid userspace dma surface copies could potentially overflow
the memcpy from the surface to the snooped image leading to crashes.
To fix it the dimensions of the copybox have to be validated
against the expected size of the snooped cursor.

Signed-off-by: Zack Rusin <zackr@vmware.com>
Fixes: 2ac863719e51 ("vmwgfx: Snoop DMA transfers with non-covering sizes")
Cc: <stable@vger.kernel.org> # v3.2+
Reviewed-by: Michael Banack <banackm@vmware.com>
Reviewed-by: Martin Krastev <krastevm@vmware.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221026031936.1004280-1-zack@kde.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fixes: CVE-2022-36280
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1736
---
 drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
index 248d92c85cf6..befef87e0bb8 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
@@ -179,7 +179,8 @@ void vmw_kms_cursor_snoop(struct vmw_surface *srf,
 	if (cmd->dma.guest.ptr.offset % PAGE_SIZE ||
 	    box->x != 0    || box->y != 0    || box->z != 0    ||
 	    box->srcx != 0 || box->srcy != 0 || box->srcz != 0 ||
-	    box->d != 1    || box_count != 1) {
+	    box->d != 1    || box_count != 1 ||
+	    box->w > 64 || box->h > 64) {
 		/* TODO handle none page aligned offsets */
 		/* TODO handle more dst & src != 0 */
 		/* TODO handle more then one copy */
-- 
Gitee


From 8372d66270354caf740a832fd096fe2943450bf4 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:51 +0800
Subject: [PATCH 28/58] i2c: xgene-slimpro: Fix out-of-bounds bug in 
 xgene_slimpro_i2c_xfer()

ANBZ: #4901

commit 92fbb6d1296f81f41f65effd7f5f8c0f74943d15 upstream.

The data->block[0] variable comes from user and is a number between
0-255. Without proper check, the variable may be very large to cause
an out-of-bounds when performing memcpy in slimpro_i2c_blkwr.

Fix this bug by checking the value of writelen.

Fixes: f6505fbabc42 ("i2c: add SLIMpro I2C device driver on APM X-Gene platform")
Signed-off-by: Wei Chen <harperchen1110@gmail.com>
Cc: stable@vger.kernel.org
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fixes: CVE-2023-2194
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1735
---
 drivers/i2c/busses/i2c-xgene-slimpro.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/i2c/busses/i2c-xgene-slimpro.c b/drivers/i2c/busses/i2c-xgene-slimpro.c
index a7ac746018ad..7a746f413535 100644
--- a/drivers/i2c/busses/i2c-xgene-slimpro.c
+++ b/drivers/i2c/busses/i2c-xgene-slimpro.c
@@ -321,6 +321,9 @@ static int slimpro_i2c_blkwr(struct slimpro_i2c_dev *ctx, u32 chip,
 	u32 msg[3];
 	int rc;
 
+	if (writelen > I2C_SMBUS_BLOCK_MAX)
+		return -EINVAL;
+
 	memcpy(ctx->dma_buffer, data, writelen);
 	paddr = dma_map_single(ctx->dev, ctx->dma_buffer, writelen,
 			       DMA_TO_DEVICE);
-- 
Gitee


From df685b2b9d0690d299f3de7404fe53ca26693028 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:52 +0800
Subject: [PATCH 29/58] media: dvb-usb: az6027: fix null-ptr-deref in 
 az6027_i2c_xfer()

ANBZ: #4812

commit 0ed554fd769a19ea8464bb83e9ac201002ef74ad upstream.

Wei Chen reports a kernel bug as blew:

general protection fault, probably for non-canonical address
KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017]
...
Call Trace:
<TASK>
__i2c_transfer+0x77e/0x1930 drivers/i2c/i2c-core-base.c:2109
i2c_transfer+0x1d5/0x3d0 drivers/i2c/i2c-core-base.c:2170
i2cdev_ioctl_rdwr+0x393/0x660 drivers/i2c/i2c-dev.c:297
i2cdev_ioctl+0x75d/0x9f0 drivers/i2c/i2c-dev.c:458
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:870 [inline]
__se_sys_ioctl+0xfb/0x170 fs/ioctl.c:856
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fd834a8bded

In az6027_i2c_xfer(), if msg[i].addr is 0x99,
a null-ptr-deref will caused when accessing msg[i].buf.
For msg[i].len is 0 and msg[i].buf is null.

Fix this by checking msg[i].len in az6027_i2c_xfer().

Link: https://lore.kernel.org/lkml/CAO4mrfcPHB5aQJO=mpqV+p8mPLNg-Fok0gw8gZ=zemAfMGTzMg@mail.gmail.com/

Link: https://lore.kernel.org/linux-media/20221120065918.2160782-1-zhongbaisong@huawei.com
Fixes: 76f9a820c867 ("V4L/DVB: AZ6027: Initial import of the driver")
Reported-by: Wei Chen <harperchen1110@gmail.com>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-28328
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1600
---
 drivers/media/usb/dvb-usb/az6027.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/media/usb/dvb-usb/az6027.c b/drivers/media/usb/dvb-usb/az6027.c
index 6321b8e30261..555c8ac44881 100644
--- a/drivers/media/usb/dvb-usb/az6027.c
+++ b/drivers/media/usb/dvb-usb/az6027.c
@@ -977,6 +977,10 @@ static int az6027_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[], int n
 		if (msg[i].addr == 0x99) {
 			req = 0xBE;
 			index = 0;
+			if (msg[i].len < 1) {
+				i = -EOPNOTSUPP;
+				break;
+			}
 			value = msg[i].buf[0] & 0x00ff;
 			length = 1;
 			az6027_usb_out_op(d, req, value, index, data, length);
-- 
Gitee


From 7e626df92e1d33227450f350f4fb6e83be0f4602 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:52 +0800
Subject: [PATCH 30/58] media: uvcvideo: Avoid cyclic entity chains due to 
 malformed USB descriptors

ANBZ: #4485

commit 68035c80e129c4cfec659aac4180354530b26527 upstream.

Way back in 2017, fuzzing the 4.14-rc2 USB stack with syzkaller kicked
up the following WARNING from the UVC chain scanning code:

  | list_add double add: new=ffff880069084010, prev=ffff880069084010,
  | next=ffff880067d22298.
  | ------------[ cut here ]------------
  | WARNING: CPU: 1 PID: 1846 at lib/list_debug.c:31 __list_add_valid+0xbd/0xf0
  | Modules linked in:
  | CPU: 1 PID: 1846 Comm: kworker/1:2 Not tainted
  | 4.14.0-rc2-42613-g1488251d1a98 #238
  | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  | Workqueue: usb_hub_wq hub_event
  | task: ffff88006b01ca40 task.stack: ffff880064358000
  | RIP: 0010:__list_add_valid+0xbd/0xf0 lib/list_debug.c:29
  | RSP: 0018:ffff88006435ddd0 EFLAGS: 00010286
  | RAX: 0000000000000058 RBX: ffff880067d22298 RCX: 0000000000000000
  | RDX: 0000000000000058 RSI: ffffffff85a58800 RDI: ffffed000c86bbac
  | RBP: ffff88006435dde8 R08: 1ffff1000c86ba52 R09: 0000000000000000
  | R10: 0000000000000002 R11: 0000000000000000 R12: ffff880069084010
  | R13: ffff880067d22298 R14: ffff880069084010 R15: ffff880067d222a0
  | FS:  0000000000000000(0000) GS:ffff88006c900000(0000) knlGS:0000000000000000
  | CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  | CR2: 0000000020004ff2 CR3: 000000006b447000 CR4: 00000000000006e0
  | Call Trace:
  |  __list_add ./include/linux/list.h:59
  |  list_add_tail+0x8c/0x1b0 ./include/linux/list.h:92
  |  uvc_scan_chain_forward.isra.8+0x373/0x416
  | drivers/media/usb/uvc/uvc_driver.c:1471
  |  uvc_scan_chain drivers/media/usb/uvc/uvc_driver.c:1585
  |  uvc_scan_device drivers/media/usb/uvc/uvc_driver.c:1769
  |  uvc_probe+0x77f2/0x8f00 drivers/media/usb/uvc/uvc_driver.c:2104

Looking into the output from usbmon, the interesting part is the
following data packet:

  ffff880069c63e00 30710169 C Ci:1:002:0 0 143 = 09028f00 01030080
  00090403 00000e01 00000924 03000103 7c003328 010204db

If we drop the lead configuration and interface descriptors, we're left
with an output terminal descriptor describing a generic display:

  /* Output terminal descriptor */
  buf[0]	09
  buf[1]	24
  buf[2]	03	/* UVC_VC_OUTPUT_TERMINAL */
  buf[3]	00	/* ID */
  buf[4]	01	/* type == 0x0301 (UVC_OTT_DISPLAY) */
  buf[5]	03
  buf[6]	7c
  buf[7]	00	/* source ID refers to self! */
  buf[8]	33

The problem with this descriptor is that it is self-referential: the
source ID of 0 matches itself! This causes the 'struct uvc_entity'
representing the display to be added to its chain list twice during
'uvc_scan_chain()': once via 'uvc_scan_chain_entity()' when it is
processed directly from the 'dev->entities' list and then again
immediately afterwards when trying to follow the source ID in
'uvc_scan_chain_forward()'

Add a check before adding an entity to a chain list to ensure that the
entity is not already part of a chain.

Cc: <stable@vger.kernel.org>
Fixes: c0efd232929c ("V4L/DVB (8145a): USB Video Class driver")
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: shi.xue <shi.xue@zte.com.cn>
Fixes:CVE-2020-0404
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1439
---
 drivers/media/usb/uvc/uvc_driver.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c
index 063e229ead5e..38c73cdbef70 100644
--- a/drivers/media/usb/uvc/uvc_driver.c
+++ b/drivers/media/usb/uvc/uvc_driver.c
@@ -1482,6 +1482,11 @@ static int uvc_scan_chain_forward(struct uvc_video_chain *chain,
 			break;
 		if (forward == prev)
 			continue;
+		if (forward->chain.next || forward->chain.prev) {
+			uvc_trace(UVC_TRACE_DESCR, "Found reference to "
+				"entity %d already in chain.\n", forward->id);
+			return -EINVAL;
+		}
 
 		switch (UVC_ENTITY_TYPE(forward)) {
 		case UVC_VC_EXTENSION_UNIT:
@@ -1563,6 +1568,13 @@ static int uvc_scan_chain_backward(struct uvc_video_chain *chain,
 				return -1;
 			}
 
+			if (term->chain.next || term->chain.prev) {
+				uvc_trace(UVC_TRACE_DESCR, "Found reference to "
+					"entity %d already in chain.\n",
+					term->id);
+				return -EINVAL;
+			}
+
 			if (uvc_trace_param & UVC_TRACE_PROBE)
 				printk(KERN_CONT " %d", term->id);
 
-- 
Gitee


From 2b6d6ece15e1de98aab47675f6f7fd0e98b4acdd Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:53 +0800
Subject: [PATCH 31/58] xirc2ps_cs: Fix use after free bug in xirc2ps_detach

ANBZ: #4685

commit e8d20c3ded59a092532513c9bd030d1ea66f5f44 upstream.

In xirc2ps_probe, the local->tx_timeout_task was bounded
with xirc2ps_tx_timeout_task. When timeout occurs,
it will call xirc_tx_timeout->schedule_work to start the
work.

When we call xirc2ps_detach to remove the driver, there
may be a sequence as follows:

Stop responding to timeout tasks and complete scheduled
tasks before cleanup in xirc2ps_detach, which will fix
the problem.

CPU0                  CPU1

                    |xirc2ps_tx_timeout_task
xirc2ps_detach      |
  free_netdev       |
    kfree(dev);     |
                    |
                    | do_reset
                    |   //use dev

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Zheng Wang <zyytlz.wz@163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-1670
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1603
---
 drivers/net/ethernet/xircom/xirc2ps_cs.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c
index fd5288ff53b5..e3438cef5f9c 100644
--- a/drivers/net/ethernet/xircom/xirc2ps_cs.c
+++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c
@@ -503,6 +503,11 @@ static void
 xirc2ps_detach(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
+    struct local_info *local = netdev_priv(dev);
+
+    netif_carrier_off(dev);
+    netif_tx_disable(dev);
+    cancel_work_sync(&local->tx_timeout_task);
 
     dev_dbg(&link->dev, "detach\n");
 
-- 
Gitee


From 857e5460716f049a4d4c1ad0f342895e237aa947 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:54 +0800
Subject: [PATCH 32/58] prlimit: do_prlimit needs to have a speculation check

ANBZ: #4933

commit 739790605705ddcf18f21782b9c99ad7d53a8c11 upstream.

do_prlimit() adds the user-controlled resource value to a pointer that
will subsequently be dereferenced.  In order to help prevent this
codepath from being used as a spectre "gadget" a barrier needs to be
added after checking the range.

Reported-by: Jordy Zomer <jordyzomer@google.com>
Tested-by: Jordy Zomer <jordyzomer@google.com>
Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fixes: CVE-2023-0458
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1731
---
 kernel/sys.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sys.c b/kernel/sys.c
index 6f7a0c9e83f3..3c990df3498e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1531,6 +1531,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 
 	if (resource >= RLIM_NLIMITS)
 		return -EINVAL;
+	resource = array_index_nospec(resource, RLIM_NLIMITS);
+
 	if (new_rlim) {
 		if (new_rlim->rlim_cur > new_rlim->rlim_max)
 			return -EINVAL;
-- 
Gitee


From 818ef06844e9943c243c4eb716247d848ba5d743 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:55 +0800
Subject: [PATCH 33/58] nfc: st-nci: Fix use after free bug in ndlc_remove due 
 to race condition

ANBZ: #4780

commit 5000fe6c27827a61d8250a7e4a1d26c3298ef4f6 upstream.

This bug influences both st_nci_i2c_remove and st_nci_spi_remove.
Take st_nci_i2c_remove as an example.

In st_nci_i2c_probe, it called ndlc_probe and bound &ndlc->sm_work
with llt_ndlc_sm_work.

When it calls ndlc_recv or timeout handler, it will finally call
schedule_work to start the work.

When we call st_nci_i2c_remove to remove the driver, there
may be a sequence as follows:

Fix it by finishing the work before cleanup in ndlc_remove

CPU0                  CPU1

                    |llt_ndlc_sm_work
st_nci_i2c_remove   |
  ndlc_remove       |
     st_nci_remove  |
     nci_free_device|
     kfree(ndev)    |
//free ndlc->ndev   |
                    |llt_ndlc_rcv_queue
                    |nci_recv_frame
                    |//use ndlc->ndev

Fixes: 35630df68d60 ("NFC: st21nfcb: Add driver for STMicroelectronics ST21NFCB NFC chip")
Signed-off-by: Zheng Wang <zyytlz.wz@163.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20230312160837.2040857-1-zyytlz.wz@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-1990
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1578
---
 drivers/nfc/st-nci/ndlc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c
index f26d938d240f..12d73f9dbe9f 100644
--- a/drivers/nfc/st-nci/ndlc.c
+++ b/drivers/nfc/st-nci/ndlc.c
@@ -297,13 +297,15 @@ EXPORT_SYMBOL(ndlc_probe);
 
 void ndlc_remove(struct llt_ndlc *ndlc)
 {
-	st_nci_remove(ndlc->ndev);
-
 	/* cancel timers */
 	del_timer_sync(&ndlc->t1_timer);
 	del_timer_sync(&ndlc->t2_timer);
 	ndlc->t2_active = false;
 	ndlc->t1_active = false;
+	/* cancel work */
+	cancel_work_sync(&ndlc->sm_work);
+
+	st_nci_remove(ndlc->ndev);
 
 	skb_queue_purge(&ndlc->rcv_q);
 	skb_queue_purge(&ndlc->send_q);
-- 
Gitee


From ee1c51d6790a40c5dc3a575a77e2a6c917d08e52 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:56 +0800
Subject: [PATCH 34/58] HID: roccat: Fix use-after-free in roccat_read()

ANBZ: #5372

commit cacdb14b1c8d3804a3a7d31773bc7569837b71a4 upstream.

roccat_report_event() is responsible for registering
roccat-related reports in struct roccat_device.

int roccat_report_event(int minor, u8 const *data)
{
	struct roccat_device *device;
	struct roccat_reader *reader;
	struct roccat_report *report;
	uint8_t *new_value;

	device = devices[minor];

	new_value = kmemdup(data, device->report_size, GFP_ATOMIC);
	if (!new_value)
		return -ENOMEM;

	report = &device->cbuf[device->cbuf_end];

	/* passing NULL is safe */
	kfree(report->value);
	...

The registered report is stored in the struct roccat_device member
"struct roccat_report cbuf[ROCCAT_CBUF_SIZE];".
If more reports are received than the "ROCCAT_CBUF_SIZE" value,
kfree() the saved report from cbuf[0] and allocates a new reprot.
Since there is no lock when this kfree() is performed,
kfree() can be performed even while reading the saved report.

static ssize_t roccat_read(struct file *file, char __user *buffer,
		size_t count, loff_t *ppos)
{
	struct roccat_reader *reader = file->private_data;
	struct roccat_device *device = reader->device;
	struct roccat_report *report;
	ssize_t retval = 0, len;
	DECLARE_WAITQUEUE(wait, current);

	mutex_lock(&device->cbuf_lock);

	...

	report = &device->cbuf[reader->cbuf_start];
	/*
	 * If report is larger than requested amount of data, rest of report
	 * is lost!
	 */
	len = device->report_size > count ? count : device->report_size;

	if (copy_to_user(buffer, report->value, len)) {
		retval = -EFAULT;
		goto exit_unlock;
	}
	...

The roccat_read() function receives the device->cbuf report and
delivers it to the user through copy_to_user().
If the N+ROCCAT_CBUF_SIZE th report is received while copying of
the Nth report->value is in progress, the pointer that copy_to_user()
is working on is kfree()ed and UAF read may occur. (race condition)

Since the device node of this driver does not set separate permissions,
this is not a security vulnerability, but because it is used for
requesting screen display of profile or dpi settings,
a user using the roccat device can apply udev to this device node or
There is a possibility to use it by giving.

Signed-off-by: Hyunwoo Kim <imv4bel@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2022-41850
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1727
---
 drivers/hid/hid-roccat.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/hid/hid-roccat.c b/drivers/hid/hid-roccat.c
index 5be8de70c651..c9cec00b4e6e 100644
--- a/drivers/hid/hid-roccat.c
+++ b/drivers/hid/hid-roccat.c
@@ -260,6 +260,8 @@ int roccat_report_event(int minor, u8 const *data)
 	if (!new_value)
 		return -ENOMEM;
 
+	mutex_lock(&device->cbuf_lock);
+
 	report = &device->cbuf[device->cbuf_end];
 
 	/* passing NULL is safe */
@@ -279,6 +281,8 @@ int roccat_report_event(int minor, u8 const *data)
 			reader->cbuf_start = (reader->cbuf_start + 1) % ROCCAT_CBUF_SIZE;
 	}
 
+	mutex_unlock(&device->cbuf_lock);
+
 	wake_up_interruptible(&device->wait);
 	return 0;
 }
-- 
Gitee


From 738c4f91f6774eaa778371f5aea7b6aa43ac9cb2 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:57 +0800
Subject: [PATCH 35/58] hwmon: (xgene) Fix use after free bug in 
 xgene_hwmon_remove due to race condition

ANBZ: #4787

commit cb090e64cf25602b9adaf32d5dfc9c8bec493cd1 upstream.

In xgene_hwmon_probe, &ctx->workq is bound with xgene_hwmon_evt_work.
Then it will be started.

If we remove the driver which will call xgene_hwmon_remove to clean up,
there may be unfinished work.

The possible sequence is as follows:

Fix it by finishing the work before cleanup in xgene_hwmon_remove.

CPU0                  CPU1

                    |xgene_hwmon_evt_work
xgene_hwmon_remove   |
kfifo_free(&ctx->async_msg_fifo);|
                    |
                    |kfifo_out_spinlocked
                    |//use &ctx->async_msg_fifo
Fixes: 2ca492e22cb7 ("hwmon: (xgene) Fix crash when alarm occurs before driver probe")
Signed-off-by: Zheng Wang <zyytlz.wz@163.com>
Link: https://lore.kernel.org/r/20230310084007.1403388-1-zyytlz.wz@163.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-1855
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1571
---
 drivers/hwmon/xgene-hwmon.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c
index a3cd91f23267..2dd19a420305 100644
--- a/drivers/hwmon/xgene-hwmon.c
+++ b/drivers/hwmon/xgene-hwmon.c
@@ -780,6 +780,7 @@ static int xgene_hwmon_remove(struct platform_device *pdev)
 {
 	struct xgene_hwmon_dev *ctx = platform_get_drvdata(pdev);
 
+	cancel_work_sync(&ctx->workq);
 	hwmon_device_unregister(ctx->hwmon_dev);
 	kfifo_free(&ctx->async_msg_fifo);
 	if (acpi_disabled)
-- 
Gitee


From 0ea32ec5746e7be537c9b791d1e8da68537ce922 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:58 +0800
Subject: [PATCH 36/58] staging: rtl8712: fix use after free bugs

ANBZ: #4597

commit e230a4455ac3e9b112f0367d1b8e255e141afae0 upstream.

_Read/Write_MACREG callbacks are NULL so the read/write_macreg_hdl()
functions don't do anything except free the "pcmd" pointer.  It
results in a use after free.  Delete them.

Fixes: 2865d42c78a9 ("staging: r8712u: Add the new driver to the mainline kernel")
Cc: stable <stable@kernel.org>
Reported-by: Zheng Wang <hackerzheng666@gmail.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: https://lore.kernel.org/r/Yw4ASqkYcUhUfoY2@kili
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fixes: CVE-2022-4095
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1522
---
 drivers/staging/rtl8712/rtl8712_cmd.c | 36 ---------------------------
 1 file changed, 36 deletions(-)

diff --git a/drivers/staging/rtl8712/rtl8712_cmd.c b/drivers/staging/rtl8712/rtl8712_cmd.c
index 63bc811681d9..fb092d4ec521 100644
--- a/drivers/staging/rtl8712/rtl8712_cmd.c
+++ b/drivers/staging/rtl8712/rtl8712_cmd.c
@@ -129,34 +129,6 @@ static void r871x_internal_cmd_hdl(struct _adapter *padapter, u8 *pbuf)
 	kfree(pdrvcmd->pbuf);
 }
 
-static u8 read_macreg_hdl(struct _adapter *padapter, u8 *pbuf)
-{
-	void (*pcmd_callback)(struct _adapter *dev, struct cmd_obj	*pcmd);
-	struct cmd_obj *pcmd  = (struct cmd_obj *)pbuf;
-
-	/*  invoke cmd->callback function */
-	pcmd_callback = cmd_callback[pcmd->cmdcode].callback;
-	if (!pcmd_callback)
-		r8712_free_cmd_obj(pcmd);
-	else
-		pcmd_callback(padapter, pcmd);
-	return H2C_SUCCESS;
-}
-
-static u8 write_macreg_hdl(struct _adapter *padapter, u8 *pbuf)
-{
-	void (*pcmd_callback)(struct _adapter *dev, struct cmd_obj	*pcmd);
-	struct cmd_obj *pcmd  = (struct cmd_obj *)pbuf;
-
-	/*  invoke cmd->callback function */
-	pcmd_callback = cmd_callback[pcmd->cmdcode].callback;
-	if (!pcmd_callback)
-		r8712_free_cmd_obj(pcmd);
-	else
-		pcmd_callback(padapter, pcmd);
-	return H2C_SUCCESS;
-}
-
 static u8 read_bbreg_hdl(struct _adapter *padapter, u8 *pbuf)
 {
 	struct cmd_obj *pcmd  = (struct cmd_obj *)pbuf;
@@ -225,14 +197,6 @@ static struct cmd_obj *cmd_hdl_filter(struct _adapter *padapter,
 	pcmd_r = NULL;
 
 	switch (pcmd->cmdcode) {
-	case GEN_CMD_CODE(_Read_MACREG):
-		read_macreg_hdl(padapter, (u8 *)pcmd);
-		pcmd_r = pcmd;
-		break;
-	case GEN_CMD_CODE(_Write_MACREG):
-		write_macreg_hdl(padapter, (u8 *)pcmd);
-		pcmd_r = pcmd;
-		break;
 	case GEN_CMD_CODE(_Read_BBREG):
 		read_bbreg_hdl(padapter, (u8 *)pcmd);
 		break;
-- 
Gitee


From 02d00e6394563e4d1722321512e3abe1561d1e57 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:58 +0800
Subject: [PATCH 37/58] HID: check empty report_list in hid_validate_values()

ANBZ: #4539

commit b12fece4c64857e5fab4290bf01b2e0317a88456 upstream.

Add a check for empty report_list in hid_validate_values().
The missing check causes a type confusion when issuing a list_entry()
on an empty report_list.
The problem is caused by the assumption that the device must
have valid report_list. While this will be true for all normal HID
devices, a suitably malicious device can violate the assumption.

Fixes: 1b15d2e5b807 ("HID: core: fix validation of report id 0")
Signed-off-by: Pietro Borrello <borrello@diag.uniroma1.it>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-1073
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1515
---
 drivers/hid/hid-core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index b0c8fae7f903..01010a28332e 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -970,8 +970,8 @@ struct hid_report *hid_validate_values(struct hid_device *hid,
 		 * Validating on id 0 means we should examine the first
 		 * report in the list.
 		 */
-		report = list_entry(
-				hid->report_enum[type].report_list.next,
+		report = list_first_entry_or_null(
+				&hid->report_enum[type].report_list,
 				struct hid_report, list);
 	} else {
 		report = hid->report_enum[type].report_id_hash[id];
-- 
Gitee


From a00fb34e8f2b345b09ada8f809ebd54a640cd87d Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:44:59 +0800
Subject: [PATCH 38/58] phy: tegra: xusb: Fix return value of 
 tegra_xusb_find_port_node function

ANBZ: #4383

commit 045a31b95509c8f25f5f04ec5e0dec5cd09f2c5f upstream.

callers of tegra_xusb_find_port_node() function only do NULL checking for
the return value. return NULL instead of ERR_PTR(-ENOMEM) to keep
consistent.

Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Link: https://lore.kernel.org/r/20211213020507.1458-1-linmq006@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>

Fixes: CVE-2023-23000
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1518
---
 drivers/phy/tegra/xusb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
index de1b4ebe4de2..9e8fa1834a15 100644
--- a/drivers/phy/tegra/xusb.c
+++ b/drivers/phy/tegra/xusb.c
@@ -441,7 +441,7 @@ tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type,
 	name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
 	if (!name) {
 		of_node_put(ports);
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 	}
 	np = of_get_child_by_name(ports, name);
 	kfree(name);
-- 
Gitee


From a3e0333ec992cb8a962ba8b78520d8663ac99938 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:00 +0800
Subject: [PATCH 39/58] power: supply: da9150: Fix use after free bug in 
 da9150_charger_remove due to race condition

ANBZ: #4782

commit 06615d11cc78162dfd5116efb71f29eb29502d37 upstream.

In da9150_charger_probe, &charger->otg_work is bound with
da9150_charger_otg_work. da9150_charger_otg_ncb may be
called to start the work.

If we remove the module which will call da9150_charger_remove
to make cleanup, there may be a unfinished work. The possible
sequence is as follows:

Fix it by canceling the work before cleanup in the da9150_charger_remove

CPU0                  CPUc1

                    |da9150_charger_otg_work
da9150_charger_remove      |
power_supply_unregister  |
device_unregister   |
power_supply_dev_release|
kfree(psy)          |
                    |
                    | 	power_supply_changed(charger->usb);
                    |   //use

Fixes: c1a281e34dae ("power: Add support for DA9150 Charger")
Signed-off-by: Zheng Wang <zyytlz.wz@163.com>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-30772
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1576
---
 drivers/power/supply/da9150-charger.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/power/supply/da9150-charger.c b/drivers/power/supply/da9150-charger.c
index 60099815296e..b2d38eb32288 100644
--- a/drivers/power/supply/da9150-charger.c
+++ b/drivers/power/supply/da9150-charger.c
@@ -666,6 +666,7 @@ static int da9150_charger_remove(struct platform_device *pdev)
 
 	if (!IS_ERR_OR_NULL(charger->usb_phy))
 		usb_unregister_notifier(charger->usb_phy, &charger->otg_nb);
+	cancel_work_sync(&charger->otg_work);
 
 	power_supply_unregister(charger->battery);
 	power_supply_unregister(charger->usb);
-- 
Gitee


From e87154da167453f6e673c9e64d1133e0dc4a7819 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:01 +0800
Subject: [PATCH 40/58] Bluetooth: btsdio: fix use after free bug in 
 btsdio_remove due to unfinished work

ANBZ: #4777

commit 1e9ac114c4428fdb7ff4635b45d4f46017e8916f upstream.

In btsdio_probe, &data->work was bound with btsdio_work.In
btsdio_send_frame, it was started by schedule_work.

If we call btsdio_remove with an unfinished job, there may
be a race condition and cause UAF bug on hdev.

Fixes: ddbaf13e3609 ("[Bluetooth] Add generic driver for Bluetooth SDIO devices")
Signed-off-by: Zheng Wang <zyytlz.wz@163.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>

Fixes: CVE-2023-1989
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1580
---
 drivers/bluetooth/btsdio.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
index 20142bc77554..bd55bf7a9914 100644
--- a/drivers/bluetooth/btsdio.c
+++ b/drivers/bluetooth/btsdio.c
@@ -353,6 +353,7 @@ static void btsdio_remove(struct sdio_func *func)
 
 	BT_DBG("func %p", func);
 
+	cancel_work_sync(&data->work);
 	if (!data)
 		return;
 
-- 
Gitee


From 57145fcf130a2133f67209c9298215f94830ca87 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:02 +0800
Subject: [PATCH 41/58] usb: gadget: rndis: prevent integer overflow in 
 rndis_set_response()

ANBZ: #4931

commit 65f3324f4b6fed78b8761c3b74615ecf0ffa81fa upstream.

If "BufOffset" is very large the "BufOffset + 8" operation can have an
integer overflow.

Cc: stable@kernel.org
Fixes: 38ea1eac7d88 ("usb: gadget: rndis: check size of RNDIS_MSG_SET command")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: https://lore.kernel.org/r/20220301080424.GA17208@kili
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fixes: CVE-2022-20423
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1733
---
 drivers/usb/gadget/function/rndis.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c
index 6b5848a2c7e9..c2c45208abc5 100644
--- a/drivers/usb/gadget/function/rndis.c
+++ b/drivers/usb/gadget/function/rndis.c
@@ -640,6 +640,7 @@ static int rndis_set_response(struct rndis_params *params,
 	BufLength = le32_to_cpu(buf->InformationBufferLength);
 	BufOffset = le32_to_cpu(buf->InformationBufferOffset);
 	if ((BufLength > RNDIS_MAX_TOTAL_SIZE) ||
+	    (BufOffset > RNDIS_MAX_TOTAL_SIZE) ||
 	    (BufOffset + 8 >= RNDIS_MAX_TOTAL_SIZE))
 		    return -EINVAL;
 
-- 
Gitee


From 72908e4b71e723a4be182e0e63204ca96c825a0c Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:03 +0800
Subject: [PATCH 42/58] drm/i915/gvt: fix double free bug in 
 split_2MB_gtt_entry

ANBZ: #4285

commit 4a61648af68f5ba4884f0e3b494ee1cabc4b6620 upstream.

If intel_gvt_dma_map_guest_page failed, it will call
ppgtt_invalidate_spt, which will finally free the spt.
But the caller function ppgtt_populate_spt_by_guest_entry
does not notice that, it will free spt again in its error
path.

Fix this by canceling the mapping of DMA address and freeing sub_spt.
Besides, leave the handle of spt destroy to caller function instead
of callee function when error occurs.

Fixes: b901b252b6cf ("drm/i915/gvt: Add 2M huge gtt support")
Signed-off-by: Zheng Wang <zyytlz.wz@163.com>
Reviewed-by: Zhenyu Wang <zhenyuw@linux.intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20221229165641.1192455-1-zyytlz.wz@163.com
Signed-off-by: Ovidiu Panait <ovidiu.panait@eng.windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fixes: CVE-2022-3707
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1521
---
 drivers/gpu/drm/i915/gvt/gtt.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 40b32b4d1d98..afbc648befec 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -1155,10 +1155,8 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu,
 	for_each_shadow_entry(sub_spt, &sub_se, sub_index) {
 		ret = intel_gvt_hypervisor_dma_map_guest_page(vgpu,
 				start_gfn + sub_index, PAGE_SIZE, &dma_addr);
-		if (ret) {
-			ppgtt_invalidate_spt(spt);
-			return ret;
-		}
+		if (ret)
+			goto err;
 		sub_se.val64 = se->val64;
 
 		/* Copy the PAT field from PDE. */
@@ -1177,6 +1175,17 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu,
 	ops->set_pfn(se, sub_spt->shadow_page.mfn);
 	ppgtt_set_shadow_entry(spt, se, index);
 	return 0;
+err:
+	/* Cancel the existing addess mappings of DMA addr. */
+	for_each_present_shadow_entry(sub_spt, &sub_se, sub_index) {
+		gvt_vdbg_mm("invalidate 4K entry\n");
+		ppgtt_invalidate_pte(sub_spt, &sub_se);
+	}
+	/* Release the new allocated spt. */
+	trace_spt_change(sub_spt->vgpu->id, "release", sub_spt,
+		sub_spt->guest_page.gfn, sub_spt->shadow_page.type);
+	ppgtt_free_spt(sub_spt);
+	return ret;
 }
 
 static int split_64KB_gtt_entry(struct intel_vgpu *vgpu,
-- 
Gitee


From 0a668f41264ce4c186d2c5428e3604757a292a26 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:04 +0800
Subject: [PATCH 43/58] kernel/relay.c: fix read_pos error when multiple 
 readers

ANBZ: #5684

commit 341a7213e5c1ce274cc0f02270054905800ea660 upstream.

When reading, read_pos should start with bytes_consumed, not file->f_pos.
Because when there is more than one reader, the read_pos corresponding to
file->f_pos may have been consumed, which will cause the data that has
been consumed to be read and the bytes_consumed update error.

Signed-off-by: Pengcheng Yang <yangpc@wangsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jann Horn <jannh@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>e
Link: http://lkml.kernel.org/r/1579691175-28949-1-git-send-email-yangpc@wangsu.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Fixes: CVE-2023-3268
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1836
---
 kernel/relay.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/kernel/relay.c b/kernel/relay.c
index 13c19f39e31e..e79c89429ff8 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -996,14 +996,14 @@ static void relay_file_read_consume(struct rchan_buf *buf,
 /*
  *	relay_file_read_avail - boolean, are there unconsumed bytes available?
  */
-static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+static int relay_file_read_avail(struct rchan_buf *buf)
 {
 	size_t subbuf_size = buf->chan->subbuf_size;
 	size_t n_subbufs = buf->chan->n_subbufs;
 	size_t produced = buf->subbufs_produced;
 	size_t consumed = buf->subbufs_consumed;
 
-	relay_file_read_consume(buf, read_pos, 0);
+	relay_file_read_consume(buf, 0, 0);
 
 	consumed = buf->subbufs_consumed;
 
@@ -1064,23 +1064,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
 
 /**
  *	relay_file_read_start_pos - find the first available byte to read
- *	@read_pos: file read position
  *	@buf: relay channel buffer
  *
- *	If the @read_pos is in the middle of padding, return the
+ *	If the read_pos is in the middle of padding, return the
  *	position of the first actually available byte, otherwise
  *	return the original value.
  */
-static size_t relay_file_read_start_pos(size_t read_pos,
-					struct rchan_buf *buf)
+static size_t relay_file_read_start_pos(struct rchan_buf *buf)
 {
 	size_t read_subbuf, padding, padding_start, padding_end;
 	size_t subbuf_size = buf->chan->subbuf_size;
 	size_t n_subbufs = buf->chan->n_subbufs;
 	size_t consumed = buf->subbufs_consumed % n_subbufs;
+	size_t read_pos = consumed * subbuf_size + buf->bytes_consumed;
 
-	if (!read_pos)
-		read_pos = consumed * subbuf_size + buf->bytes_consumed;
 	read_subbuf = read_pos / subbuf_size;
 	padding = buf->padding[read_subbuf];
 	padding_start = (read_subbuf + 1) * subbuf_size - padding;
@@ -1136,10 +1133,10 @@ static ssize_t relay_file_read(struct file *filp,
 	do {
 		void *from;
 
-		if (!relay_file_read_avail(buf, *ppos))
+		if (!relay_file_read_avail(buf))
 			break;
 
-		read_start = relay_file_read_start_pos(*ppos, buf);
+		read_start = relay_file_read_start_pos(buf);
 		avail = relay_file_read_subbuf_avail(read_start, buf);
 		if (!avail)
 			break;
-- 
Gitee


From deb1fea78a839b9e43f014a8d3865b1067fff9f7 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:05 +0800
Subject: [PATCH 44/58] relayfs: fix out-of-bounds access in relay_file_read

ANBZ: #5684

commit 43ec16f1450f4936025a9bdf1a273affdb9732c1 upstream.

There is a crash in relay_file_read, as the var from
point to the end of last subbuf.

The oops looks something like:
pc : __arch_copy_to_user+0x180/0x310
lr : relay_file_read+0x20c/0x2c8
Call trace:
 __arch_copy_to_user+0x180/0x310
 full_proxy_read+0x68/0x98
 vfs_read+0xb0/0x1d0
 ksys_read+0x6c/0xf0
 __arm64_sys_read+0x20/0x28
 el0_svc_common.constprop.3+0x84/0x108
 do_el0_svc+0x74/0x90
 el0_svc+0x1c/0x28
 el0_sync_handler+0x88/0xb0
 el0_sync+0x148/0x180

We get the condition by analyzing the vmcore:

1). The last produced byte and last consumed byte
    both at the end of the last subbuf

2). A softirq calls function(e.g __blk_add_trace)
    to write relay buffer occurs when an program is calling
    relay_file_read_avail().

        relay_file_read
                relay_file_read_avail
                        relay_file_read_consume(buf, 0, 0);
                        //interrupted by softirq who will write subbuf
                        ....
                        return 1;
                //read_start point to the end of the last subbuf
                read_start = relay_file_read_start_pos
                //avail is equal to subsize
                avail = relay_file_read_subbuf_avail
                //from  points to an invalid memory address
                from = buf->start + read_start
                //system is crashed
                copy_to_user(buffer, from, avail)

Link: https://lkml.kernel.org/r/20230419040203.37676-1-zhang.zhengming@h3c.com
Fixes: 8d62fdebdaf9 ("relay file read: start-pos fix")
Signed-off-by: Zhang Zhengming <zhang.zhengming@h3c.com>
Reviewed-by: Zhao Lei <zhao_lei1@hoperun.com>
Reviewed-by: Zhou Kete <zhou.kete@h3c.com>
Reviewed-by: Pengcheng Yang <yangpc@wangsu.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Fixes: CVE-2023-3268
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1836
---
 kernel/relay.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/relay.c b/kernel/relay.c
index e79c89429ff8..b8703edb50dd 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1076,7 +1076,8 @@ static size_t relay_file_read_start_pos(struct rchan_buf *buf)
 	size_t subbuf_size = buf->chan->subbuf_size;
 	size_t n_subbufs = buf->chan->n_subbufs;
 	size_t consumed = buf->subbufs_consumed % n_subbufs;
-	size_t read_pos = consumed * subbuf_size + buf->bytes_consumed;
+	size_t read_pos = (consumed * subbuf_size + buf->bytes_consumed)
+			% (n_subbufs * subbuf_size);
 
 	read_subbuf = read_pos / subbuf_size;
 	padding = buf->padding[read_subbuf];
-- 
Gitee


From 983a3ff6ea009e66f89f2a96af6a8d9b519ba89d Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:05 +0800
Subject: [PATCH 45/58] bpf/verifier: display non-spill stack slot types in 
 print_verifier_state

ANBZ: #5530

commit 8efea21d333d21e1f9177579ffdc69556314f603 upstream

If a stack slot does not hold a spilled register (STACK_SPILL), then each
 of its eight bytes could potentially have a different slot_type.  This
 information can be important for debugging, and previously we either did
 not print anything for the stack slot, or just printed fp-X=0 in the case
 where its first byte was STACK_ZERO.
Instead, print eight characters with either 0 (STACK_ZERO), m (STACK_MISC)
 or ? (STACK_INVALID) for any stack slot which is neither STACK_SPILL nor
 entirely STACK_INVALID.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 kernel/bpf/verifier.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9b74115452c6..5ef094796925 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -263,6 +263,13 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
+static char slot_type_char[] = {
+	[STACK_INVALID]	= '?',
+	[STACK_SPILL]	= 'r',
+	[STACK_MISC]	= 'm',
+	[STACK_ZERO]	= '0',
+};
+
 static void print_liveness(struct bpf_verifier_env *env,
 			   enum bpf_reg_liveness live)
 {
@@ -349,15 +356,26 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		}
 	}
 	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		if (state->stack[i].slot_type[0] == STACK_SPILL) {
-			verbose(env, " fp%d",
-				(-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, state->stack[i].spilled_ptr.live);
+		char types_buf[BPF_REG_SIZE + 1];
+		bool valid = false;
+		int j;
+
+		for (j = 0; j < BPF_REG_SIZE; j++) {
+			if (state->stack[i].slot_type[j] != STACK_INVALID)
+				valid = true;
+			types_buf[j] = slot_type_char[
+					state->stack[i].slot_type[j]];
+		}
+		types_buf[BPF_REG_SIZE] = 0;
+		if (!valid)
+			continue;
+		verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+		print_liveness(env, state->stack[i].spilled_ptr.live);
+		if (state->stack[i].slot_type[0] == STACK_SPILL)
 			verbose(env, "=%s",
 				reg_type_str[state->stack[i].spilled_ptr.type]);
-		}
-		if (state->stack[i].slot_type[0] == STACK_ZERO)
-			verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
+		else
+			verbose(env, "=%s", types_buf);
 	}
 	verbose(env, "\n");
 }
-- 
Gitee


From 9ff6b18220a7351c67a00750f5892eb22aeeda72 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:07 +0800
Subject: [PATCH 46/58] tools/bpf: move bpf/lib netlink related functions into 
 a new file

ANBZ: #5530

commit f7010770fbac47b1fc9fb723b1d2019eb23c04f2 upstream

There are no functionality change for this patch.

In the subsequent patches, more netlink related library functions
will be added and a separate file is better than cluttering bpf.c.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 tools/lib/bpf/Build     |   2 +-
 tools/lib/bpf/bpf.c     | 129 -------------------------------
 tools/lib/bpf/netlink.c | 165 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+), 130 deletions(-)
 create mode 100644 tools/lib/bpf/netlink.c

diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index 6eb9bacd1948..7bc31c905018 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o
+libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 482025b72839..eba3ad604244 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -28,16 +28,8 @@
 #include <linux/bpf.h>
 #include "bpf.h"
 #include "libbpf.h"
-#include "nlattr.h"
-#include <linux/rtnetlink.h>
-#include <linux/if_link.h>
-#include <sys/socket.h>
 #include <errno.h>
 
-#ifndef SOL_NETLINK
-#define SOL_NETLINK 270
-#endif
-
 /*
  * When building perf, unistd.h is overridden. __NR_bpf is
  * required to be defined explicitly.
@@ -521,127 +513,6 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
 	return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
 }
 
-int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
-{
-	struct sockaddr_nl sa;
-	int sock, seq = 0, len, ret = -1;
-	char buf[4096];
-	struct nlattr *nla, *nla_xdp;
-	struct {
-		struct nlmsghdr  nh;
-		struct ifinfomsg ifinfo;
-		char             attrbuf[64];
-	} req;
-	struct nlmsghdr *nh;
-	struct nlmsgerr *err;
-	socklen_t addrlen;
-	int one = 1;
-
-	memset(&sa, 0, sizeof(sa));
-	sa.nl_family = AF_NETLINK;
-
-	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	if (sock < 0) {
-		return -errno;
-	}
-
-	if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
-		       &one, sizeof(one)) < 0) {
-		fprintf(stderr, "Netlink error reporting not supported\n");
-	}
-
-	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
-		ret = -errno;
-		goto cleanup;
-	}
-
-	addrlen = sizeof(sa);
-	if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
-		ret = -errno;
-		goto cleanup;
-	}
-
-	if (addrlen != sizeof(sa)) {
-		ret = -LIBBPF_ERRNO__INTERNAL;
-		goto cleanup;
-	}
-
-	memset(&req, 0, sizeof(req));
-	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
-	req.nh.nlmsg_type = RTM_SETLINK;
-	req.nh.nlmsg_pid = 0;
-	req.nh.nlmsg_seq = ++seq;
-	req.ifinfo.ifi_family = AF_UNSPEC;
-	req.ifinfo.ifi_index = ifindex;
-
-	/* started nested attribute for XDP */
-	nla = (struct nlattr *)(((char *)&req)
-				+ NLMSG_ALIGN(req.nh.nlmsg_len));
-	nla->nla_type = NLA_F_NESTED | IFLA_XDP;
-	nla->nla_len = NLA_HDRLEN;
-
-	/* add XDP fd */
-	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
-	nla_xdp->nla_type = IFLA_XDP_FD;
-	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
-	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
-	nla->nla_len += nla_xdp->nla_len;
-
-	/* if user passed in any flags, add those too */
-	if (flags) {
-		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
-		nla_xdp->nla_type = IFLA_XDP_FLAGS;
-		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
-		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
-		nla->nla_len += nla_xdp->nla_len;
-	}
-
-	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
-
-	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
-		ret = -errno;
-		goto cleanup;
-	}
-
-	len = recv(sock, buf, sizeof(buf), 0);
-	if (len < 0) {
-		ret = -errno;
-		goto cleanup;
-	}
-
-	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
-	     nh = NLMSG_NEXT(nh, len)) {
-		if (nh->nlmsg_pid != sa.nl_pid) {
-			ret = -LIBBPF_ERRNO__WRNGPID;
-			goto cleanup;
-		}
-		if (nh->nlmsg_seq != seq) {
-			ret = -LIBBPF_ERRNO__INVSEQ;
-			goto cleanup;
-		}
-		switch (nh->nlmsg_type) {
-		case NLMSG_ERROR:
-			err = (struct nlmsgerr *)NLMSG_DATA(nh);
-			if (!err->error)
-				continue;
-			ret = err->error;
-			nla_dump_errormsg(nh);
-			goto cleanup;
-		case NLMSG_DONE:
-			break;
-		default:
-			break;
-		}
-	}
-
-	ret = 0;
-
-cleanup:
-	close(sock);
-	return ret;
-}
-
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 		 bool do_log)
 {
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
new file mode 100644
index 000000000000..ccaa991fe9d8
--- /dev/null
+++ b/tools/lib/bpf/netlink.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: LGPL-2.1
+/* Copyright (c) 2018 Facebook */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <linux/rtnetlink.h>
+#include <sys/socket.h>
+#include <errno.h>
+#include <time.h>
+
+#include "bpf.h"
+#include "libbpf.h"
+#include "nlattr.h"
+
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+static int bpf_netlink_open(__u32 *nl_pid)
+{
+	struct sockaddr_nl sa;
+	socklen_t addrlen;
+	int one = 1, ret;
+	int sock;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0)
+		return -errno;
+
+	if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
+		       &one, sizeof(one)) < 0) {
+		fprintf(stderr, "Netlink error reporting not supported\n");
+	}
+
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	addrlen = sizeof(sa);
+	if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	if (addrlen != sizeof(sa)) {
+		ret = -LIBBPF_ERRNO__INTERNAL;
+		goto cleanup;
+	}
+
+	*nl_pid = sa.nl_pid;
+	return sock;
+
+cleanup:
+	close(sock);
+	return ret;
+}
+
+static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq)
+{
+	struct nlmsgerr *err;
+	struct nlmsghdr *nh;
+	char buf[4096];
+	int len, ret;
+
+	while (1) {
+		len = recv(sock, buf, sizeof(buf), 0);
+		if (len < 0) {
+			ret = -errno;
+			goto done;
+		}
+
+		for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+		     nh = NLMSG_NEXT(nh, len)) {
+			if (nh->nlmsg_pid != nl_pid) {
+				ret = -LIBBPF_ERRNO__WRNGPID;
+				goto done;
+			}
+			if (nh->nlmsg_seq != seq) {
+				ret = -LIBBPF_ERRNO__INVSEQ;
+				goto done;
+			}
+			switch (nh->nlmsg_type) {
+			case NLMSG_ERROR:
+				err = (struct nlmsgerr *)NLMSG_DATA(nh);
+				if (!err->error)
+					continue;
+				ret = err->error;
+				nla_dump_errormsg(nh);
+				goto done;
+			case NLMSG_DONE:
+				return 0;
+			default:
+				break;
+			}
+		}
+	}
+	ret = 0;
+done:
+	return ret;
+}
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
+{
+	int sock, seq = 0, ret;
+	struct nlattr *nla, *nla_xdp;
+	struct {
+		struct nlmsghdr  nh;
+		struct ifinfomsg ifinfo;
+		char             attrbuf[64];
+	} req;
+	__u32 nl_pid;
+
+	sock = bpf_netlink_open(&nl_pid);
+	if (sock < 0)
+		return sock;
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_type = RTM_SETLINK;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.ifinfo.ifi_family = AF_UNSPEC;
+	req.ifinfo.ifi_index = ifindex;
+
+	/* started nested attribute for XDP */
+	nla = (struct nlattr *)(((char *)&req)
+				+ NLMSG_ALIGN(req.nh.nlmsg_len));
+	nla->nla_type = NLA_F_NESTED | IFLA_XDP;
+	nla->nla_len = NLA_HDRLEN;
+
+	/* add XDP fd */
+	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+	nla_xdp->nla_type = IFLA_XDP_FD;
+	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+	nla->nla_len += nla_xdp->nla_len;
+
+	/* if user passed in any flags, add those too */
+	if (flags) {
+		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+		nla_xdp->nla_type = IFLA_XDP_FLAGS;
+		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
+		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
+		nla->nla_len += nla_xdp->nla_len;
+	}
+
+	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+	ret = bpf_netlink_recv(sock, nl_pid, seq);
+
+cleanup:
+	close(sock);
+	return ret;
+}
-- 
Gitee


From c2c5e66ef5535197e3180fc2bd9932dbd60cb5c9 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:08 +0800
Subject: [PATCH 47/58] tools/bpf: add more netlink functionalities in lib/bpf

ANBZ: #5530

commit 36f1678d9e0b5d2e0236046d9659e0348b4719a8 upstream

This patch added a few netlink attribute parsing functions
and the netlink API functions to query networking links, tc classes,
tc qdiscs and tc filters. For example, the following API is
to get networking links:
  int nl_get_link(int sock, unsigned int nl_pid,
                  dump_nlmsg_t dump_link_nlmsg,
                  void *cookie);

Note that when the API is called, the user also provided a
callback function with the following signature:
  int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);

The "cookie" is the parameter the user passed to the API and will
be available for the callback function.
The "msg" is the information about the result, e.g., ifinfomsg or
tcmsg. The "tb" is the parsed netlink attributes.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 tools/lib/bpf/libbpf.h       |  16 ++++
 tools/lib/bpf/libbpf_errno.c |   1 +
 tools/lib/bpf/netlink.c      | 165 ++++++++++++++++++++++++++++++++++-
 tools/lib/bpf/nlattr.c       |  33 ++++---
 tools/lib/bpf/nlattr.h       |  38 ++++++++
 5 files changed, 238 insertions(+), 15 deletions(-)

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 96c55fac54c3..e3b00e23e181 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -46,6 +46,7 @@ enum libbpf_errno {
 	LIBBPF_ERRNO__PROGTYPE,	/* Kernel doesn't support this program type */
 	LIBBPF_ERRNO__WRNGPID,	/* Wrong pid in netlink message */
 	LIBBPF_ERRNO__INVSEQ,	/* Invalid netlink sequence */
+	LIBBPF_ERRNO__NLPARSE,	/* netlink parsing error */
 	__LIBBPF_ERRNO__END,
 };
 
@@ -297,4 +298,19 @@ int bpf_perf_event_read_simple(void *mem, unsigned long size,
 			       unsigned long page_size,
 			       void **buf, size_t *buf_len,
 			       bpf_perf_event_print_t fn, void *priv);
+
+struct nlmsghdr;
+struct nlattr;
+typedef int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);
+typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t,
+			      void *cookie);
+int bpf_netlink_open(unsigned int *nl_pid);
+int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
+		void *cookie);
+int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
+		 dump_nlmsg_t dump_class_nlmsg, void *cookie);
+int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
+		 dump_nlmsg_t dump_qdisc_nlmsg, void *cookie);
+int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
+		  dump_nlmsg_t dump_filter_nlmsg, void *cookie);
 #endif
diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c
index d2d17226d9d6..451283328fa9 100644
--- a/tools/lib/bpf/libbpf_errno.c
+++ b/tools/lib/bpf/libbpf_errno.c
@@ -43,6 +43,7 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {
 	[ERRCODE_OFFSET(PROGTYPE)]	= "Kernel doesn't support this program type",
 	[ERRCODE_OFFSET(WRNGPID)]	= "Wrong pid in netlink message",
 	[ERRCODE_OFFSET(INVSEQ)]	= "Invalid netlink sequence",
+	[ERRCODE_OFFSET(NLPARSE)]	= "Incorrect netlink message parsing",
 };
 
 int libbpf_strerror(int err, char *buf, size_t size)
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index ccaa991fe9d8..469e068dd0c5 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -18,7 +18,7 @@
 #define SOL_NETLINK 270
 #endif
 
-static int bpf_netlink_open(__u32 *nl_pid)
+int bpf_netlink_open(__u32 *nl_pid)
 {
 	struct sockaddr_nl sa;
 	socklen_t addrlen;
@@ -61,7 +61,9 @@ static int bpf_netlink_open(__u32 *nl_pid)
 	return ret;
 }
 
-static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq)
+static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq,
+			    __dump_nlmsg_t _fn, dump_nlmsg_t fn,
+			    void *cookie)
 {
 	struct nlmsgerr *err;
 	struct nlmsghdr *nh;
@@ -98,6 +100,11 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq)
 			default:
 				break;
 			}
+			if (_fn) {
+				ret = _fn(nh, fn, cookie);
+				if (ret)
+					return ret;
+			}
 		}
 	}
 	ret = 0;
@@ -157,9 +164,161 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 		ret = -errno;
 		goto cleanup;
 	}
-	ret = bpf_netlink_recv(sock, nl_pid, seq);
+	ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL);
 
 cleanup:
 	close(sock);
 	return ret;
 }
+
+static int __dump_link_nlmsg(struct nlmsghdr *nlh, dump_nlmsg_t dump_link_nlmsg,
+			     void *cookie)
+{
+	struct nlattr *tb[IFLA_MAX + 1], *attr;
+	struct ifinfomsg *ifi = NLMSG_DATA(nlh);
+	int len;
+
+	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+	attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi)));
+	if (nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	return dump_link_nlmsg(cookie, ifi, tb);
+}
+
+int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
+		void *cookie)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct ifinfomsg ifm;
+	} req = {
+		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+		.nlh.nlmsg_type = RTM_GETLINK,
+		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.ifm.ifi_family = AF_PACKET,
+	};
+	int seq = time(NULL);
+
+	req.nlh.nlmsg_seq = seq;
+	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg,
+				dump_link_nlmsg, cookie);
+}
+
+static int __dump_class_nlmsg(struct nlmsghdr *nlh,
+			      dump_nlmsg_t dump_class_nlmsg, void *cookie)
+{
+	struct nlattr *tb[TCA_MAX + 1], *attr;
+	struct tcmsg *t = NLMSG_DATA(nlh);
+	int len;
+
+	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
+	attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
+	if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	return dump_class_nlmsg(cookie, t, tb);
+}
+
+int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
+		 dump_nlmsg_t dump_class_nlmsg, void *cookie)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct tcmsg t;
+	} req = {
+		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+		.nlh.nlmsg_type = RTM_GETTCLASS,
+		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.t.tcm_family = AF_UNSPEC,
+		.t.tcm_ifindex = ifindex,
+	};
+	int seq = time(NULL);
+
+	req.nlh.nlmsg_seq = seq;
+	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, __dump_class_nlmsg,
+				dump_class_nlmsg, cookie);
+}
+
+static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh,
+			      dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
+{
+	struct nlattr *tb[TCA_MAX + 1], *attr;
+	struct tcmsg *t = NLMSG_DATA(nlh);
+	int len;
+
+	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
+	attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
+	if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	return dump_qdisc_nlmsg(cookie, t, tb);
+}
+
+int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
+		 dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct tcmsg t;
+	} req = {
+		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+		.nlh.nlmsg_type = RTM_GETQDISC,
+		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.t.tcm_family = AF_UNSPEC,
+		.t.tcm_ifindex = ifindex,
+	};
+	int seq = time(NULL);
+
+	req.nlh.nlmsg_seq = seq;
+	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, __dump_qdisc_nlmsg,
+				dump_qdisc_nlmsg, cookie);
+}
+
+static int __dump_filter_nlmsg(struct nlmsghdr *nlh,
+			       dump_nlmsg_t dump_filter_nlmsg, void *cookie)
+{
+	struct nlattr *tb[TCA_MAX + 1], *attr;
+	struct tcmsg *t = NLMSG_DATA(nlh);
+	int len;
+
+	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
+	attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
+	if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	return dump_filter_nlmsg(cookie, t, tb);
+}
+
+int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
+		  dump_nlmsg_t dump_filter_nlmsg, void *cookie)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct tcmsg t;
+	} req = {
+		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+		.nlh.nlmsg_type = RTM_GETTFILTER,
+		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.t.tcm_family = AF_UNSPEC,
+		.t.tcm_ifindex = ifindex,
+		.t.tcm_parent = handle,
+	};
+	int seq = time(NULL);
+
+	req.nlh.nlmsg_seq = seq;
+	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, __dump_filter_nlmsg,
+				dump_filter_nlmsg, cookie);
+}
diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
index 4719434278b2..49f514119bdb 100644
--- a/tools/lib/bpf/nlattr.c
+++ b/tools/lib/bpf/nlattr.c
@@ -26,11 +26,6 @@ static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = {
 	[NLA_FLAG]	= 0,
 };
 
-static int nla_len(const struct nlattr *nla)
-{
-	return nla->nla_len - NLA_HDRLEN;
-}
-
 static struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
 {
 	int totlen = NLA_ALIGN(nla->nla_len);
@@ -46,11 +41,6 @@ static int nla_ok(const struct nlattr *nla, int remaining)
 	       nla->nla_len <= remaining;
 }
 
-static void *nla_data(const struct nlattr *nla)
-{
-	return (char *) nla + NLA_HDRLEN;
-}
-
 static int nla_type(const struct nlattr *nla)
 {
 	return nla->nla_type & NLA_TYPE_MASK;
@@ -114,8 +104,8 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh)
  * @see nla_validate
  * @return 0 on success or a negative error code.
  */
-static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
-		     struct nla_policy *policy)
+int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
+	      struct nla_policy *policy)
 {
 	struct nlattr *nla;
 	int rem, err;
@@ -146,6 +136,25 @@ static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int
 	return err;
 }
 
+/**
+ * Create attribute index based on nested attribute
+ * @arg tb              Index array to be filled (maxtype+1 elements).
+ * @arg maxtype         Maximum attribute type expected and accepted.
+ * @arg nla             Nested Attribute.
+ * @arg policy          Attribute validation policy.
+ *
+ * Feeds the stream of attributes nested into the specified attribute
+ * to nla_parse().
+ *
+ * @see nla_parse
+ * @return 0 on success or a negative error code.
+ */
+int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+		     struct nla_policy *policy)
+{
+	return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy);
+}
+
 /* dump netlink extended ack error message */
 int nla_dump_errormsg(struct nlmsghdr *nlh)
 {
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h
index 931a71f68f93..a6e2396bce7c 100644
--- a/tools/lib/bpf/nlattr.h
+++ b/tools/lib/bpf/nlattr.h
@@ -67,6 +67,44 @@ struct nla_policy {
 	     nla_ok(pos, rem); \
 	     pos = nla_next(pos, &(rem)))
 
+/**
+ * nla_data - head of payload
+ * @nla: netlink attribute
+ */
+static inline void *nla_data(const struct nlattr *nla)
+{
+	return (char *) nla + NLA_HDRLEN;
+}
+
+static inline uint8_t nla_getattr_u8(const struct nlattr *nla)
+{
+	return *(uint8_t *)nla_data(nla);
+}
+
+static inline uint32_t nla_getattr_u32(const struct nlattr *nla)
+{
+	return *(uint32_t *)nla_data(nla);
+}
+
+static inline const char *nla_getattr_str(const struct nlattr *nla)
+{
+	return (const char *)nla_data(nla);
+}
+
+/**
+ * nla_len - length of payload
+ * @nla: netlink attribute
+ */
+static inline int nla_len(const struct nlattr *nla)
+{
+	return nla->nla_len - NLA_HDRLEN;
+}
+
+int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
+	      struct nla_policy *policy);
+int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+		     struct nla_policy *policy);
+
 int nla_dump_errormsg(struct nlmsghdr *nlh);
 
 #endif /* __NLATTR_H */
-- 
Gitee


From 37c3d5ff798df8e2809412109e6ba279bf265781 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:09 +0800
Subject: [PATCH 48/58] tools/bpf: bpftool: add net support

ANBZ: #5530

commit f6f3bac08ff9855d803081a353a1fafaa8845739 upstream

Add "bpftool net" support. Networking devices are enumerated
to dump device index/name associated with xdp progs.

For each networking device, tc classes and qdiscs are enumerated
in order to check their bpf filters.
In addition, root handle and clsact ingress/egress are also checked for
bpf filters.  Not all filter information is printed out. Only ifindex,
kind, filter name, prog_id and tag are printed out, which are good
enough to show attachment information. If the filter action
is a bpf action, its bpf program id, bpf name and tag will be
printed out as well.

For example,
  $ ./bpftool net
  xdp [
  ifindex 2 devname eth0 prog_id 198
  ]
  tc_filters [
  ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb]
            prog_id 111727 tag d08fe3b4319bc2fd act []
  ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp
            prog_id 130246 tag 3f265c7f26db62c9 act []
  ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact]
            prog_id 111726 tag 99a197826974c876
  ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp
            prog_id 108619 tag dc4630674fd72dcc act []
  ifindex 2 kind qdisc_clsact_egress name fbflow_egress
            prog_id 130245 tag 72d2d830d6888d2c
  ]
  $ ./bpftool -jp net
  [{
        "xdp": [{
                "ifindex": 2,
                "devname": "eth0",
                "prog_id": 198
            }
        ],
        "tc_filters": [{
                "ifindex": 2,
                "kind": "qdisc_htb",
                "name": "prefix_matcher.o:[cls_prefix_matcher_htb]",
                "prog_id": 111727,
                "tag": "d08fe3b4319bc2fd",
                "act": []
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_ingress",
                "name": "fbflow_icmp",
                "prog_id": 130246,
                "tag": "3f265c7f26db62c9",
                "act": []
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_egress",
                "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]",
                "prog_id": 111726,
                "tag": "99a197826974c876"
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_egress",
                "name": "cls_fg_dscp",
                "prog_id": 108619,
                "tag": "dc4630674fd72dcc",
                "act": []
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_egress",
                "name": "fbflow_egress",
                "prog_id": 130245,
                "tag": "72d2d830d6888d2c"
            }
        ]
    }
  ]

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 .../bpf/bpftool/Documentation/bpftool-net.rst | 133 ++++++++++
 tools/bpf/bpftool/Documentation/bpftool.rst   |   6 +-
 tools/bpf/bpftool/bash-completion/bpftool     |  17 +-
 tools/bpf/bpftool/main.c                      |   3 +-
 tools/bpf/bpftool/main.h                      |   7 +
 tools/bpf/bpftool/net.c                       | 233 ++++++++++++++++++
 tools/bpf/bpftool/netlink_dumper.c            | 181 ++++++++++++++
 tools/bpf/bpftool/netlink_dumper.h            | 103 ++++++++
 8 files changed, 676 insertions(+), 7 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-net.rst
 create mode 100644 tools/bpf/bpftool/net.c
 create mode 100644 tools/bpf/bpftool/netlink_dumper.c
 create mode 100644 tools/bpf/bpftool/netlink_dumper.h

diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
new file mode 100644
index 000000000000..48a61837a264
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -0,0 +1,133 @@
+================
+bpftool-net
+================
+-------------------------------------------------------------------------------
+tool for inspection of netdev/tc related bpf prog attachments
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **net** *COMMAND*
+
+	*OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+	*COMMANDS* :=
+	{ **show** | **list** } [ **dev** name ] | **help**
+
+NET COMMANDS
+============
+
+|	**bpftool** **net { show | list } [ dev name ]**
+|	**bpftool** **net help**
+
+DESCRIPTION
+===========
+	**bpftool net { show | list } [ dev name ]**
+		  List all networking device driver and tc attachment in the system.
+
+                  Output will start with all xdp program attachment, followed by
+                  all tc class/qdisc bpf program attachments. Both xdp programs and
+                  tc programs are ordered based on ifindex number. If multiple bpf
+                  programs attached to the same networking device through **tc filter**,
+                  the order will be first all bpf programs attached to tc classes, then
+                  all bpf programs attached to non clsact qdiscs, and finally all
+                  bpf programs attached to root and clsact qdisc.
+
+	**bpftool net help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-v, --version
+		  Print version number (similar to **bpftool version**).
+
+	-j, --json
+		  Generate JSON output. For commands that cannot produce JSON, this
+		  option has no effect.
+
+	-p, --pretty
+		  Generate human-readable JSON output. Implies **-j**.
+
+EXAMPLES
+========
+
+| **# bpftool net**
+
+::
+
+      xdp [
+      ifindex 2 devname eth0 prog_id 198
+      ]
+      tc_filters [
+      ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb]
+                prog_id 111727 tag d08fe3b4319bc2fd act []
+      ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp
+                prog_id 130246 tag 3f265c7f26db62c9 act []
+      ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact]
+                prog_id 111726 tag 99a197826974c876
+      ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp
+                prog_id 108619 tag dc4630674fd72dcc act []
+      ifindex 2 kind qdisc_clsact_egress name fbflow_egress
+                prog_id 130245 tag 72d2d830d6888d2c
+      ]
+
+|
+| **# bpftool -jp net**
+
+::
+
+    [{
+            "xdp": [{
+                    "ifindex": 2,
+                    "devname": "eth0",
+                    "prog_id": 198
+                }
+            ],
+            "tc_filters": [{
+                    "ifindex": 2,
+                    "kind": "qdisc_htb",
+                    "name": "prefix_matcher.o:[cls_prefix_matcher_htb]",
+                    "prog_id": 111727,
+                    "tag": "d08fe3b4319bc2fd",
+                    "act": []
+                },{
+                    "ifindex": 2,
+                    "kind": "qdisc_clsact_ingress",
+                    "name": "fbflow_icmp",
+                    "prog_id": 130246,
+                    "tag": "3f265c7f26db62c9",
+                    "act": []
+                },{
+                    "ifindex": 2,
+                    "kind": "qdisc_clsact_egress",
+                    "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]",
+                    "prog_id": 111726,
+                    "tag": "99a197826974c876"
+                },{
+                    "ifindex": 2,
+                    "kind": "qdisc_clsact_egress",
+                    "name": "cls_fg_dscp",
+                    "prog_id": 108619,
+                    "tag": "dc4630674fd72dcc",
+                    "act": []
+                },{
+                    "ifindex": 2,
+                    "kind": "qdisc_clsact_egress",
+                    "name": "fbflow_egress",
+                    "prog_id": 130245,
+                    "tag": "72d2d830d6888d2c"
+                }
+            ]
+        }
+    ]
+
+
+SEE ALSO
+========
+	**bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index b6f5d560460d..8dda77daeda9 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -16,7 +16,7 @@ SYNOPSIS
 
 	**bpftool** **version**
 
-	*OBJECT* := { **map** | **program** | **cgroup** | **perf** }
+	*OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** }
 
 	*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
 	| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
@@ -32,6 +32,8 @@ SYNOPSIS
 
 	*PERF-COMMANDS* := { **show** | **list** | **help** }
 
+	*NET-COMMANDS* := { **show** | **list** | **help** }
+
 DESCRIPTION
 ===========
 	*bpftool* allows for inspection and simple modification of BPF objects
@@ -58,4 +60,4 @@ OPTIONS
 SEE ALSO
 ========
 	**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
-        **bpftool-perf**\ (8)
+        **bpftool-perf**\ (8), **bpftool-net**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index c2b6b2176f3b..ec577af2e5f0 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -494,10 +494,10 @@ _bpftool()
                     _filedir
                     return 0
                     ;;
-		tree)
-		    _filedir
-		    return 0
-		    ;;
+                tree)
+                    _filedir
+                    return 0
+                    ;;
                 attach|detach)
                     local ATTACH_TYPES='ingress egress sock_create sock_ops \
                         device bind4 bind6 post_bind4 post_bind6 connect4 \
@@ -552,6 +552,15 @@ _bpftool()
                     ;;
             esac
             ;;
+        net)
+            case $command in
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'help \
+                            show list' -- "$cur" ) )
+                    ;;
+            esac
+            ;;
     esac
 } &&
 complete -F _bpftool bpftool
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index d15a62be6cf0..79dc3f193547 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -85,7 +85,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | cgroup | perf }\n"
+		"       OBJECT := { prog | map | cgroup | perf | net }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -215,6 +215,7 @@ static const struct cmd cmds[] = {
 	{ "map",	do_map },
 	{ "cgroup",	do_cgroup },
 	{ "perf",	do_perf },
+	{ "net",	do_net },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 057a227bdb9f..0a9d2779635c 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -136,6 +136,7 @@ int do_map(int argc, char **arg);
 int do_event_pipe(int argc, char **argv);
 int do_cgroup(int argc, char **arg);
 int do_perf(int argc, char **arg);
+int do_net(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
 int map_parse_fd(int *argc, char ***argv);
@@ -165,4 +166,10 @@ struct btf_dumper {
  */
 int btf_dumper_type(const struct btf_dumper *d, __u32 type_id,
 		    const void *data);
+
+struct nlattr;
+struct ifinfomsg;
+struct tcmsg;
+int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb);
+int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind);
 #endif
diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
new file mode 100644
index 000000000000..77dd73dd9ade
--- /dev/null
+++ b/tools/bpf/bpftool/net.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <libbpf.h>
+#include <net/if.h>
+#include <linux/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_bpf.h>
+#include <sys/socket.h>
+
+#include <bpf.h>
+#include <nlattr.h>
+#include "main.h"
+#include "netlink_dumper.h"
+
+struct bpf_netdev_t {
+	int	*ifindex_array;
+	int	used_len;
+	int	array_len;
+	int	filter_idx;
+};
+
+struct tc_kind_handle {
+	char	kind[64];
+	int	handle;
+};
+
+struct bpf_tcinfo_t {
+	struct tc_kind_handle	*handle_array;
+	int			used_len;
+	int			array_len;
+	bool			is_qdisc;
+};
+
+static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
+{
+	struct bpf_netdev_t *netinfo = cookie;
+	struct ifinfomsg *ifinfo = msg;
+
+	if (netinfo->filter_idx > 0 && netinfo->filter_idx != ifinfo->ifi_index)
+		return 0;
+
+	if (netinfo->used_len == netinfo->array_len) {
+		netinfo->ifindex_array = realloc(netinfo->ifindex_array,
+			(netinfo->array_len + 16) * sizeof(int));
+		netinfo->array_len += 16;
+	}
+	netinfo->ifindex_array[netinfo->used_len++] = ifinfo->ifi_index;
+
+	return do_xdp_dump(ifinfo, tb);
+}
+
+static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb)
+{
+	struct bpf_tcinfo_t *tcinfo = cookie;
+	struct tcmsg *info = msg;
+
+	if (tcinfo->is_qdisc) {
+		/* skip clsact qdisc */
+		if (tb[TCA_KIND] &&
+		    strcmp(nla_data(tb[TCA_KIND]), "clsact") == 0)
+			return 0;
+		if (info->tcm_handle == 0)
+			return 0;
+	}
+
+	if (tcinfo->used_len == tcinfo->array_len) {
+		tcinfo->handle_array = realloc(tcinfo->handle_array,
+			(tcinfo->array_len + 16) * sizeof(struct tc_kind_handle));
+		tcinfo->array_len += 16;
+	}
+	tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle;
+	snprintf(tcinfo->handle_array[tcinfo->used_len].kind,
+		 sizeof(tcinfo->handle_array[tcinfo->used_len].kind),
+		 "%s_%s",
+		 tcinfo->is_qdisc ? "qdisc" : "class",
+		 tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown");
+	tcinfo->used_len++;
+
+	return 0;
+}
+
+static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb)
+{
+	const char *kind = cookie;
+
+	return do_filter_dump((struct tcmsg *)msg, tb, kind);
+}
+
+static int show_dev_tc_bpf(int sock, unsigned int nl_pid, int ifindex)
+{
+	struct bpf_tcinfo_t tcinfo;
+	int i, handle, ret;
+
+	tcinfo.handle_array = NULL;
+	tcinfo.used_len = 0;
+	tcinfo.array_len = 0;
+
+	tcinfo.is_qdisc = false;
+	ret = nl_get_class(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg,
+			   &tcinfo);
+	if (ret)
+		return ret;
+
+	tcinfo.is_qdisc = true;
+	ret = nl_get_qdisc(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg,
+			   &tcinfo);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < tcinfo.used_len; i++) {
+		ret = nl_get_filter(sock, nl_pid, ifindex,
+				    tcinfo.handle_array[i].handle,
+				    dump_filter_nlmsg,
+				    tcinfo.handle_array[i].kind);
+		if (ret)
+			return ret;
+	}
+
+	/* root, ingress and egress handle */
+	handle = TC_H_ROOT;
+	ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg,
+			    "root");
+	if (ret)
+		return ret;
+
+	handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS);
+	ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg,
+			    "qdisc_clsact_ingress");
+	if (ret)
+		return ret;
+
+	handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS);
+	ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg,
+			    "qdisc_clsact_egress");
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+	int i, sock, ret, filter_idx = -1;
+	struct bpf_netdev_t dev_array;
+	unsigned int nl_pid;
+	char err_buf[256];
+
+	if (argc == 2) {
+		if (strcmp(argv[0], "dev") != 0)
+			usage();
+		filter_idx = if_nametoindex(argv[1]);
+		if (filter_idx == 0) {
+			fprintf(stderr, "invalid dev name %s\n", argv[1]);
+			return -1;
+		}
+	} else if (argc != 0) {
+		usage();
+	}
+
+	sock = bpf_netlink_open(&nl_pid);
+	if (sock < 0) {
+		fprintf(stderr, "failed to open netlink sock\n");
+		return -1;
+	}
+
+	dev_array.ifindex_array = NULL;
+	dev_array.used_len = 0;
+	dev_array.array_len = 0;
+	dev_array.filter_idx = filter_idx;
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+	NET_START_OBJECT;
+	NET_START_ARRAY("xdp", "\n");
+	ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array);
+	NET_END_ARRAY("\n");
+
+	if (!ret) {
+		NET_START_ARRAY("tc_filters", "\n");
+		for (i = 0; i < dev_array.used_len; i++) {
+			ret = show_dev_tc_bpf(sock, nl_pid,
+					      dev_array.ifindex_array[i]);
+			if (ret)
+				break;
+		}
+		NET_END_ARRAY("\n");
+	}
+	NET_END_OBJECT;
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	if (ret) {
+		if (json_output)
+			jsonw_null(json_wtr);
+		libbpf_strerror(ret, err_buf, sizeof(err_buf));
+		fprintf(stderr, "Error: %s\n", err_buf);
+	}
+	free(dev_array.ifindex_array);
+	close(sock);
+	return ret;
+}
+
+static int do_help(int argc, char **argv)
+{
+	if (json_output) {
+		jsonw_null(json_wtr);
+		return 0;
+	}
+
+	fprintf(stderr,
+		"Usage: %s %s { show | list } [dev <devname>]\n"
+		"       %s %s help\n",
+		bin_name, argv[-2], bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "list",	do_show },
+	{ "help",	do_help },
+	{ 0 }
+};
+
+int do_net(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c
new file mode 100644
index 000000000000..e12494fd1d2e
--- /dev/null
+++ b/tools/bpf/bpftool/netlink_dumper.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+
+#include <stdlib.h>
+#include <string.h>
+#include <libbpf.h>
+#include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_bpf.h>
+
+#include <nlattr.h>
+#include "main.h"
+#include "netlink_dumper.h"
+
+static void xdp_dump_prog_id(struct nlattr **tb, int attr,
+			     const char *type)
+{
+	if (!tb[attr])
+		return;
+
+	NET_DUMP_UINT(type, nla_getattr_u32(tb[attr]))
+}
+
+static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex,
+			   const char *name)
+{
+	struct nlattr *tb[IFLA_XDP_MAX + 1];
+	unsigned char mode;
+
+	if (nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0)
+		return -1;
+
+	if (!tb[IFLA_XDP_ATTACHED])
+		return 0;
+
+	mode = nla_getattr_u8(tb[IFLA_XDP_ATTACHED]);
+	if (mode == XDP_ATTACHED_NONE)
+		return 0;
+
+	NET_START_OBJECT;
+	NET_DUMP_UINT("ifindex", ifindex);
+
+	if (name)
+		NET_DUMP_STR("devname", name);
+
+	if (tb[IFLA_XDP_PROG_ID])
+		NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[IFLA_XDP_PROG_ID]));
+
+	if (mode == XDP_ATTACHED_MULTI) {
+		xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic_prog_id");
+		xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "drv_prog_id");
+		xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload_prog_id");
+	}
+
+	NET_END_OBJECT_FINAL;
+	return 0;
+}
+
+int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb)
+{
+	if (!tb[IFLA_XDP])
+		return 0;
+
+	return do_xdp_dump_one(tb[IFLA_XDP], ifinfo->ifi_index,
+			       nla_getattr_str(tb[IFLA_IFNAME]));
+}
+
+static char *hexstring_n2a(const unsigned char *str, int len,
+			   char *buf, int blen)
+{
+	char *ptr = buf;
+	int i;
+
+	for (i = 0; i < len; i++) {
+		if (blen < 3)
+			break;
+		sprintf(ptr, "%02x", str[i]);
+		ptr += 2;
+		blen -= 2;
+	}
+	return buf;
+}
+
+static int do_bpf_dump_one_act(struct nlattr *attr)
+{
+	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
+	char buf[256];
+
+	if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	if (!tb[TCA_ACT_BPF_PARMS])
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	NET_START_OBJECT_NESTED2;
+	if (tb[TCA_ACT_BPF_NAME])
+		NET_DUMP_STR("name", nla_getattr_str(tb[TCA_ACT_BPF_NAME]));
+	if (tb[TCA_ACT_BPF_ID])
+		NET_DUMP_UINT("bpf_id", nla_getattr_u32(tb[TCA_ACT_BPF_ID]));
+	if (tb[TCA_ACT_BPF_TAG])
+		NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_ACT_BPF_TAG]),
+						  nla_len(tb[TCA_ACT_BPF_TAG]),
+						  buf, sizeof(buf)));
+	NET_END_OBJECT_NESTED;
+	return 0;
+}
+
+static int do_dump_one_act(struct nlattr *attr)
+{
+	struct nlattr *tb[TCA_ACT_MAX + 1];
+
+	if (!attr)
+		return 0;
+
+	if (nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	if (tb[TCA_ACT_KIND] && strcmp(nla_data(tb[TCA_ACT_KIND]), "bpf") == 0)
+		return do_bpf_dump_one_act(tb[TCA_ACT_OPTIONS]);
+
+	return 0;
+}
+
+static int do_bpf_act_dump(struct nlattr *attr)
+{
+	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+	int act, ret;
+
+	if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	NET_START_ARRAY("act", "");
+	for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) {
+		ret = do_dump_one_act(tb[act]);
+		if (ret)
+			break;
+	}
+	NET_END_ARRAY(" ");
+
+	return ret;
+}
+
+static int do_bpf_filter_dump(struct nlattr *attr)
+{
+	struct nlattr *tb[TCA_BPF_MAX + 1];
+	char buf[256];
+	int ret;
+
+	if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	if (tb[TCA_BPF_NAME])
+		NET_DUMP_STR("name", nla_getattr_str(tb[TCA_BPF_NAME]));
+	if (tb[TCA_BPF_ID])
+		NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[TCA_BPF_ID]));
+	if (tb[TCA_BPF_TAG])
+		NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_BPF_TAG]),
+						  nla_len(tb[TCA_BPF_TAG]),
+						  buf, sizeof(buf)));
+	if (tb[TCA_BPF_ACT]) {
+		ret = do_bpf_act_dump(tb[TCA_BPF_ACT]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind)
+{
+	int ret = 0;
+
+	if (tb[TCA_OPTIONS] && strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) {
+		NET_START_OBJECT;
+		NET_DUMP_UINT("ifindex", info->tcm_ifindex);
+		NET_DUMP_STR("kind", kind);
+		ret = do_bpf_filter_dump(tb[TCA_OPTIONS]);
+		NET_END_OBJECT_FINAL;
+	}
+
+	return ret;
+}
diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h
new file mode 100644
index 000000000000..552d8851ac06
--- /dev/null
+++ b/tools/bpf/bpftool/netlink_dumper.h
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+
+#ifndef _NETLINK_DUMPER_H_
+#define _NETLINK_DUMPER_H_
+
+#define NET_START_OBJECT				\
+{							\
+	if (json_output)				\
+		jsonw_start_object(json_wtr);		\
+}
+
+#define NET_START_OBJECT_NESTED(name)			\
+{							\
+	if (json_output) {				\
+		jsonw_name(json_wtr, name);		\
+		jsonw_start_object(json_wtr);		\
+	} else {					\
+		fprintf(stderr, "%s {", name);		\
+	}						\
+}
+
+#define NET_START_OBJECT_NESTED2			\
+{							\
+	if (json_output)				\
+		jsonw_start_object(json_wtr);		\
+	else						\
+		fprintf(stderr, "{");			\
+}
+
+#define NET_END_OBJECT_NESTED				\
+{							\
+	if (json_output)				\
+		jsonw_end_object(json_wtr);		\
+	else						\
+		fprintf(stderr, "}");			\
+}
+
+#define NET_END_OBJECT					\
+{							\
+	if (json_output)				\
+		jsonw_end_object(json_wtr);		\
+}
+
+#define NET_END_OBJECT_FINAL				\
+{							\
+	if (json_output)				\
+		jsonw_end_object(json_wtr);		\
+	else						\
+		fprintf(stderr, "\n");			\
+}
+
+#define NET_START_ARRAY(name, newline)			\
+{							\
+	if (json_output) {				\
+		jsonw_name(json_wtr, name);		\
+		jsonw_start_array(json_wtr);		\
+	} else {					\
+		fprintf(stderr, "%s [%s", name, newline);\
+	}						\
+}
+
+#define NET_END_ARRAY(endstr)				\
+{							\
+	if (json_output)				\
+		jsonw_end_array(json_wtr);		\
+	else						\
+		fprintf(stderr, "]%s", endstr);		\
+}
+
+#define NET_DUMP_UINT(name, val)			\
+{							\
+	if (json_output)				\
+		jsonw_uint_field(json_wtr, name, val);	\
+	else						\
+		fprintf(stderr, "%s %d ", name, val);	\
+}
+
+#define NET_DUMP_LLUINT(name, val)			\
+{							\
+	if (json_output)				\
+		jsonw_lluint_field(json_wtr, name, val);\
+	else						\
+		fprintf(stderr, "%s %lld ", name, val);	\
+}
+
+#define NET_DUMP_STR(name, str)				\
+{							\
+	if (json_output)				\
+		jsonw_string_field(json_wtr, name, str);\
+	else						\
+		fprintf(stderr, "%s %s ", name, str);	\
+}
+
+#define NET_DUMP_STR_ONLY(str)				\
+{							\
+	if (json_output)				\
+		jsonw_string(json_wtr, str);		\
+	else						\
+		fprintf(stderr, "%s ", str);		\
+}
+
+#endif
-- 
Gitee


From 95289f76909d22e4ce52fca681b812ce4349e287 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:10 +0800
Subject: [PATCH 49/58] tools/bpf: fix a netlink recv issue MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ANBZ: #5530

commit 9d0b3c1f1451d1b9a33de3c70ae3d50ccd77db1a upstream

Commit f7010770fbac ("tools/bpf: move bpf/lib netlink related
functions into a new file") introduced a while loop for the
netlink recv path. This while loop is needed since the
buffer in recv syscall may not be enough to hold all the
information and in such cases multiple recv calls are needed.

There is a bug introduced by the above commit as
the while loop may block on recv syscall if there is no
more messages are expected. The netlink message header
flag NLM_F_MULTI is used to indicate that more messages
are expected and this patch fixed the bug by doing
further recv syscall only if multipart message is expected.

The patch added another fix regarding to message length of 0.
When netlink recv returns message length of 0, there will be
no more messages for returning data so the while loop
can end.

Fixes: f7010770fbac ("tools/bpf: move bpf/lib netlink related functions into a new file")
Reported-by: Björn Töpel <bjorn.topel@intel.com>
Tested-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 tools/lib/bpf/netlink.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index 469e068dd0c5..fde1d7bf8199 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -65,18 +65,23 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq,
 			    __dump_nlmsg_t _fn, dump_nlmsg_t fn,
 			    void *cookie)
 {
+	bool multipart = true;
 	struct nlmsgerr *err;
 	struct nlmsghdr *nh;
 	char buf[4096];
 	int len, ret;
 
-	while (1) {
+	while (multipart) {
+		multipart = false;
 		len = recv(sock, buf, sizeof(buf), 0);
 		if (len < 0) {
 			ret = -errno;
 			goto done;
 		}
 
+		if (len == 0)
+			break;
+
 		for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
 		     nh = NLMSG_NEXT(nh, len)) {
 			if (nh->nlmsg_pid != nl_pid) {
@@ -87,6 +92,8 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq,
 				ret = -LIBBPF_ERRNO__INVSEQ;
 				goto done;
 			}
+			if (nh->nlmsg_flags & NLM_F_MULTI)
+				multipart = true;
 			switch (nh->nlmsg_type) {
 			case NLMSG_ERROR:
 				err = (struct nlmsgerr *)NLMSG_DATA(nh);
-- 
Gitee


From 59394459ea70204704a998994184452da8af68ad Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:11 +0800
Subject: [PATCH 50/58] tools/bpf: bpftool: improve output format for bpftool 
 net

ANBZ: #5530

commit 7900efc19214e326913dc0f0e8ded24adc0018f2 upstream

This is a followup patch for Commit f6f3bac08ff9
("tools/bpf: bpftool: add net support").
Some improvements are made for the bpftool net output.
Specially, plain output is more concise such that
per attachment should nicely fit in one line.
Compared to previous output, the prog tag is removed
since it can be easily obtained with program id.
Similar to xdp attachments, the device name is added
to tc attachments.

The bpf program attached through shared block
mechanism is supported as well.
  $ ip link add dev v1 type veth peer name v2
  $ tc qdisc add dev v1 ingress_block 10 egress_block 20 clsact
  $ tc qdisc add dev v2 ingress_block 10 egress_block 20 clsact
  $ tc filter add block 10 protocol ip prio 25 bpf obj bpf_shared.o sec ingress flowid 1:1
  $ tc filter add block 20 protocol ip prio 30 bpf obj bpf_cyclic.o sec classifier flowid 1:1
  $ bpftool net
  xdp:

  tc:
  v2(7) clsact/ingress bpf_shared.o:[ingress] id 23
  v2(7) clsact/egress bpf_cyclic.o:[classifier] id 24
  v1(8) clsact/ingress bpf_shared.o:[ingress] id 23
  v1(8) clsact/egress bpf_cyclic.o:[classifier] id 24

The documentation and "bpftool net help" are updated
to make it clear that current implementation only
supports xdp and tc attachments. For programs
attached to cgroups, "bpftool cgroup" can be used
to dump attachments. For other programs e.g.
sk_{filter,skb,msg,reuseport} and lwt/seg6,
iproute2 tools should be used.

The new output:
  $ bpftool net
  xdp:
  eth0(2) driver id 198

  tc:
  eth0(2) clsact/ingress fbflow_icmp id 335 act [{icmp_action id 336}]
  eth0(2) clsact/egress fbflow_egress id 334
  $ bpftool -jp net
  [{
        "xdp": [{
                "devname": "eth0",
                "ifindex": 2,
                "mode": "driver",
                "id": 198
            }
        ],
        "tc": [{
                "devname": "eth0",
                "ifindex": 2,
                "kind": "clsact/ingress",
                "name": "fbflow_icmp",
                "id": 335,
               "act": [{
                        "name": "icmp_action",
                        "id": 336
                    }
                ]
            },{
                "devname": "eth0",
                "ifindex": 2,
                "kind": "clsact/egress",
                "name": "fbflow_egress",
                "id": 334
            }
        ]
    }
  ]

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 .../bpf/bpftool/Documentation/bpftool-net.rst |  78 +++++++------
 tools/bpf/bpftool/main.h                      |   3 +-
 tools/bpf/bpftool/net.c                       | 103 ++++++++++++------
 tools/bpf/bpftool/netlink_dumper.c            |  85 +++++++--------
 tools/bpf/bpftool/netlink_dumper.h            |  22 ++--
 5 files changed, 161 insertions(+), 130 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index 48a61837a264..408ec30d8872 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -26,9 +26,20 @@ NET COMMANDS
 DESCRIPTION
 ===========
 	**bpftool net { show | list } [ dev name ]**
-		  List all networking device driver and tc attachment in the system.
-
-                  Output will start with all xdp program attachment, followed by
+                  List bpf program attachments in the kernel networking subsystem.
+
+                  Currently, only device driver xdp attachments and tc filter
+                  classification/action attachments are implemented, i.e., for
+                  program types **BPF_PROG_TYPE_SCHED_CLS**,
+                  **BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**.
+                  For programs attached to a particular cgroup, e.g.,
+                  **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**,
+                  **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+                  users can use **bpftool cgroup** to dump cgroup attachments.
+                  For sk_{filter, skb, msg, reuseport} and lwt/seg6
+                  bpf programs, users should consult other tools, e.g., iproute2.
+
+                  The current output will start with all xdp program attachments, followed by
                   all tc class/qdisc bpf program attachments. Both xdp programs and
                   tc programs are ordered based on ifindex number. If multiple bpf
                   programs attached to the same networking device through **tc filter**,
@@ -61,21 +72,15 @@ EXAMPLES
 
 ::
 
-      xdp [
-      ifindex 2 devname eth0 prog_id 198
-      ]
-      tc_filters [
-      ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb]
-                prog_id 111727 tag d08fe3b4319bc2fd act []
-      ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp
-                prog_id 130246 tag 3f265c7f26db62c9 act []
-      ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact]
-                prog_id 111726 tag 99a197826974c876
-      ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp
-                prog_id 108619 tag dc4630674fd72dcc act []
-      ifindex 2 kind qdisc_clsact_egress name fbflow_egress
-                prog_id 130245 tag 72d2d830d6888d2c
-      ]
+      xdp:
+      eth0(2) driver id 198
+
+      tc:
+      eth0(2) htb name prefix_matcher.o:[cls_prefix_matcher_htb] id 111727 act []
+      eth0(2) clsact/ingress fbflow_icmp id 130246 act []
+      eth0(2) clsact/egress prefix_matcher.o:[cls_prefix_matcher_clsact] id 111726
+      eth0(2) clsact/egress cls_fg_dscp id 108619 act []
+      eth0(2) clsact/egress fbflow_egress id 130245
 
 |
 | **# bpftool -jp net**
@@ -84,44 +89,45 @@ EXAMPLES
 
     [{
             "xdp": [{
-                    "ifindex": 2,
                     "devname": "eth0",
-                    "prog_id": 198
+                    "ifindex": 2,
+                    "mode": "driver",
+                    "id": 198
                 }
             ],
-            "tc_filters": [{
+            "tc": [{
+                    "devname": "eth0",
                     "ifindex": 2,
-                    "kind": "qdisc_htb",
+                    "kind": "htb",
                     "name": "prefix_matcher.o:[cls_prefix_matcher_htb]",
-                    "prog_id": 111727,
-                    "tag": "d08fe3b4319bc2fd",
+                    "id": 111727,
                     "act": []
                 },{
+                    "devname": "eth0",
                     "ifindex": 2,
-                    "kind": "qdisc_clsact_ingress",
+                    "kind": "clsact/ingress",
                     "name": "fbflow_icmp",
-                    "prog_id": 130246,
-                    "tag": "3f265c7f26db62c9",
+                    "id": 130246,
                     "act": []
                 },{
+                    "devname": "eth0",
                     "ifindex": 2,
-                    "kind": "qdisc_clsact_egress",
+                    "kind": "clsact/egress",
                     "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]",
-                    "prog_id": 111726,
-                    "tag": "99a197826974c876"
+                    "id": 111726,
                 },{
+                    "devname": "eth0",
                     "ifindex": 2,
-                    "kind": "qdisc_clsact_egress",
+                    "kind": "clsact/egress",
                     "name": "cls_fg_dscp",
-                    "prog_id": 108619,
-                    "tag": "dc4630674fd72dcc",
+                    "id": 108619,
                     "act": []
                 },{
+                    "devname": "eth0",
                     "ifindex": 2,
-                    "kind": "qdisc_clsact_egress",
+                    "kind": "clsact/egress",
                     "name": "fbflow_egress",
-                    "prog_id": 130245,
-                    "tag": "72d2d830d6888d2c"
+                    "id": 130245,
                 }
             ]
         }
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 0a9d2779635c..38413a16ab92 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -171,5 +171,6 @@ struct nlattr;
 struct ifinfomsg;
 struct tcmsg;
 int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb);
-int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind);
+int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind,
+		   const char *devname, int ifindex);
 #endif
diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
index 77dd73dd9ade..ed205ee57655 100644
--- a/tools/bpf/bpftool/net.c
+++ b/tools/bpf/bpftool/net.c
@@ -2,6 +2,7 @@
 // Copyright (C) 2018 Facebook
 
 #define _GNU_SOURCE
+#include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -17,8 +18,13 @@
 #include "main.h"
 #include "netlink_dumper.h"
 
+struct ip_devname_ifindex {
+	char	devname[64];
+	int	ifindex;
+};
+
 struct bpf_netdev_t {
-	int	*ifindex_array;
+	struct ip_devname_ifindex *devices;
 	int	used_len;
 	int	array_len;
 	int	filter_idx;
@@ -36,6 +42,12 @@ struct bpf_tcinfo_t {
 	bool			is_qdisc;
 };
 
+struct bpf_filter_t {
+	const char	*kind;
+	const char	*devname;
+	int		ifindex;
+};
+
 static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
 {
 	struct bpf_netdev_t *netinfo = cookie;
@@ -45,11 +57,20 @@ static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
 		return 0;
 
 	if (netinfo->used_len == netinfo->array_len) {
-		netinfo->ifindex_array = realloc(netinfo->ifindex_array,
-			(netinfo->array_len + 16) * sizeof(int));
+		netinfo->devices = realloc(netinfo->devices,
+			(netinfo->array_len + 16) *
+			sizeof(struct ip_devname_ifindex));
+		if (!netinfo->devices)
+			return -ENOMEM;
+
 		netinfo->array_len += 16;
 	}
-	netinfo->ifindex_array[netinfo->used_len++] = ifinfo->ifi_index;
+	netinfo->devices[netinfo->used_len].ifindex = ifinfo->ifi_index;
+	snprintf(netinfo->devices[netinfo->used_len].devname,
+		 sizeof(netinfo->devices[netinfo->used_len].devname),
+		 "%s",
+		 tb[IFLA_IFNAME] ? nla_getattr_str(tb[IFLA_IFNAME]) : "");
+	netinfo->used_len++;
 
 	return do_xdp_dump(ifinfo, tb);
 }
@@ -71,13 +92,15 @@ static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb)
 	if (tcinfo->used_len == tcinfo->array_len) {
 		tcinfo->handle_array = realloc(tcinfo->handle_array,
 			(tcinfo->array_len + 16) * sizeof(struct tc_kind_handle));
+		if (!tcinfo->handle_array)
+			return -ENOMEM;
+
 		tcinfo->array_len += 16;
 	}
 	tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle;
 	snprintf(tcinfo->handle_array[tcinfo->used_len].kind,
 		 sizeof(tcinfo->handle_array[tcinfo->used_len].kind),
-		 "%s_%s",
-		 tcinfo->is_qdisc ? "qdisc" : "class",
+		 "%s",
 		 tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown");
 	tcinfo->used_len++;
 
@@ -86,60 +109,71 @@ static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb)
 
 static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb)
 {
-	const char *kind = cookie;
+	const struct bpf_filter_t *filter_info = cookie;
 
-	return do_filter_dump((struct tcmsg *)msg, tb, kind);
+	return do_filter_dump((struct tcmsg *)msg, tb, filter_info->kind,
+			      filter_info->devname, filter_info->ifindex);
 }
 
-static int show_dev_tc_bpf(int sock, unsigned int nl_pid, int ifindex)
+static int show_dev_tc_bpf(int sock, unsigned int nl_pid,
+			   struct ip_devname_ifindex *dev)
 {
+	struct bpf_filter_t filter_info;
 	struct bpf_tcinfo_t tcinfo;
-	int i, handle, ret;
+	int i, handle, ret = 0;
 
 	tcinfo.handle_array = NULL;
 	tcinfo.used_len = 0;
 	tcinfo.array_len = 0;
 
 	tcinfo.is_qdisc = false;
-	ret = nl_get_class(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg,
+	ret = nl_get_class(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg,
 			   &tcinfo);
 	if (ret)
-		return ret;
+		goto out;
 
 	tcinfo.is_qdisc = true;
-	ret = nl_get_qdisc(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg,
+	ret = nl_get_qdisc(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg,
 			   &tcinfo);
 	if (ret)
-		return ret;
+		goto out;
 
+	filter_info.devname = dev->devname;
+	filter_info.ifindex = dev->ifindex;
 	for (i = 0; i < tcinfo.used_len; i++) {
-		ret = nl_get_filter(sock, nl_pid, ifindex,
+		filter_info.kind = tcinfo.handle_array[i].kind;
+		ret = nl_get_filter(sock, nl_pid, dev->ifindex,
 				    tcinfo.handle_array[i].handle,
 				    dump_filter_nlmsg,
-				    tcinfo.handle_array[i].kind);
+				    &filter_info);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
 	/* root, ingress and egress handle */
 	handle = TC_H_ROOT;
-	ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg,
-			    "root");
+	filter_info.kind = "root";
+	ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle,
+			    dump_filter_nlmsg, &filter_info);
 	if (ret)
-		return ret;
+		goto out;
 
 	handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS);
-	ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg,
-			    "qdisc_clsact_ingress");
+	filter_info.kind = "clsact/ingress";
+	ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle,
+			    dump_filter_nlmsg, &filter_info);
 	if (ret)
-		return ret;
+		goto out;
 
 	handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS);
-	ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg,
-			    "qdisc_clsact_egress");
+	filter_info.kind = "clsact/egress";
+	ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle,
+			    dump_filter_nlmsg, &filter_info);
 	if (ret)
-		return ret;
+		goto out;
 
+out:
+	free(tcinfo.handle_array);
 	return 0;
 }
 
@@ -168,7 +202,7 @@ static int do_show(int argc, char **argv)
 		return -1;
 	}
 
-	dev_array.ifindex_array = NULL;
+	dev_array.devices = NULL;
 	dev_array.used_len = 0;
 	dev_array.array_len = 0;
 	dev_array.filter_idx = filter_idx;
@@ -176,15 +210,15 @@ static int do_show(int argc, char **argv)
 	if (json_output)
 		jsonw_start_array(json_wtr);
 	NET_START_OBJECT;
-	NET_START_ARRAY("xdp", "\n");
+	NET_START_ARRAY("xdp", "%s:\n");
 	ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array);
 	NET_END_ARRAY("\n");
 
 	if (!ret) {
-		NET_START_ARRAY("tc_filters", "\n");
+		NET_START_ARRAY("tc", "%s:\n");
 		for (i = 0; i < dev_array.used_len; i++) {
 			ret = show_dev_tc_bpf(sock, nl_pid,
-					      dev_array.ifindex_array[i]);
+					      &dev_array.devices[i]);
 			if (ret)
 				break;
 		}
@@ -200,7 +234,7 @@ static int do_show(int argc, char **argv)
 		libbpf_strerror(ret, err_buf, sizeof(err_buf));
 		fprintf(stderr, "Error: %s\n", err_buf);
 	}
-	free(dev_array.ifindex_array);
+	free(dev_array.devices);
 	close(sock);
 	return ret;
 }
@@ -214,7 +248,12 @@ static int do_help(int argc, char **argv)
 
 	fprintf(stderr,
 		"Usage: %s %s { show | list } [dev <devname>]\n"
-		"       %s %s help\n",
+		"       %s %s help\n"
+		"Note: Only xdp and tc attachments are supported now.\n"
+		"      For progs attached to cgroups, use \"bpftool cgroup\"\n"
+		"      to dump program attachments. For program types\n"
+		"      sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n"
+		"      consult iproute2.\n",
 		bin_name, argv[-2], bin_name, argv[-2]);
 
 	return 0;
diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c
index e12494fd1d2e..6f5e9cc6836c 100644
--- a/tools/bpf/bpftool/netlink_dumper.c
+++ b/tools/bpf/bpftool/netlink_dumper.c
@@ -12,12 +12,18 @@
 #include "netlink_dumper.h"
 
 static void xdp_dump_prog_id(struct nlattr **tb, int attr,
-			     const char *type)
+			     const char *mode,
+			     bool new_json_object)
 {
 	if (!tb[attr])
 		return;
 
-	NET_DUMP_UINT(type, nla_getattr_u32(tb[attr]))
+	if (new_json_object)
+		NET_START_OBJECT
+	NET_DUMP_STR("mode", " %s", mode);
+	NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[attr]))
+	if (new_json_object)
+		NET_END_OBJECT
 }
 
 static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex,
@@ -37,18 +43,26 @@ static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex,
 		return 0;
 
 	NET_START_OBJECT;
-	NET_DUMP_UINT("ifindex", ifindex);
-
 	if (name)
-		NET_DUMP_STR("devname", name);
-
-	if (tb[IFLA_XDP_PROG_ID])
-		NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[IFLA_XDP_PROG_ID]));
+		NET_DUMP_STR("devname", "%s", name);
+	NET_DUMP_UINT("ifindex", "(%d)", ifindex);
 
 	if (mode == XDP_ATTACHED_MULTI) {
-		xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic_prog_id");
-		xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "drv_prog_id");
-		xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload_prog_id");
+		if (json_output) {
+			jsonw_name(json_wtr, "multi_attachments");
+			jsonw_start_array(json_wtr);
+		}
+		xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic", true);
+		xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "driver", true);
+		xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload", true);
+		if (json_output)
+			jsonw_end_array(json_wtr);
+	} else if (mode == XDP_ATTACHED_DRV) {
+		xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "driver", false);
+	} else if (mode == XDP_ATTACHED_SKB) {
+		xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "generic", false);
+	} else if (mode == XDP_ATTACHED_HW) {
+		xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "offload", false);
 	}
 
 	NET_END_OBJECT_FINAL;
@@ -64,26 +78,9 @@ int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb)
 			       nla_getattr_str(tb[IFLA_IFNAME]));
 }
 
-static char *hexstring_n2a(const unsigned char *str, int len,
-			   char *buf, int blen)
-{
-	char *ptr = buf;
-	int i;
-
-	for (i = 0; i < len; i++) {
-		if (blen < 3)
-			break;
-		sprintf(ptr, "%02x", str[i]);
-		ptr += 2;
-		blen -= 2;
-	}
-	return buf;
-}
-
 static int do_bpf_dump_one_act(struct nlattr *attr)
 {
 	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
-	char buf[256];
 
 	if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0)
 		return -LIBBPF_ERRNO__NLPARSE;
@@ -93,13 +90,11 @@ static int do_bpf_dump_one_act(struct nlattr *attr)
 
 	NET_START_OBJECT_NESTED2;
 	if (tb[TCA_ACT_BPF_NAME])
-		NET_DUMP_STR("name", nla_getattr_str(tb[TCA_ACT_BPF_NAME]));
+		NET_DUMP_STR("name", "%s",
+			     nla_getattr_str(tb[TCA_ACT_BPF_NAME]));
 	if (tb[TCA_ACT_BPF_ID])
-		NET_DUMP_UINT("bpf_id", nla_getattr_u32(tb[TCA_ACT_BPF_ID]));
-	if (tb[TCA_ACT_BPF_TAG])
-		NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_ACT_BPF_TAG]),
-						  nla_len(tb[TCA_ACT_BPF_TAG]),
-						  buf, sizeof(buf)));
+		NET_DUMP_UINT("id", " id %u",
+			      nla_getattr_u32(tb[TCA_ACT_BPF_ID]));
 	NET_END_OBJECT_NESTED;
 	return 0;
 }
@@ -128,13 +123,13 @@ static int do_bpf_act_dump(struct nlattr *attr)
 	if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
-	NET_START_ARRAY("act", "");
+	NET_START_ARRAY("act", " %s [");
 	for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) {
 		ret = do_dump_one_act(tb[act]);
 		if (ret)
 			break;
 	}
-	NET_END_ARRAY(" ");
+	NET_END_ARRAY("] ");
 
 	return ret;
 }
@@ -142,20 +137,15 @@ static int do_bpf_act_dump(struct nlattr *attr)
 static int do_bpf_filter_dump(struct nlattr *attr)
 {
 	struct nlattr *tb[TCA_BPF_MAX + 1];
-	char buf[256];
 	int ret;
 
 	if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
 	if (tb[TCA_BPF_NAME])
-		NET_DUMP_STR("name", nla_getattr_str(tb[TCA_BPF_NAME]));
+		NET_DUMP_STR("name", " %s", nla_getattr_str(tb[TCA_BPF_NAME]));
 	if (tb[TCA_BPF_ID])
-		NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[TCA_BPF_ID]));
-	if (tb[TCA_BPF_TAG])
-		NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_BPF_TAG]),
-						  nla_len(tb[TCA_BPF_TAG]),
-						  buf, sizeof(buf)));
+		NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[TCA_BPF_ID]));
 	if (tb[TCA_BPF_ACT]) {
 		ret = do_bpf_act_dump(tb[TCA_BPF_ACT]);
 		if (ret)
@@ -165,14 +155,17 @@ static int do_bpf_filter_dump(struct nlattr *attr)
 	return 0;
 }
 
-int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind)
+int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind,
+		   const char *devname, int ifindex)
 {
 	int ret = 0;
 
 	if (tb[TCA_OPTIONS] && strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) {
 		NET_START_OBJECT;
-		NET_DUMP_UINT("ifindex", info->tcm_ifindex);
-		NET_DUMP_STR("kind", kind);
+		if (devname[0] != '\0')
+			NET_DUMP_STR("devname", "%s", devname);
+		NET_DUMP_UINT("ifindex", "(%u)", ifindex);
+		NET_DUMP_STR("kind", " %s", kind);
 		ret = do_bpf_filter_dump(tb[TCA_OPTIONS]);
 		NET_END_OBJECT_FINAL;
 	}
diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h
index 552d8851ac06..0788cfbbed0e 100644
--- a/tools/bpf/bpftool/netlink_dumper.h
+++ b/tools/bpf/bpftool/netlink_dumper.h
@@ -50,13 +50,13 @@
 		fprintf(stderr, "\n");			\
 }
 
-#define NET_START_ARRAY(name, newline)			\
+#define NET_START_ARRAY(name, fmt_str)			\
 {							\
 	if (json_output) {				\
 		jsonw_name(json_wtr, name);		\
 		jsonw_start_array(json_wtr);		\
 	} else {					\
-		fprintf(stderr, "%s [%s", name, newline);\
+		fprintf(stderr, fmt_str, name);		\
 	}						\
 }
 
@@ -65,31 +65,23 @@
 	if (json_output)				\
 		jsonw_end_array(json_wtr);		\
 	else						\
-		fprintf(stderr, "]%s", endstr);		\
+		fprintf(stderr, "%s", endstr);		\
 }
 
-#define NET_DUMP_UINT(name, val)			\
+#define NET_DUMP_UINT(name, fmt_str, val)		\
 {							\
 	if (json_output)				\
 		jsonw_uint_field(json_wtr, name, val);	\
 	else						\
-		fprintf(stderr, "%s %d ", name, val);	\
+		fprintf(stderr, fmt_str, val);		\
 }
 
-#define NET_DUMP_LLUINT(name, val)			\
-{							\
-	if (json_output)				\
-		jsonw_lluint_field(json_wtr, name, val);\
-	else						\
-		fprintf(stderr, "%s %lld ", name, val);	\
-}
-
-#define NET_DUMP_STR(name, str)				\
+#define NET_DUMP_STR(name, fmt_str, str)		\
 {							\
 	if (json_output)				\
 		jsonw_string_field(json_wtr, name, str);\
 	else						\
-		fprintf(stderr, "%s %s ", name, str);	\
+		fprintf(stderr, fmt_str, str);		\
 }
 
 #define NET_DUMP_STR_ONLY(str)				\
-- 
Gitee


From 445e3ab88e458b9d3f55725ccbd64b372cf5220e Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:12 +0800
Subject: [PATCH 51/58] bpftool: Fix bpftool net output

ANBZ: #5530

commit 53d6eb08e9f185834231d5c32499b10310cce3aa upstream

Print `bpftool net` output to stdout instead of stderr. Only errors
should be printed to stderr. Regular output should go to stdout and this
is what all other subcommands of bpftool do, including --json and
--pretty formats of `bpftool net` itself.

Fixes: commit f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 tools/bpf/bpftool/netlink_dumper.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h
index 0788cfbbed0e..e3516b586a34 100644
--- a/tools/bpf/bpftool/netlink_dumper.h
+++ b/tools/bpf/bpftool/netlink_dumper.h
@@ -16,7 +16,7 @@
 		jsonw_name(json_wtr, name);		\
 		jsonw_start_object(json_wtr);		\
 	} else {					\
-		fprintf(stderr, "%s {", name);		\
+		fprintf(stdout, "%s {", name);		\
 	}						\
 }
 
@@ -25,7 +25,7 @@
 	if (json_output)				\
 		jsonw_start_object(json_wtr);		\
 	else						\
-		fprintf(stderr, "{");			\
+		fprintf(stdout, "{");			\
 }
 
 #define NET_END_OBJECT_NESTED				\
@@ -33,7 +33,7 @@
 	if (json_output)				\
 		jsonw_end_object(json_wtr);		\
 	else						\
-		fprintf(stderr, "}");			\
+		fprintf(stdout, "}");			\
 }
 
 #define NET_END_OBJECT					\
@@ -47,7 +47,7 @@
 	if (json_output)				\
 		jsonw_end_object(json_wtr);		\
 	else						\
-		fprintf(stderr, "\n");			\
+		fprintf(stdout, "\n");			\
 }
 
 #define NET_START_ARRAY(name, fmt_str)			\
@@ -56,7 +56,7 @@
 		jsonw_name(json_wtr, name);		\
 		jsonw_start_array(json_wtr);		\
 	} else {					\
-		fprintf(stderr, fmt_str, name);		\
+		fprintf(stdout, fmt_str, name);		\
 	}						\
 }
 
@@ -65,7 +65,7 @@
 	if (json_output)				\
 		jsonw_end_array(json_wtr);		\
 	else						\
-		fprintf(stderr, "%s", endstr);		\
+		fprintf(stdout, "%s", endstr);		\
 }
 
 #define NET_DUMP_UINT(name, fmt_str, val)		\
@@ -73,7 +73,7 @@
 	if (json_output)				\
 		jsonw_uint_field(json_wtr, name, val);	\
 	else						\
-		fprintf(stderr, fmt_str, val);		\
+		fprintf(stdout, fmt_str, val);		\
 }
 
 #define NET_DUMP_STR(name, fmt_str, str)		\
@@ -81,7 +81,7 @@
 	if (json_output)				\
 		jsonw_string_field(json_wtr, name, str);\
 	else						\
-		fprintf(stderr, fmt_str, str);		\
+		fprintf(stdout, fmt_str, str);		\
 }
 
 #define NET_DUMP_STR_ONLY(str)				\
@@ -89,7 +89,7 @@
 	if (json_output)				\
 		jsonw_string(json_wtr, str);		\
 	else						\
-		fprintf(stderr, "%s ", str);		\
+		fprintf(stdout, "%s ", str);		\
 }
 
 #endif
-- 
Gitee


From f23b5d8363a27b4896bee1b11ec7712107b34306 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:13 +0800
Subject: [PATCH 52/58] tools/bpftool: copy a few net uapi headers to tools 
 directory

ANBZ: #5530

commit 49a249c387268882e8393dd1fafc51e3b21cb7a2 upstream

Commit f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
added certain networking support to bpftool.
The implementation relies on a relatively recent uapi header file
linux/tc_act/tc_bpf.h on the host which contains the marco
definition of TCA_ACT_BPF_ID.

Unfortunately, this is not the case for all distributions.
See the email message below where rhel-7.2 does not have
an up-to-date linux/tc_act/tc_bpf.h.
  https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1799211.html
Further investigation found that linux/pkt_cls.h is also needed for macro
TCA_BPF_TAG.

This patch fixed the issue by copying linux/tc_act/tc_bpf.h
and linux/pkt_cls.h from kernel include/uapi directory to
tools/include/uapi directory so building the bpftool does not depend
on host system for these files.

Fixes: f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
Reported-by: kernel test robot <rong.a.chen@intel.com>
Cc: Li Zhijian <zhijianx.li@intel.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 tools/include/uapi/linux/pkt_cls.h       | 612 +++++++++++++++++++++++
 tools/include/uapi/linux/tc_act/tc_bpf.h |  37 ++
 2 files changed, 649 insertions(+)
 create mode 100644 tools/include/uapi/linux/pkt_cls.h
 create mode 100644 tools/include/uapi/linux/tc_act/tc_bpf.h

diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h
new file mode 100644
index 000000000000..682cda46aa1b
--- /dev/null
+++ b/tools/include/uapi/linux/pkt_cls.h
@@ -0,0 +1,612 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __LINUX_PKT_CLS_H
+#define __LINUX_PKT_CLS_H
+
+#include <linux/types.h>
+#include <linux/pkt_sched.h>
+
+#define TC_COOKIE_MAX_SIZE 16
+
+/* Action attributes */
+enum {
+	TCA_ACT_UNSPEC,
+	TCA_ACT_KIND,
+	TCA_ACT_OPTIONS,
+	TCA_ACT_INDEX,
+	TCA_ACT_STATS,
+	TCA_ACT_PAD,
+	TCA_ACT_COOKIE,
+	__TCA_ACT_MAX
+};
+
+#define TCA_ACT_MAX __TCA_ACT_MAX
+#define TCA_OLD_COMPAT (TCA_ACT_MAX+1)
+#define TCA_ACT_MAX_PRIO 32
+#define TCA_ACT_BIND	1
+#define TCA_ACT_NOBIND	0
+#define TCA_ACT_UNBIND	1
+#define TCA_ACT_NOUNBIND	0
+#define TCA_ACT_REPLACE		1
+#define TCA_ACT_NOREPLACE	0
+
+#define TC_ACT_UNSPEC	(-1)
+#define TC_ACT_OK		0
+#define TC_ACT_RECLASSIFY	1
+#define TC_ACT_SHOT		2
+#define TC_ACT_PIPE		3
+#define TC_ACT_STOLEN		4
+#define TC_ACT_QUEUED		5
+#define TC_ACT_REPEAT		6
+#define TC_ACT_REDIRECT		7
+#define TC_ACT_TRAP		8 /* For hw path, this means "trap to cpu"
+				   * and don't further process the frame
+				   * in hardware. For sw path, this is
+				   * equivalent of TC_ACT_STOLEN - drop
+				   * the skb and act like everything
+				   * is alright.
+				   */
+#define TC_ACT_VALUE_MAX	TC_ACT_TRAP
+
+/* There is a special kind of actions called "extended actions",
+ * which need a value parameter. These have a local opcode located in
+ * the highest nibble, starting from 1. The rest of the bits
+ * are used to carry the value. These two parts together make
+ * a combined opcode.
+ */
+#define __TC_ACT_EXT_SHIFT 28
+#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT)
+#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1)
+#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK))
+#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode)
+
+#define TC_ACT_JUMP __TC_ACT_EXT(1)
+#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2)
+#define TC_ACT_EXT_OPCODE_MAX	TC_ACT_GOTO_CHAIN
+
+/* Action type identifiers*/
+enum {
+	TCA_ID_UNSPEC=0,
+	TCA_ID_POLICE=1,
+	/* other actions go here */
+	__TCA_ID_MAX=255
+};
+
+#define TCA_ID_MAX __TCA_ID_MAX
+
+struct tc_police {
+	__u32			index;
+	int			action;
+#define TC_POLICE_UNSPEC	TC_ACT_UNSPEC
+#define TC_POLICE_OK		TC_ACT_OK
+#define TC_POLICE_RECLASSIFY	TC_ACT_RECLASSIFY
+#define TC_POLICE_SHOT		TC_ACT_SHOT
+#define TC_POLICE_PIPE		TC_ACT_PIPE
+
+	__u32			limit;
+	__u32			burst;
+	__u32			mtu;
+	struct tc_ratespec	rate;
+	struct tc_ratespec	peakrate;
+	int			refcnt;
+	int			bindcnt;
+	__u32			capab;
+};
+
+struct tcf_t {
+	__u64   install;
+	__u64   lastuse;
+	__u64   expires;
+	__u64   firstuse;
+};
+
+struct tc_cnt {
+	int                   refcnt;
+	int                   bindcnt;
+};
+
+#define tc_gen \
+	__u32                 index; \
+	__u32                 capab; \
+	int                   action; \
+	int                   refcnt; \
+	int                   bindcnt
+
+enum {
+	TCA_POLICE_UNSPEC,
+	TCA_POLICE_TBF,
+	TCA_POLICE_RATE,
+	TCA_POLICE_PEAKRATE,
+	TCA_POLICE_AVRATE,
+	TCA_POLICE_RESULT,
+	TCA_POLICE_TM,
+	TCA_POLICE_PAD,
+	__TCA_POLICE_MAX
+#define TCA_POLICE_RESULT TCA_POLICE_RESULT
+};
+
+#define TCA_POLICE_MAX (__TCA_POLICE_MAX - 1)
+
+/* tca flags definitions */
+#define TCA_CLS_FLAGS_SKIP_HW	(1 << 0) /* don't offload filter to HW */
+#define TCA_CLS_FLAGS_SKIP_SW	(1 << 1) /* don't use filter in SW */
+#define TCA_CLS_FLAGS_IN_HW	(1 << 2) /* filter is offloaded to HW */
+#define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */
+#define TCA_CLS_FLAGS_VERBOSE	(1 << 4) /* verbose logging */
+
+/* U32 filters */
+
+#define TC_U32_HTID(h) ((h)&0xFFF00000)
+#define TC_U32_USERHTID(h) (TC_U32_HTID(h)>>20)
+#define TC_U32_HASH(h) (((h)>>12)&0xFF)
+#define TC_U32_NODE(h) ((h)&0xFFF)
+#define TC_U32_KEY(h) ((h)&0xFFFFF)
+#define TC_U32_UNSPEC	0
+#define TC_U32_ROOT	(0xFFF00000)
+
+enum {
+	TCA_U32_UNSPEC,
+	TCA_U32_CLASSID,
+	TCA_U32_HASH,
+	TCA_U32_LINK,
+	TCA_U32_DIVISOR,
+	TCA_U32_SEL,
+	TCA_U32_POLICE,
+	TCA_U32_ACT,
+	TCA_U32_INDEV,
+	TCA_U32_PCNT,
+	TCA_U32_MARK,
+	TCA_U32_FLAGS,
+	TCA_U32_PAD,
+	__TCA_U32_MAX
+};
+
+#define TCA_U32_MAX (__TCA_U32_MAX - 1)
+
+struct tc_u32_key {
+	__be32		mask;
+	__be32		val;
+	int		off;
+	int		offmask;
+};
+
+struct tc_u32_sel {
+	unsigned char		flags;
+	unsigned char		offshift;
+	unsigned char		nkeys;
+
+	__be16			offmask;
+	__u16			off;
+	short			offoff;
+
+	short			hoff;
+	__be32			hmask;
+	struct tc_u32_key	keys[0];
+};
+
+struct tc_u32_mark {
+	__u32		val;
+	__u32		mask;
+	__u32		success;
+};
+
+struct tc_u32_pcnt {
+	__u64 rcnt;
+	__u64 rhit;
+	__u64 kcnts[0];
+};
+
+/* Flags */
+
+#define TC_U32_TERMINAL		1
+#define TC_U32_OFFSET		2
+#define TC_U32_VAROFFSET	4
+#define TC_U32_EAT		8
+
+#define TC_U32_MAXDEPTH 8
+
+
+/* RSVP filter */
+
+enum {
+	TCA_RSVP_UNSPEC,
+	TCA_RSVP_CLASSID,
+	TCA_RSVP_DST,
+	TCA_RSVP_SRC,
+	TCA_RSVP_PINFO,
+	TCA_RSVP_POLICE,
+	TCA_RSVP_ACT,
+	__TCA_RSVP_MAX
+};
+
+#define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 )
+
+struct tc_rsvp_gpi {
+	__u32	key;
+	__u32	mask;
+	int	offset;
+};
+
+struct tc_rsvp_pinfo {
+	struct tc_rsvp_gpi dpi;
+	struct tc_rsvp_gpi spi;
+	__u8	protocol;
+	__u8	tunnelid;
+	__u8	tunnelhdr;
+	__u8	pad;
+};
+
+/* ROUTE filter */
+
+enum {
+	TCA_ROUTE4_UNSPEC,
+	TCA_ROUTE4_CLASSID,
+	TCA_ROUTE4_TO,
+	TCA_ROUTE4_FROM,
+	TCA_ROUTE4_IIF,
+	TCA_ROUTE4_POLICE,
+	TCA_ROUTE4_ACT,
+	__TCA_ROUTE4_MAX
+};
+
+#define TCA_ROUTE4_MAX (__TCA_ROUTE4_MAX - 1)
+
+
+/* FW filter */
+
+enum {
+	TCA_FW_UNSPEC,
+	TCA_FW_CLASSID,
+	TCA_FW_POLICE,
+	TCA_FW_INDEV, /*  used by CONFIG_NET_CLS_IND */
+	TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */
+	TCA_FW_MASK,
+	__TCA_FW_MAX
+};
+
+#define TCA_FW_MAX (__TCA_FW_MAX - 1)
+
+/* TC index filter */
+
+enum {
+	TCA_TCINDEX_UNSPEC,
+	TCA_TCINDEX_HASH,
+	TCA_TCINDEX_MASK,
+	TCA_TCINDEX_SHIFT,
+	TCA_TCINDEX_FALL_THROUGH,
+	TCA_TCINDEX_CLASSID,
+	TCA_TCINDEX_POLICE,
+	TCA_TCINDEX_ACT,
+	__TCA_TCINDEX_MAX
+};
+
+#define TCA_TCINDEX_MAX     (__TCA_TCINDEX_MAX - 1)
+
+/* Flow filter */
+
+enum {
+	FLOW_KEY_SRC,
+	FLOW_KEY_DST,
+	FLOW_KEY_PROTO,
+	FLOW_KEY_PROTO_SRC,
+	FLOW_KEY_PROTO_DST,
+	FLOW_KEY_IIF,
+	FLOW_KEY_PRIORITY,
+	FLOW_KEY_MARK,
+	FLOW_KEY_NFCT,
+	FLOW_KEY_NFCT_SRC,
+	FLOW_KEY_NFCT_DST,
+	FLOW_KEY_NFCT_PROTO_SRC,
+	FLOW_KEY_NFCT_PROTO_DST,
+	FLOW_KEY_RTCLASSID,
+	FLOW_KEY_SKUID,
+	FLOW_KEY_SKGID,
+	FLOW_KEY_VLAN_TAG,
+	FLOW_KEY_RXHASH,
+	__FLOW_KEY_MAX,
+};
+
+#define FLOW_KEY_MAX	(__FLOW_KEY_MAX - 1)
+
+enum {
+	FLOW_MODE_MAP,
+	FLOW_MODE_HASH,
+};
+
+enum {
+	TCA_FLOW_UNSPEC,
+	TCA_FLOW_KEYS,
+	TCA_FLOW_MODE,
+	TCA_FLOW_BASECLASS,
+	TCA_FLOW_RSHIFT,
+	TCA_FLOW_ADDEND,
+	TCA_FLOW_MASK,
+	TCA_FLOW_XOR,
+	TCA_FLOW_DIVISOR,
+	TCA_FLOW_ACT,
+	TCA_FLOW_POLICE,
+	TCA_FLOW_EMATCHES,
+	TCA_FLOW_PERTURB,
+	__TCA_FLOW_MAX
+};
+
+#define TCA_FLOW_MAX	(__TCA_FLOW_MAX - 1)
+
+/* Basic filter */
+
+enum {
+	TCA_BASIC_UNSPEC,
+	TCA_BASIC_CLASSID,
+	TCA_BASIC_EMATCHES,
+	TCA_BASIC_ACT,
+	TCA_BASIC_POLICE,
+	__TCA_BASIC_MAX
+};
+
+#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1)
+
+
+/* Cgroup classifier */
+
+enum {
+	TCA_CGROUP_UNSPEC,
+	TCA_CGROUP_ACT,
+	TCA_CGROUP_POLICE,
+	TCA_CGROUP_EMATCHES,
+	__TCA_CGROUP_MAX,
+};
+
+#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1)
+
+/* BPF classifier */
+
+#define TCA_BPF_FLAG_ACT_DIRECT		(1 << 0)
+
+enum {
+	TCA_BPF_UNSPEC,
+	TCA_BPF_ACT,
+	TCA_BPF_POLICE,
+	TCA_BPF_CLASSID,
+	TCA_BPF_OPS_LEN,
+	TCA_BPF_OPS,
+	TCA_BPF_FD,
+	TCA_BPF_NAME,
+	TCA_BPF_FLAGS,
+	TCA_BPF_FLAGS_GEN,
+	TCA_BPF_TAG,
+	TCA_BPF_ID,
+	__TCA_BPF_MAX,
+};
+
+#define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
+
+/* Flower classifier */
+
+enum {
+	TCA_FLOWER_UNSPEC,
+	TCA_FLOWER_CLASSID,
+	TCA_FLOWER_INDEV,
+	TCA_FLOWER_ACT,
+	TCA_FLOWER_KEY_ETH_DST,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_DST_MASK,	/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC_MASK,	/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_TYPE,	/* be16 */
+	TCA_FLOWER_KEY_IP_PROTO,	/* u8 */
+	TCA_FLOWER_KEY_IPV4_SRC,	/* be32 */
+	TCA_FLOWER_KEY_IPV4_SRC_MASK,	/* be32 */
+	TCA_FLOWER_KEY_IPV4_DST,	/* be32 */
+	TCA_FLOWER_KEY_IPV4_DST_MASK,	/* be32 */
+	TCA_FLOWER_KEY_IPV6_SRC,	/* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_SRC_MASK,	/* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST,	/* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST_MASK,	/* struct in6_addr */
+	TCA_FLOWER_KEY_TCP_SRC,		/* be16 */
+	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
+	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
+	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
+
+	TCA_FLOWER_FLAGS,
+	TCA_FLOWER_KEY_VLAN_ID,		/* be16 */
+	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8   */
+	TCA_FLOWER_KEY_VLAN_ETH_TYPE,	/* be16 */
+
+	TCA_FLOWER_KEY_ENC_KEY_ID,	/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_SRC,	/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_DST,	/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV6_SRC,	/* struct in6_addr */
+	TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */
+	TCA_FLOWER_KEY_ENC_IPV6_DST,	/* struct in6_addr */
+	TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */
+
+	TCA_FLOWER_KEY_TCP_SRC_MASK,	/* be16 */
+	TCA_FLOWER_KEY_TCP_DST_MASK,	/* be16 */
+	TCA_FLOWER_KEY_UDP_SRC_MASK,	/* be16 */
+	TCA_FLOWER_KEY_UDP_DST_MASK,	/* be16 */
+	TCA_FLOWER_KEY_SCTP_SRC_MASK,	/* be16 */
+	TCA_FLOWER_KEY_SCTP_DST_MASK,	/* be16 */
+
+	TCA_FLOWER_KEY_SCTP_SRC,	/* be16 */
+	TCA_FLOWER_KEY_SCTP_DST,	/* be16 */
+
+	TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,	/* be16 */
+	TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,	/* be16 */
+	TCA_FLOWER_KEY_ENC_UDP_DST_PORT,	/* be16 */
+	TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,	/* be16 */
+
+	TCA_FLOWER_KEY_FLAGS,		/* be32 */
+	TCA_FLOWER_KEY_FLAGS_MASK,	/* be32 */
+
+	TCA_FLOWER_KEY_ICMPV4_CODE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */
+	TCA_FLOWER_KEY_ICMPV4_TYPE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_CODE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_TYPE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */
+
+	TCA_FLOWER_KEY_ARP_SIP,		/* be32 */
+	TCA_FLOWER_KEY_ARP_SIP_MASK,	/* be32 */
+	TCA_FLOWER_KEY_ARP_TIP,		/* be32 */
+	TCA_FLOWER_KEY_ARP_TIP_MASK,	/* be32 */
+	TCA_FLOWER_KEY_ARP_OP,		/* u8 */
+	TCA_FLOWER_KEY_ARP_OP_MASK,	/* u8 */
+	TCA_FLOWER_KEY_ARP_SHA,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ARP_SHA_MASK,	/* ETH_ALEN */
+	TCA_FLOWER_KEY_ARP_THA,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ARP_THA_MASK,	/* ETH_ALEN */
+
+	TCA_FLOWER_KEY_MPLS_TTL,	/* u8 - 8 bits */
+	TCA_FLOWER_KEY_MPLS_BOS,	/* u8 - 1 bit */
+	TCA_FLOWER_KEY_MPLS_TC,		/* u8 - 3 bits */
+	TCA_FLOWER_KEY_MPLS_LABEL,	/* be32 - 20 bits */
+
+	TCA_FLOWER_KEY_TCP_FLAGS,	/* be16 */
+	TCA_FLOWER_KEY_TCP_FLAGS_MASK,	/* be16 */
+
+	TCA_FLOWER_KEY_IP_TOS,		/* u8 */
+	TCA_FLOWER_KEY_IP_TOS_MASK,	/* u8 */
+	TCA_FLOWER_KEY_IP_TTL,		/* u8 */
+	TCA_FLOWER_KEY_IP_TTL_MASK,	/* u8 */
+
+	TCA_FLOWER_KEY_CVLAN_ID,	/* be16 */
+	TCA_FLOWER_KEY_CVLAN_PRIO,	/* u8   */
+	TCA_FLOWER_KEY_CVLAN_ETH_TYPE,	/* be16 */
+
+	TCA_FLOWER_KEY_ENC_IP_TOS,	/* u8 */
+	TCA_FLOWER_KEY_ENC_IP_TOS_MASK,	/* u8 */
+	TCA_FLOWER_KEY_ENC_IP_TTL,	/* u8 */
+	TCA_FLOWER_KEY_ENC_IP_TTL_MASK,	/* u8 */
+
+	TCA_FLOWER_KEY_ENC_OPTS,
+	TCA_FLOWER_KEY_ENC_OPTS_MASK,
+
+	TCA_FLOWER_IN_HW_COUNT,
+
+	__TCA_FLOWER_MAX,
+};
+
+#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
+
+enum {
+	TCA_FLOWER_KEY_ENC_OPTS_UNSPEC,
+	TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested
+					 * TCA_FLOWER_KEY_ENC_OPT_GENEVE_
+					 * attributes
+					 */
+	__TCA_FLOWER_KEY_ENC_OPTS_MAX,
+};
+
+#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1)
+
+enum {
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC,
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS,            /* u16 */
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE,             /* u8 */
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA,             /* 4 to 128 bytes */
+
+	__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
+};
+
+#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \
+		(__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1)
+
+enum {
+	TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
+	TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
+};
+
+/* Match-all classifier */
+
+enum {
+	TCA_MATCHALL_UNSPEC,
+	TCA_MATCHALL_CLASSID,
+	TCA_MATCHALL_ACT,
+	TCA_MATCHALL_FLAGS,
+	__TCA_MATCHALL_MAX,
+};
+
+#define TCA_MATCHALL_MAX (__TCA_MATCHALL_MAX - 1)
+
+/* Extended Matches */
+
+struct tcf_ematch_tree_hdr {
+	__u16		nmatches;
+	__u16		progid;
+};
+
+enum {
+	TCA_EMATCH_TREE_UNSPEC,
+	TCA_EMATCH_TREE_HDR,
+	TCA_EMATCH_TREE_LIST,
+	__TCA_EMATCH_TREE_MAX
+};
+#define TCA_EMATCH_TREE_MAX (__TCA_EMATCH_TREE_MAX - 1)
+
+struct tcf_ematch_hdr {
+	__u16		matchid;
+	__u16		kind;
+	__u16		flags;
+	__u16		pad; /* currently unused */
+};
+
+/*  0                   1
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+ * +-----------------------+-+-+---+
+ * |         Unused        |S|I| R |
+ * +-----------------------+-+-+---+
+ *
+ * R(2) ::= relation to next ematch
+ *          where: 0 0 END (last ematch)
+ *                 0 1 AND
+ *                 1 0 OR
+ *                 1 1 Unused (invalid)
+ * I(1) ::= invert result
+ * S(1) ::= simple payload
+ */
+#define TCF_EM_REL_END	0
+#define TCF_EM_REL_AND	(1<<0)
+#define TCF_EM_REL_OR	(1<<1)
+#define TCF_EM_INVERT	(1<<2)
+#define TCF_EM_SIMPLE	(1<<3)
+
+#define TCF_EM_REL_MASK	3
+#define TCF_EM_REL_VALID(v) (((v) & TCF_EM_REL_MASK) != TCF_EM_REL_MASK)
+
+enum {
+	TCF_LAYER_LINK,
+	TCF_LAYER_NETWORK,
+	TCF_LAYER_TRANSPORT,
+	__TCF_LAYER_MAX
+};
+#define TCF_LAYER_MAX (__TCF_LAYER_MAX - 1)
+
+/* Ematch type assignments
+ *   1..32767		Reserved for ematches inside kernel tree
+ *   32768..65535	Free to use, not reliable
+ */
+#define	TCF_EM_CONTAINER	0
+#define	TCF_EM_CMP		1
+#define	TCF_EM_NBYTE		2
+#define	TCF_EM_U32		3
+#define	TCF_EM_META		4
+#define	TCF_EM_TEXT		5
+#define	TCF_EM_VLAN		6
+#define	TCF_EM_CANID		7
+#define	TCF_EM_IPSET		8
+#define	TCF_EM_IPT		9
+#define	TCF_EM_MAX		9
+
+enum {
+	TCF_EM_PROG_TC
+};
+
+enum {
+	TCF_EM_OPND_EQ,
+	TCF_EM_OPND_GT,
+	TCF_EM_OPND_LT
+};
+
+#endif
diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h b/tools/include/uapi/linux/tc_act/tc_bpf.h
new file mode 100644
index 000000000000..6e89a5df49a4
--- /dev/null
+++ b/tools/include/uapi/linux/tc_act/tc_bpf.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_BPF_H
+#define __LINUX_TC_BPF_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_BPF 13
+
+struct tc_act_bpf {
+	tc_gen;
+};
+
+enum {
+	TCA_ACT_BPF_UNSPEC,
+	TCA_ACT_BPF_TM,
+	TCA_ACT_BPF_PARMS,
+	TCA_ACT_BPF_OPS_LEN,
+	TCA_ACT_BPF_OPS,
+	TCA_ACT_BPF_FD,
+	TCA_ACT_BPF_NAME,
+	TCA_ACT_BPF_PAD,
+	TCA_ACT_BPF_TAG,
+	TCA_ACT_BPF_ID,
+	__TCA_ACT_BPF_MAX,
+};
+#define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1)
+
+#endif
-- 
Gitee


From 7a186b3d28f9f307cafe5ccd1c089e83eefabe58 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:14 +0800
Subject: [PATCH 53/58] bpf: pull in pkt_sched.h header for tooling to fix 
 bpftool build

ANBZ: #5530

commit ad6dd7a9c47ba587c0aba57f73737764cd31136f upstream

Dan reported that bpftool does not compile for him:

  $ make tools/bpf
    DESCEND  bpf

  Auto-detecting system features:
  ..                        libbfd: [ on  ]
  ..        disassembler-four-args: [ OFF ]

    DESCEND  bpftool

  Auto-detecting system features:
  ..                        libbfd: [ on  ]
  ..        disassembler-four-args: [ OFF ]

    CC       /opt/linux.git/tools/bpf/bpftool/net.o
  In file included from /opt/linux.git/tools/include/uapi/linux/pkt_cls.h:6:0,
                 from /opt/linux.git/tools/include/uapi/linux/tc_act/tc_bpf.h:14,
                 from net.c:13:
  net.c: In function 'show_dev_tc_bpf':
  net.c:164:21: error: 'TC_H_CLSACT' undeclared (first use in this function)
    handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS);
  [...]

Fix it by importing pkt_sched.h header copy into tooling
infrastructure.

Fixes: 49a249c38726 ("tools/bpftool: copy a few net uapi headers to tools directory")
Fixes: f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
Reported-by: Dan Gilson <dan_gilson@yahoo.com>
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=202315
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 tools/include/uapi/linux/pkt_sched.h | 1163 ++++++++++++++++++++++++++
 1 file changed, 1163 insertions(+)
 create mode 100644 tools/include/uapi/linux/pkt_sched.h

diff --git a/tools/include/uapi/linux/pkt_sched.h b/tools/include/uapi/linux/pkt_sched.h
new file mode 100644
index 000000000000..25060205bf3f
--- /dev/null
+++ b/tools/include/uapi/linux/pkt_sched.h
@@ -0,0 +1,1163 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __LINUX_PKT_SCHED_H
+#define __LINUX_PKT_SCHED_H
+
+#include <linux/types.h>
+
+/* Logical priority bands not depending on specific packet scheduler.
+   Every scheduler will map them to real traffic classes, if it has
+   no more precise mechanism to classify packets.
+
+   These numbers have no special meaning, though their coincidence
+   with obsolete IPv6 values is not occasional :-). New IPv6 drafts
+   preferred full anarchy inspired by diffserv group.
+
+   Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy
+   class, actually, as rule it will be handled with more care than
+   filler or even bulk.
+ */
+
+#define TC_PRIO_BESTEFFORT		0
+#define TC_PRIO_FILLER			1
+#define TC_PRIO_BULK			2
+#define TC_PRIO_INTERACTIVE_BULK	4
+#define TC_PRIO_INTERACTIVE		6
+#define TC_PRIO_CONTROL			7
+
+#define TC_PRIO_MAX			15
+
+/* Generic queue statistics, available for all the elements.
+   Particular schedulers may have also their private records.
+ */
+
+struct tc_stats {
+	__u64	bytes;			/* Number of enqueued bytes */
+	__u32	packets;		/* Number of enqueued packets	*/
+	__u32	drops;			/* Packets dropped because of lack of resources */
+	__u32	overlimits;		/* Number of throttle events when this
+					 * flow goes out of allocated bandwidth */
+	__u32	bps;			/* Current flow byte rate */
+	__u32	pps;			/* Current flow packet rate */
+	__u32	qlen;
+	__u32	backlog;
+};
+
+struct tc_estimator {
+	signed char	interval;
+	unsigned char	ewma_log;
+};
+
+/* "Handles"
+   ---------
+
+    All the traffic control objects have 32bit identifiers, or "handles".
+
+    They can be considered as opaque numbers from user API viewpoint,
+    but actually they always consist of two fields: major and
+    minor numbers, which are interpreted by kernel specially,
+    that may be used by applications, though not recommended.
+
+    F.e. qdisc handles always have minor number equal to zero,
+    classes (or flows) have major equal to parent qdisc major, and
+    minor uniquely identifying class inside qdisc.
+
+    Macros to manipulate handles:
+ */
+
+#define TC_H_MAJ_MASK (0xFFFF0000U)
+#define TC_H_MIN_MASK (0x0000FFFFU)
+#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK)
+#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK)
+#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK))
+
+#define TC_H_UNSPEC	(0U)
+#define TC_H_ROOT	(0xFFFFFFFFU)
+#define TC_H_INGRESS    (0xFFFFFFF1U)
+#define TC_H_CLSACT	TC_H_INGRESS
+
+#define TC_H_MIN_PRIORITY	0xFFE0U
+#define TC_H_MIN_INGRESS	0xFFF2U
+#define TC_H_MIN_EGRESS		0xFFF3U
+
+/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */
+enum tc_link_layer {
+	TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */
+	TC_LINKLAYER_ETHERNET,
+	TC_LINKLAYER_ATM,
+};
+#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */
+
+struct tc_ratespec {
+	unsigned char	cell_log;
+	__u8		linklayer; /* lower 4 bits */
+	unsigned short	overhead;
+	short		cell_align;
+	unsigned short	mpu;
+	__u32		rate;
+};
+
+#define TC_RTAB_SIZE	1024
+
+struct tc_sizespec {
+	unsigned char	cell_log;
+	unsigned char	size_log;
+	short		cell_align;
+	int		overhead;
+	unsigned int	linklayer;
+	unsigned int	mpu;
+	unsigned int	mtu;
+	unsigned int	tsize;
+};
+
+enum {
+	TCA_STAB_UNSPEC,
+	TCA_STAB_BASE,
+	TCA_STAB_DATA,
+	__TCA_STAB_MAX
+};
+
+#define TCA_STAB_MAX (__TCA_STAB_MAX - 1)
+
+/* FIFO section */
+
+struct tc_fifo_qopt {
+	__u32	limit;	/* Queue length: bytes for bfifo, packets for pfifo */
+};
+
+/* SKBPRIO section */
+
+/*
+ * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1).
+ * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able
+ * to map one to one the DS field of IPV4 and IPV6 headers.
+ * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY.
+ */
+
+#define SKBPRIO_MAX_PRIORITY 64
+
+struct tc_skbprio_qopt {
+	__u32	limit;		/* Queue length in packets. */
+};
+
+/* PRIO section */
+
+#define TCQ_PRIO_BANDS	16
+#define TCQ_MIN_PRIO_BANDS 2
+
+struct tc_prio_qopt {
+	int	bands;			/* Number of bands */
+	__u8	priomap[TC_PRIO_MAX+1];	/* Map: logical priority -> PRIO band */
+};
+
+/* MULTIQ section */
+
+struct tc_multiq_qopt {
+	__u16	bands;			/* Number of bands */
+	__u16	max_bands;		/* Maximum number of queues */
+};
+
+/* PLUG section */
+
+#define TCQ_PLUG_BUFFER                0
+#define TCQ_PLUG_RELEASE_ONE           1
+#define TCQ_PLUG_RELEASE_INDEFINITE    2
+#define TCQ_PLUG_LIMIT                 3
+
+struct tc_plug_qopt {
+	/* TCQ_PLUG_BUFFER: Inset a plug into the queue and
+	 *  buffer any incoming packets
+	 * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
+	 *   to beginning of the next plug.
+	 * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
+	 *   Stop buffering packets until the next TCQ_PLUG_BUFFER
+	 *   command is received (just act as a pass-thru queue).
+	 * TCQ_PLUG_LIMIT: Increase/decrease queue size
+	 */
+	int             action;
+	__u32           limit;
+};
+
+/* TBF section */
+
+struct tc_tbf_qopt {
+	struct tc_ratespec rate;
+	struct tc_ratespec peakrate;
+	__u32		limit;
+	__u32		buffer;
+	__u32		mtu;
+};
+
+enum {
+	TCA_TBF_UNSPEC,
+	TCA_TBF_PARMS,
+	TCA_TBF_RTAB,
+	TCA_TBF_PTAB,
+	TCA_TBF_RATE64,
+	TCA_TBF_PRATE64,
+	TCA_TBF_BURST,
+	TCA_TBF_PBURST,
+	TCA_TBF_PAD,
+	__TCA_TBF_MAX,
+};
+
+#define TCA_TBF_MAX (__TCA_TBF_MAX - 1)
+
+
+/* TEQL section */
+
+/* TEQL does not require any parameters */
+
+/* SFQ section */
+
+struct tc_sfq_qopt {
+	unsigned	quantum;	/* Bytes per round allocated to flow */
+	int		perturb_period;	/* Period of hash perturbation */
+	__u32		limit;		/* Maximal packets in queue */
+	unsigned	divisor;	/* Hash divisor  */
+	unsigned	flows;		/* Maximal number of flows  */
+};
+
+struct tc_sfqred_stats {
+	__u32           prob_drop;      /* Early drops, below max threshold */
+	__u32           forced_drop;	/* Early drops, after max threshold */
+	__u32           prob_mark;      /* Marked packets, below max threshold */
+	__u32           forced_mark;    /* Marked packets, after max threshold */
+	__u32           prob_mark_head; /* Marked packets, below max threshold */
+	__u32           forced_mark_head;/* Marked packets, after max threshold */
+};
+
+struct tc_sfq_qopt_v1 {
+	struct tc_sfq_qopt v0;
+	unsigned int	depth;		/* max number of packets per flow */
+	unsigned int	headdrop;
+/* SFQRED parameters */
+	__u32		limit;		/* HARD maximal flow queue length (bytes) */
+	__u32		qth_min;	/* Min average length threshold (bytes) */
+	__u32		qth_max;	/* Max average length threshold (bytes) */
+	unsigned char   Wlog;		/* log(W)		*/
+	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/
+	unsigned char   Scell_log;	/* cell size for idle damping */
+	unsigned char	flags;
+	__u32		max_P;		/* probability, high resolution */
+/* SFQRED stats */
+	struct tc_sfqred_stats stats;
+};
+
+
+struct tc_sfq_xstats {
+	__s32		allot;
+};
+
+/* RED section */
+
+enum {
+	TCA_RED_UNSPEC,
+	TCA_RED_PARMS,
+	TCA_RED_STAB,
+	TCA_RED_MAX_P,
+	__TCA_RED_MAX,
+};
+
+#define TCA_RED_MAX (__TCA_RED_MAX - 1)
+
+struct tc_red_qopt {
+	__u32		limit;		/* HARD maximal queue length (bytes)	*/
+	__u32		qth_min;	/* Min average length threshold (bytes) */
+	__u32		qth_max;	/* Max average length threshold (bytes) */
+	unsigned char   Wlog;		/* log(W)		*/
+	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/
+	unsigned char   Scell_log;	/* cell size for idle damping */
+	unsigned char	flags;
+#define TC_RED_ECN		1
+#define TC_RED_HARDDROP		2
+#define TC_RED_ADAPTATIVE	4
+};
+
+struct tc_red_xstats {
+	__u32           early;          /* Early drops */
+	__u32           pdrop;          /* Drops due to queue limits */
+	__u32           other;          /* Drops due to drop() calls */
+	__u32           marked;         /* Marked packets */
+};
+
+/* GRED section */
+
+#define MAX_DPs 16
+
+enum {
+       TCA_GRED_UNSPEC,
+       TCA_GRED_PARMS,
+       TCA_GRED_STAB,
+       TCA_GRED_DPS,
+       TCA_GRED_MAX_P,
+       TCA_GRED_LIMIT,
+       TCA_GRED_VQ_LIST,	/* nested TCA_GRED_VQ_ENTRY */
+       __TCA_GRED_MAX,
+};
+
+#define TCA_GRED_MAX (__TCA_GRED_MAX - 1)
+
+enum {
+	TCA_GRED_VQ_ENTRY_UNSPEC,
+	TCA_GRED_VQ_ENTRY,	/* nested TCA_GRED_VQ_* */
+	__TCA_GRED_VQ_ENTRY_MAX,
+};
+#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1)
+
+enum {
+	TCA_GRED_VQ_UNSPEC,
+	TCA_GRED_VQ_PAD,
+	TCA_GRED_VQ_DP,			/* u32 */
+	TCA_GRED_VQ_STAT_BYTES,		/* u64 */
+	TCA_GRED_VQ_STAT_PACKETS,	/* u32 */
+	TCA_GRED_VQ_STAT_BACKLOG,	/* u32 */
+	TCA_GRED_VQ_STAT_PROB_DROP,	/* u32 */
+	TCA_GRED_VQ_STAT_PROB_MARK,	/* u32 */
+	TCA_GRED_VQ_STAT_FORCED_DROP,	/* u32 */
+	TCA_GRED_VQ_STAT_FORCED_MARK,	/* u32 */
+	TCA_GRED_VQ_STAT_PDROP,		/* u32 */
+	TCA_GRED_VQ_STAT_OTHER,		/* u32 */
+	TCA_GRED_VQ_FLAGS,		/* u32 */
+	__TCA_GRED_VQ_MAX
+};
+
+#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1)
+
+struct tc_gred_qopt {
+	__u32		limit;        /* HARD maximal queue length (bytes)    */
+	__u32		qth_min;      /* Min average length threshold (bytes) */
+	__u32		qth_max;      /* Max average length threshold (bytes) */
+	__u32		DP;           /* up to 2^32 DPs */
+	__u32		backlog;
+	__u32		qave;
+	__u32		forced;
+	__u32		early;
+	__u32		other;
+	__u32		pdrop;
+	__u8		Wlog;         /* log(W)               */
+	__u8		Plog;         /* log(P_max/(qth_max-qth_min)) */
+	__u8		Scell_log;    /* cell size for idle damping */
+	__u8		prio;         /* prio of this VQ */
+	__u32		packets;
+	__u32		bytesin;
+};
+
+/* gred setup */
+struct tc_gred_sopt {
+	__u32		DPs;
+	__u32		def_DP;
+	__u8		grio;
+	__u8		flags;
+	__u16		pad1;
+};
+
+/* CHOKe section */
+
+enum {
+	TCA_CHOKE_UNSPEC,
+	TCA_CHOKE_PARMS,
+	TCA_CHOKE_STAB,
+	TCA_CHOKE_MAX_P,
+	__TCA_CHOKE_MAX,
+};
+
+#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1)
+
+struct tc_choke_qopt {
+	__u32		limit;		/* Hard queue length (packets)	*/
+	__u32		qth_min;	/* Min average threshold (packets) */
+	__u32		qth_max;	/* Max average threshold (packets) */
+	unsigned char   Wlog;		/* log(W)		*/
+	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/
+	unsigned char   Scell_log;	/* cell size for idle damping */
+	unsigned char	flags;		/* see RED flags */
+};
+
+struct tc_choke_xstats {
+	__u32		early;          /* Early drops */
+	__u32		pdrop;          /* Drops due to queue limits */
+	__u32		other;          /* Drops due to drop() calls */
+	__u32		marked;         /* Marked packets */
+	__u32		matched;	/* Drops due to flow match */
+};
+
+/* HTB section */
+#define TC_HTB_NUMPRIO		8
+#define TC_HTB_MAXDEPTH		8
+#define TC_HTB_PROTOVER		3 /* the same as HTB and TC's major */
+
+struct tc_htb_opt {
+	struct tc_ratespec 	rate;
+	struct tc_ratespec 	ceil;
+	__u32	buffer;
+	__u32	cbuffer;
+	__u32	quantum;
+	__u32	level;		/* out only */
+	__u32	prio;
+};
+struct tc_htb_glob {
+	__u32 version;		/* to match HTB/TC */
+	__u32 rate2quantum;	/* bps->quantum divisor */
+	__u32 defcls;		/* default class number */
+	__u32 debug;		/* debug flags */
+
+	/* stats */
+	__u32 direct_pkts; /* count of non shaped packets */
+};
+enum {
+	TCA_HTB_UNSPEC,
+	TCA_HTB_PARMS,
+	TCA_HTB_INIT,
+	TCA_HTB_CTAB,
+	TCA_HTB_RTAB,
+	TCA_HTB_DIRECT_QLEN,
+	TCA_HTB_RATE64,
+	TCA_HTB_CEIL64,
+	TCA_HTB_PAD,
+	__TCA_HTB_MAX,
+};
+
+#define TCA_HTB_MAX (__TCA_HTB_MAX - 1)
+
+struct tc_htb_xstats {
+	__u32 lends;
+	__u32 borrows;
+	__u32 giants;	/* unused since 'Make HTB scheduler work with TSO.' */
+	__s32 tokens;
+	__s32 ctokens;
+};
+
+/* HFSC section */
+
+struct tc_hfsc_qopt {
+	__u16	defcls;		/* default class */
+};
+
+struct tc_service_curve {
+	__u32	m1;		/* slope of the first segment in bps */
+	__u32	d;		/* x-projection of the first segment in us */
+	__u32	m2;		/* slope of the second segment in bps */
+};
+
+struct tc_hfsc_stats {
+	__u64	work;		/* total work done */
+	__u64	rtwork;		/* work done by real-time criteria */
+	__u32	period;		/* current period */
+	__u32	level;		/* class level in hierarchy */
+};
+
+enum {
+	TCA_HFSC_UNSPEC,
+	TCA_HFSC_RSC,
+	TCA_HFSC_FSC,
+	TCA_HFSC_USC,
+	__TCA_HFSC_MAX,
+};
+
+#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1)
+
+
+/* CBQ section */
+
+#define TC_CBQ_MAXPRIO		8
+#define TC_CBQ_MAXLEVEL		8
+#define TC_CBQ_DEF_EWMA		5
+
+struct tc_cbq_lssopt {
+	unsigned char	change;
+	unsigned char	flags;
+#define TCF_CBQ_LSS_BOUNDED	1
+#define TCF_CBQ_LSS_ISOLATED	2
+	unsigned char  	ewma_log;
+	unsigned char  	level;
+#define TCF_CBQ_LSS_FLAGS	1
+#define TCF_CBQ_LSS_EWMA	2
+#define TCF_CBQ_LSS_MAXIDLE	4
+#define TCF_CBQ_LSS_MINIDLE	8
+#define TCF_CBQ_LSS_OFFTIME	0x10
+#define TCF_CBQ_LSS_AVPKT	0x20
+	__u32		maxidle;
+	__u32		minidle;
+	__u32		offtime;
+	__u32		avpkt;
+};
+
+struct tc_cbq_wrropt {
+	unsigned char	flags;
+	unsigned char	priority;
+	unsigned char	cpriority;
+	unsigned char	__reserved;
+	__u32		allot;
+	__u32		weight;
+};
+
+struct tc_cbq_ovl {
+	unsigned char	strategy;
+#define	TC_CBQ_OVL_CLASSIC	0
+#define	TC_CBQ_OVL_DELAY	1
+#define	TC_CBQ_OVL_LOWPRIO	2
+#define	TC_CBQ_OVL_DROP		3
+#define	TC_CBQ_OVL_RCLASSIC	4
+	unsigned char	priority2;
+	__u16		pad;
+	__u32		penalty;
+};
+
+struct tc_cbq_police {
+	unsigned char	police;
+	unsigned char	__res1;
+	unsigned short	__res2;
+};
+
+struct tc_cbq_fopt {
+	__u32		split;
+	__u32		defmap;
+	__u32		defchange;
+};
+
+struct tc_cbq_xstats {
+	__u32		borrows;
+	__u32		overactions;
+	__s32		avgidle;
+	__s32		undertime;
+};
+
+enum {
+	TCA_CBQ_UNSPEC,
+	TCA_CBQ_LSSOPT,
+	TCA_CBQ_WRROPT,
+	TCA_CBQ_FOPT,
+	TCA_CBQ_OVL_STRATEGY,
+	TCA_CBQ_RATE,
+	TCA_CBQ_RTAB,
+	TCA_CBQ_POLICE,
+	__TCA_CBQ_MAX,
+};
+
+#define TCA_CBQ_MAX	(__TCA_CBQ_MAX - 1)
+
+/* dsmark section */
+
+enum {
+	TCA_DSMARK_UNSPEC,
+	TCA_DSMARK_INDICES,
+	TCA_DSMARK_DEFAULT_INDEX,
+	TCA_DSMARK_SET_TC_INDEX,
+	TCA_DSMARK_MASK,
+	TCA_DSMARK_VALUE,
+	__TCA_DSMARK_MAX,
+};
+
+#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1)
+
+/* ATM  section */
+
+enum {
+	TCA_ATM_UNSPEC,
+	TCA_ATM_FD,		/* file/socket descriptor */
+	TCA_ATM_PTR,		/* pointer to descriptor - later */
+	TCA_ATM_HDR,		/* LL header */
+	TCA_ATM_EXCESS,		/* excess traffic class (0 for CLP)  */
+	TCA_ATM_ADDR,		/* PVC address (for output only) */
+	TCA_ATM_STATE,		/* VC state (ATM_VS_*; for output only) */
+	__TCA_ATM_MAX,
+};
+
+#define TCA_ATM_MAX	(__TCA_ATM_MAX - 1)
+
+/* Network emulator */
+
+enum {
+	TCA_NETEM_UNSPEC,
+	TCA_NETEM_CORR,
+	TCA_NETEM_DELAY_DIST,
+	TCA_NETEM_REORDER,
+	TCA_NETEM_CORRUPT,
+	TCA_NETEM_LOSS,
+	TCA_NETEM_RATE,
+	TCA_NETEM_ECN,
+	TCA_NETEM_RATE64,
+	TCA_NETEM_PAD,
+	TCA_NETEM_LATENCY64,
+	TCA_NETEM_JITTER64,
+	TCA_NETEM_SLOT,
+	TCA_NETEM_SLOT_DIST,
+	__TCA_NETEM_MAX,
+};
+
+#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1)
+
+struct tc_netem_qopt {
+	__u32	latency;	/* added delay (us) */
+	__u32   limit;		/* fifo limit (packets) */
+	__u32	loss;		/* random packet loss (0=none ~0=100%) */
+	__u32	gap;		/* re-ordering gap (0 for none) */
+	__u32   duplicate;	/* random packet dup  (0=none ~0=100%) */
+	__u32	jitter;		/* random jitter in latency (us) */
+};
+
+struct tc_netem_corr {
+	__u32	delay_corr;	/* delay correlation */
+	__u32	loss_corr;	/* packet loss correlation */
+	__u32	dup_corr;	/* duplicate correlation  */
+};
+
+struct tc_netem_reorder {
+	__u32	probability;
+	__u32	correlation;
+};
+
+struct tc_netem_corrupt {
+	__u32	probability;
+	__u32	correlation;
+};
+
+struct tc_netem_rate {
+	__u32	rate;	/* byte/s */
+	__s32	packet_overhead;
+	__u32	cell_size;
+	__s32	cell_overhead;
+};
+
+struct tc_netem_slot {
+	__s64   min_delay; /* nsec */
+	__s64   max_delay;
+	__s32   max_packets;
+	__s32   max_bytes;
+	__s64	dist_delay; /* nsec */
+	__s64	dist_jitter; /* nsec */
+};
+
+enum {
+	NETEM_LOSS_UNSPEC,
+	NETEM_LOSS_GI,		/* General Intuitive - 4 state model */
+	NETEM_LOSS_GE,		/* Gilbert Elliot models */
+	__NETEM_LOSS_MAX
+};
+#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1)
+
+/* State transition probabilities for 4 state model */
+struct tc_netem_gimodel {
+	__u32	p13;
+	__u32	p31;
+	__u32	p32;
+	__u32	p14;
+	__u32	p23;
+};
+
+/* Gilbert-Elliot models */
+struct tc_netem_gemodel {
+	__u32 p;
+	__u32 r;
+	__u32 h;
+	__u32 k1;
+};
+
+#define NETEM_DIST_SCALE	8192
+#define NETEM_DIST_MAX		16384
+
+/* DRR */
+
+enum {
+	TCA_DRR_UNSPEC,
+	TCA_DRR_QUANTUM,
+	__TCA_DRR_MAX
+};
+
+#define TCA_DRR_MAX	(__TCA_DRR_MAX - 1)
+
+struct tc_drr_stats {
+	__u32	deficit;
+};
+
+/* MQPRIO */
+#define TC_QOPT_BITMASK 15
+#define TC_QOPT_MAX_QUEUE 16
+
+enum {
+	TC_MQPRIO_HW_OFFLOAD_NONE,	/* no offload requested */
+	TC_MQPRIO_HW_OFFLOAD_TCS,	/* offload TCs, no queue counts */
+	__TC_MQPRIO_HW_OFFLOAD_MAX
+};
+
+#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1)
+
+enum {
+	TC_MQPRIO_MODE_DCB,
+	TC_MQPRIO_MODE_CHANNEL,
+	__TC_MQPRIO_MODE_MAX
+};
+
+#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1)
+
+enum {
+	TC_MQPRIO_SHAPER_DCB,
+	TC_MQPRIO_SHAPER_BW_RATE,	/* Add new shapers below */
+	__TC_MQPRIO_SHAPER_MAX
+};
+
+#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
+
+struct tc_mqprio_qopt {
+	__u8	num_tc;
+	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
+	__u8	hw;
+	__u16	count[TC_QOPT_MAX_QUEUE];
+	__u16	offset[TC_QOPT_MAX_QUEUE];
+};
+
+#define TC_MQPRIO_F_MODE		0x1
+#define TC_MQPRIO_F_SHAPER		0x2
+#define TC_MQPRIO_F_MIN_RATE		0x4
+#define TC_MQPRIO_F_MAX_RATE		0x8
+
+enum {
+	TCA_MQPRIO_UNSPEC,
+	TCA_MQPRIO_MODE,
+	TCA_MQPRIO_SHAPER,
+	TCA_MQPRIO_MIN_RATE64,
+	TCA_MQPRIO_MAX_RATE64,
+	__TCA_MQPRIO_MAX,
+};
+
+#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1)
+
+/* SFB */
+
+enum {
+	TCA_SFB_UNSPEC,
+	TCA_SFB_PARMS,
+	__TCA_SFB_MAX,
+};
+
+#define TCA_SFB_MAX (__TCA_SFB_MAX - 1)
+
+/*
+ * Note: increment, decrement are Q0.16 fixed-point values.
+ */
+struct tc_sfb_qopt {
+	__u32 rehash_interval;	/* delay between hash move, in ms */
+	__u32 warmup_time;	/* double buffering warmup time in ms (warmup_time < rehash_interval) */
+	__u32 max;		/* max len of qlen_min */
+	__u32 bin_size;		/* maximum queue length per bin */
+	__u32 increment;	/* probability increment, (d1 in Blue) */
+	__u32 decrement;	/* probability decrement, (d2 in Blue) */
+	__u32 limit;		/* max SFB queue length */
+	__u32 penalty_rate;	/* inelastic flows are rate limited to 'rate' pps */
+	__u32 penalty_burst;
+};
+
+struct tc_sfb_xstats {
+	__u32 earlydrop;
+	__u32 penaltydrop;
+	__u32 bucketdrop;
+	__u32 queuedrop;
+	__u32 childdrop; /* drops in child qdisc */
+	__u32 marked;
+	__u32 maxqlen;
+	__u32 maxprob;
+	__u32 avgprob;
+};
+
+#define SFB_MAX_PROB 0xFFFF
+
+/* QFQ */
+enum {
+	TCA_QFQ_UNSPEC,
+	TCA_QFQ_WEIGHT,
+	TCA_QFQ_LMAX,
+	__TCA_QFQ_MAX
+};
+
+#define TCA_QFQ_MAX	(__TCA_QFQ_MAX - 1)
+
+struct tc_qfq_stats {
+	__u32 weight;
+	__u32 lmax;
+};
+
+/* CODEL */
+
+enum {
+	TCA_CODEL_UNSPEC,
+	TCA_CODEL_TARGET,
+	TCA_CODEL_LIMIT,
+	TCA_CODEL_INTERVAL,
+	TCA_CODEL_ECN,
+	TCA_CODEL_CE_THRESHOLD,
+	__TCA_CODEL_MAX
+};
+
+#define TCA_CODEL_MAX	(__TCA_CODEL_MAX - 1)
+
+struct tc_codel_xstats {
+	__u32	maxpacket; /* largest packet we've seen so far */
+	__u32	count;	   /* how many drops we've done since the last time we
+			    * entered dropping state
+			    */
+	__u32	lastcount; /* count at entry to dropping state */
+	__u32	ldelay;    /* in-queue delay seen by most recently dequeued packet */
+	__s32	drop_next; /* time to drop next packet */
+	__u32	drop_overlimit; /* number of time max qdisc packet limit was hit */
+	__u32	ecn_mark;  /* number of packets we ECN marked instead of dropped */
+	__u32	dropping;  /* are we in dropping state ? */
+	__u32	ce_mark;   /* number of CE marked packets because of ce_threshold */
+};
+
+/* FQ_CODEL */
+
+enum {
+	TCA_FQ_CODEL_UNSPEC,
+	TCA_FQ_CODEL_TARGET,
+	TCA_FQ_CODEL_LIMIT,
+	TCA_FQ_CODEL_INTERVAL,
+	TCA_FQ_CODEL_ECN,
+	TCA_FQ_CODEL_FLOWS,
+	TCA_FQ_CODEL_QUANTUM,
+	TCA_FQ_CODEL_CE_THRESHOLD,
+	TCA_FQ_CODEL_DROP_BATCH_SIZE,
+	TCA_FQ_CODEL_MEMORY_LIMIT,
+	__TCA_FQ_CODEL_MAX
+};
+
+#define TCA_FQ_CODEL_MAX	(__TCA_FQ_CODEL_MAX - 1)
+
+enum {
+	TCA_FQ_CODEL_XSTATS_QDISC,
+	TCA_FQ_CODEL_XSTATS_CLASS,
+};
+
+struct tc_fq_codel_qd_stats {
+	__u32	maxpacket;	/* largest packet we've seen so far */
+	__u32	drop_overlimit; /* number of time max qdisc
+				 * packet limit was hit
+				 */
+	__u32	ecn_mark;	/* number of packets we ECN marked
+				 * instead of being dropped
+				 */
+	__u32	new_flow_count; /* number of time packets
+				 * created a 'new flow'
+				 */
+	__u32	new_flows_len;	/* count of flows in new list */
+	__u32	old_flows_len;	/* count of flows in old list */
+	__u32	ce_mark;	/* packets above ce_threshold */
+	__u32	memory_usage;	/* in bytes */
+	__u32	drop_overmemory;
+};
+
+struct tc_fq_codel_cl_stats {
+	__s32	deficit;
+	__u32	ldelay;		/* in-queue delay seen by most recently
+				 * dequeued packet
+				 */
+	__u32	count;
+	__u32	lastcount;
+	__u32	dropping;
+	__s32	drop_next;
+};
+
+struct tc_fq_codel_xstats {
+	__u32	type;
+	union {
+		struct tc_fq_codel_qd_stats qdisc_stats;
+		struct tc_fq_codel_cl_stats class_stats;
+	};
+};
+
+/* FQ */
+
+enum {
+	TCA_FQ_UNSPEC,
+
+	TCA_FQ_PLIMIT,		/* limit of total number of packets in queue */
+
+	TCA_FQ_FLOW_PLIMIT,	/* limit of packets per flow */
+
+	TCA_FQ_QUANTUM,		/* RR quantum */
+
+	TCA_FQ_INITIAL_QUANTUM,		/* RR quantum for new flow */
+
+	TCA_FQ_RATE_ENABLE,	/* enable/disable rate limiting */
+
+	TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */
+
+	TCA_FQ_FLOW_MAX_RATE,	/* per flow max rate */
+
+	TCA_FQ_BUCKETS_LOG,	/* log2(number of buckets) */
+
+	TCA_FQ_FLOW_REFILL_DELAY,	/* flow credit refill delay in usec */
+
+	TCA_FQ_ORPHAN_MASK,	/* mask applied to orphaned skb hashes */
+
+	TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */
+
+	TCA_FQ_CE_THRESHOLD,	/* DCTCP-like CE-marking threshold */
+
+	__TCA_FQ_MAX
+};
+
+#define TCA_FQ_MAX	(__TCA_FQ_MAX - 1)
+
+struct tc_fq_qd_stats {
+	__u64	gc_flows;
+	__u64	highprio_packets;
+	__u64	tcp_retrans;
+	__u64	throttled;
+	__u64	flows_plimit;
+	__u64	pkts_too_long;
+	__u64	allocation_errors;
+	__s64	time_next_delayed_flow;
+	__u32	flows;
+	__u32	inactive_flows;
+	__u32	throttled_flows;
+	__u32	unthrottle_latency_ns;
+	__u64	ce_mark;		/* packets above ce_threshold */
+};
+
+/* Heavy-Hitter Filter */
+
+enum {
+	TCA_HHF_UNSPEC,
+	TCA_HHF_BACKLOG_LIMIT,
+	TCA_HHF_QUANTUM,
+	TCA_HHF_HH_FLOWS_LIMIT,
+	TCA_HHF_RESET_TIMEOUT,
+	TCA_HHF_ADMIT_BYTES,
+	TCA_HHF_EVICT_TIMEOUT,
+	TCA_HHF_NON_HH_WEIGHT,
+	__TCA_HHF_MAX
+};
+
+#define TCA_HHF_MAX	(__TCA_HHF_MAX - 1)
+
+struct tc_hhf_xstats {
+	__u32	drop_overlimit; /* number of times max qdisc packet limit
+				 * was hit
+				 */
+	__u32	hh_overlimit;   /* number of times max heavy-hitters was hit */
+	__u32	hh_tot_count;   /* number of captured heavy-hitters so far */
+	__u32	hh_cur_count;   /* number of current heavy-hitters */
+};
+
+/* PIE */
+enum {
+	TCA_PIE_UNSPEC,
+	TCA_PIE_TARGET,
+	TCA_PIE_LIMIT,
+	TCA_PIE_TUPDATE,
+	TCA_PIE_ALPHA,
+	TCA_PIE_BETA,
+	TCA_PIE_ECN,
+	TCA_PIE_BYTEMODE,
+	__TCA_PIE_MAX
+};
+#define TCA_PIE_MAX   (__TCA_PIE_MAX - 1)
+
+struct tc_pie_xstats {
+	__u32 prob;             /* current probability */
+	__u32 delay;            /* current delay in ms */
+	__u32 avg_dq_rate;      /* current average dq_rate in bits/pie_time */
+	__u32 packets_in;       /* total number of packets enqueued */
+	__u32 dropped;          /* packets dropped due to pie_action */
+	__u32 overlimit;        /* dropped due to lack of space in queue */
+	__u32 maxq;             /* maximum queue size */
+	__u32 ecn_mark;         /* packets marked with ecn*/
+};
+
+/* CBS */
+struct tc_cbs_qopt {
+	__u8 offload;
+	__u8 _pad[3];
+	__s32 hicredit;
+	__s32 locredit;
+	__s32 idleslope;
+	__s32 sendslope;
+};
+
+enum {
+	TCA_CBS_UNSPEC,
+	TCA_CBS_PARMS,
+	__TCA_CBS_MAX,
+};
+
+#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
+
+/* ETF */
+struct tc_etf_qopt {
+	__s32 delta;
+	__s32 clockid;
+	__u32 flags;
+#define TC_ETF_DEADLINE_MODE_ON	BIT(0)
+#define TC_ETF_OFFLOAD_ON	BIT(1)
+};
+
+enum {
+	TCA_ETF_UNSPEC,
+	TCA_ETF_PARMS,
+	__TCA_ETF_MAX,
+};
+
+#define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
+
+
+/* CAKE */
+enum {
+	TCA_CAKE_UNSPEC,
+	TCA_CAKE_PAD,
+	TCA_CAKE_BASE_RATE64,
+	TCA_CAKE_DIFFSERV_MODE,
+	TCA_CAKE_ATM,
+	TCA_CAKE_FLOW_MODE,
+	TCA_CAKE_OVERHEAD,
+	TCA_CAKE_RTT,
+	TCA_CAKE_TARGET,
+	TCA_CAKE_AUTORATE,
+	TCA_CAKE_MEMORY,
+	TCA_CAKE_NAT,
+	TCA_CAKE_RAW,
+	TCA_CAKE_WASH,
+	TCA_CAKE_MPU,
+	TCA_CAKE_INGRESS,
+	TCA_CAKE_ACK_FILTER,
+	TCA_CAKE_SPLIT_GSO,
+	__TCA_CAKE_MAX
+};
+#define TCA_CAKE_MAX	(__TCA_CAKE_MAX - 1)
+
+enum {
+	__TCA_CAKE_STATS_INVALID,
+	TCA_CAKE_STATS_PAD,
+	TCA_CAKE_STATS_CAPACITY_ESTIMATE64,
+	TCA_CAKE_STATS_MEMORY_LIMIT,
+	TCA_CAKE_STATS_MEMORY_USED,
+	TCA_CAKE_STATS_AVG_NETOFF,
+	TCA_CAKE_STATS_MIN_NETLEN,
+	TCA_CAKE_STATS_MAX_NETLEN,
+	TCA_CAKE_STATS_MIN_ADJLEN,
+	TCA_CAKE_STATS_MAX_ADJLEN,
+	TCA_CAKE_STATS_TIN_STATS,
+	TCA_CAKE_STATS_DEFICIT,
+	TCA_CAKE_STATS_COBALT_COUNT,
+	TCA_CAKE_STATS_DROPPING,
+	TCA_CAKE_STATS_DROP_NEXT_US,
+	TCA_CAKE_STATS_P_DROP,
+	TCA_CAKE_STATS_BLUE_TIMER_US,
+	__TCA_CAKE_STATS_MAX
+};
+#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1)
+
+enum {
+	__TCA_CAKE_TIN_STATS_INVALID,
+	TCA_CAKE_TIN_STATS_PAD,
+	TCA_CAKE_TIN_STATS_SENT_PACKETS,
+	TCA_CAKE_TIN_STATS_SENT_BYTES64,
+	TCA_CAKE_TIN_STATS_DROPPED_PACKETS,
+	TCA_CAKE_TIN_STATS_DROPPED_BYTES64,
+	TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS,
+	TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64,
+	TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS,
+	TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64,
+	TCA_CAKE_TIN_STATS_BACKLOG_PACKETS,
+	TCA_CAKE_TIN_STATS_BACKLOG_BYTES,
+	TCA_CAKE_TIN_STATS_THRESHOLD_RATE64,
+	TCA_CAKE_TIN_STATS_TARGET_US,
+	TCA_CAKE_TIN_STATS_INTERVAL_US,
+	TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS,
+	TCA_CAKE_TIN_STATS_WAY_MISSES,
+	TCA_CAKE_TIN_STATS_WAY_COLLISIONS,
+	TCA_CAKE_TIN_STATS_PEAK_DELAY_US,
+	TCA_CAKE_TIN_STATS_AVG_DELAY_US,
+	TCA_CAKE_TIN_STATS_BASE_DELAY_US,
+	TCA_CAKE_TIN_STATS_SPARSE_FLOWS,
+	TCA_CAKE_TIN_STATS_BULK_FLOWS,
+	TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS,
+	TCA_CAKE_TIN_STATS_MAX_SKBLEN,
+	TCA_CAKE_TIN_STATS_FLOW_QUANTUM,
+	__TCA_CAKE_TIN_STATS_MAX
+};
+#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1)
+#define TC_CAKE_MAX_TINS (8)
+
+enum {
+	CAKE_FLOW_NONE = 0,
+	CAKE_FLOW_SRC_IP,
+	CAKE_FLOW_DST_IP,
+	CAKE_FLOW_HOSTS,    /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */
+	CAKE_FLOW_FLOWS,
+	CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */
+	CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */
+	CAKE_FLOW_TRIPLE,   /* = CAKE_FLOW_HOSTS  | CAKE_FLOW_FLOWS */
+	CAKE_FLOW_MAX,
+};
+
+enum {
+	CAKE_DIFFSERV_DIFFSERV3 = 0,
+	CAKE_DIFFSERV_DIFFSERV4,
+	CAKE_DIFFSERV_DIFFSERV8,
+	CAKE_DIFFSERV_BESTEFFORT,
+	CAKE_DIFFSERV_PRECEDENCE,
+	CAKE_DIFFSERV_MAX
+};
+
+enum {
+	CAKE_ACK_NONE = 0,
+	CAKE_ACK_FILTER,
+	CAKE_ACK_AGGRESSIVE,
+	CAKE_ACK_MAX
+};
+
+enum {
+	CAKE_ATM_NONE = 0,
+	CAKE_ATM_ATM,
+	CAKE_ATM_PTM,
+	CAKE_ATM_MAX
+};
+
+
+/* TAPRIO */
+enum {
+	TC_TAPRIO_CMD_SET_GATES = 0x00,
+	TC_TAPRIO_CMD_SET_AND_HOLD = 0x01,
+	TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02,
+};
+
+enum {
+	TCA_TAPRIO_SCHED_ENTRY_UNSPEC,
+	TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */
+	TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */
+	TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */
+	TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */
+	__TCA_TAPRIO_SCHED_ENTRY_MAX,
+};
+#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1)
+
+/* The format for schedule entry list is:
+ * [TCA_TAPRIO_SCHED_ENTRY_LIST]
+ *   [TCA_TAPRIO_SCHED_ENTRY]
+ *     [TCA_TAPRIO_SCHED_ENTRY_CMD]
+ *     [TCA_TAPRIO_SCHED_ENTRY_GATES]
+ *     [TCA_TAPRIO_SCHED_ENTRY_INTERVAL]
+ */
+enum {
+	TCA_TAPRIO_SCHED_UNSPEC,
+	TCA_TAPRIO_SCHED_ENTRY,
+	__TCA_TAPRIO_SCHED_MAX,
+};
+
+#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1)
+
+enum {
+	TCA_TAPRIO_ATTR_UNSPEC,
+	TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
+	TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */
+	TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */
+	TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */
+	TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
+	TCA_TAPRIO_PAD,
+	__TCA_TAPRIO_ATTR_MAX,
+};
+
+#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1)
+
+#endif
-- 
Gitee


From 313f0df54730f12e1b2bb67f26584d09a75c8771 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:15 +0800
Subject: [PATCH 54/58] bpftool: Fix compilation failure for net.o with older 
 glibc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ANBZ: #5530

commit 6f02b540d7597f357bc6ee711346761045d4e108 upstream

For older glibc ~2.17, #include'ing both linux/if.h and net/if.h
fails due to complaints about redefinition of interface flags:

  CC       net.o
In file included from net.c:13:0:
/usr/include/linux/if.h:71:2: error: redeclaration of enumerator ‘IFF_UP’
  IFF_UP    = 1<<0,  /* sysfs */
  ^
/usr/include/net/if.h:44:5: note: previous definition of ‘IFF_UP’ was here
     IFF_UP = 0x1,  /* Interface is up.  */

The issue was fixed in kernel headers in [1], but since compilation
of net.c picks up system headers the problem can recur.

Dropping #include <linux/if.h> resolves the issue and it is
not needed for compilation anyhow.

[1] https://lore.kernel.org/netdev/1461512707-23058-1-git-send-email-mikko.rapeli__34748.27880641$1462831734$gmane$org@iki.fi/

Fixes: f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/1609948746-15369-1-git-send-email-alan.maguire@oracle.com
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 tools/bpf/bpftool/net.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
index ed205ee57655..63b47b8e5e41 100644
--- a/tools/bpf/bpftool/net.c
+++ b/tools/bpf/bpftool/net.c
@@ -8,7 +8,6 @@
 #include <unistd.h>
 #include <libbpf.h>
 #include <net/if.h>
-#include <linux/if.h>
 #include <linux/rtnetlink.h>
 #include <linux/tc_act/tc_bpf.h>
 #include <sys/socket.h>
-- 
Gitee


From 11222a3701d7b820435ada8a06cb21fe7fb2989b Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:15 +0800
Subject: [PATCH 55/58] bpf: add bpffs pretty print for program array map

ANBZ: #5530

commit a7c19db38d62fc1ce797dba19936e9f81cf2b9fb upstream

Added bpffs pretty print for program array map. For a particular
array index, if the program array points to a valid program,
the "<index>: <prog_id>" will be printed out like
   0: 6
which means bpf program with id "6" is installed at index "0".

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 kernel/bpf/arraymap.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0c17aab3ce5f..1b2cccff5983 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -529,6 +529,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map)
 		fd_array_map_delete_elem(map, &i);
 }
 
+static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
+					 struct seq_file *m)
+{
+	void **elem, *ptr;
+	u32 prog_id;
+
+	rcu_read_lock();
+
+	elem = array_map_lookup_elem(map, key);
+	if (elem) {
+		ptr = READ_ONCE(*elem);
+		if (ptr) {
+			seq_printf(m, "%u: ", *(u32 *)key);
+			prog_id = prog_fd_array_sys_lookup_elem(ptr);
+			btf_type_seq_show(map->btf, map->btf_value_type_id,
+					  &prog_id, m);
+			seq_puts(m, "\n");
+		}
+	}
+
+	rcu_read_unlock();
+}
+
 const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -540,7 +563,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
 	.map_release_uref = bpf_fd_array_map_clear,
-	.map_check_btf = map_check_no_btf,
+	.map_seq_show_elem = prog_array_map_seq_show_elem,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
-- 
Gitee


From 53cae1fca0d64309a3a0a0ca467bc286b39c0a36 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:16 +0800
Subject: [PATCH 56/58] tools/bpf: bpftool: support prog array map and map of 
 maps

ANBZ: #5530

commit ad3338d2508cd7752accdd39881deced1ec2b8a1 upstream

Currently, prog array map and map of maps are not supported
in bpftool. This patch added the support.
Different from other map types, for prog array map and
map of maps, the key returned bpf_get_next_key() may not
point to a valid value. So for these two map types,
no error will be printed out when such a scenario happens.

The following is the plain and json dump if btf is not available:
  $ ./bpftool map dump id 10
    key: 08 00 00 00  value: 5c 01 00 00
    Found 1 element
  $ ./bpftool -jp map dump id 10
    [{
        "key": ["0x08","0x00","0x00","0x00"
        ],
        "value": ["0x5c","0x01","0x00","0x00"
        ]
    }]

If the BTF is available, the dump looks below:
  $ ./bpftool map dump id 2
    [{
            "key": 0,
            "value": 7
        }
    ]
  $ ./bpftool -jp map dump id 2
    [{
        "key": ["0x00","0x00","0x00","0x00"
        ],
        "value": ["0x07","0x00","0x00","0x00"
        ],
        "formatted": {
            "key": 0,
            "value": 7
        }
    }]

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: zhaoxuezhi <zhaoxuezhi@uniontech.com>
Acked-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1831
---
 tools/bpf/bpftool/map.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index ec73d83d0d31..f41e2907a336 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -660,12 +660,6 @@ static int do_dump(int argc, char **argv)
 	if (fd < 0)
 		return -1;
 
-	if (map_is_map_of_maps(info.type) || map_is_map_of_progs(info.type)) {
-		p_err("Dumping maps of maps and program maps not supported");
-		close(fd);
-		return -1;
-	}
-
 	key = malloc(info.key_size);
 	value = alloc_value(&info);
 	if (!key || !value) {
@@ -719,7 +713,9 @@ static int do_dump(int argc, char **argv)
 				} else {
 					print_entry_plain(&info, key, value);
 				}
-		} else {
+			num_elems++;
+		} else if (!map_is_map_of_maps(info.type) &&
+			   !map_is_map_of_progs(info.type)) {
 			if (json_output) {
 				jsonw_name(json_wtr, "key");
 				print_hex_data_json(key, info.key_size);
@@ -736,7 +732,6 @@ static int do_dump(int argc, char **argv)
 		}
 
 		prev_key = key;
-		num_elems++;
 	}
 
 	if (json_output)
-- 
Gitee


From 48e49bbfb870fb2f99ea427b8eba26bcd9d1011a Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:17 +0800
Subject: [PATCH 57/58] sched/rt: pick_next_rt_entity(): check list_entry

ANBZ: #4281

commit 7c4a5b89a0b5a57a64b601775b296abf77a9fe97 upstream.

Commit 326587b84078 ("sched: fix goto retry in pick_next_task_rt()")
removed any path which could make pick_next_rt_entity() return NULL.
However, BUG_ON(!rt_se) in _pick_next_task_rt() (the only caller of
pick_next_rt_entity()) still checks the error condition, which can
never happen, since list_entry() never returns NULL.
Remove the BUG_ON check, and instead emit a warning in the only
possible error condition here: the queue being empty which should
never happen.

Fixes: 326587b84078 ("sched: fix goto retry in pick_next_task_rt()")
Signed-off-by: Pietro Borrello <borrello@diag.uniroma1.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20230128-list-entry-null-check-sched-v3-1-b1a71bd1ac6b@diag.uniroma1.it
Signed-off-by: Erwei Deng <erwei@linux.alibaba.com>

[Fixes conflicts]
Fixes: CVE-2023-1077
Signed-off-by: hengqi.hq@alibaba-inc.com
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Cruz Zhao <CruzZhao@linux.alibaba.com>
Reviewed-by: Artie Ding <artie.ding@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1459
---
 kernel/sched/rt.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f5f522d5151a..53e294313ab7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1583,6 +1583,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	BUG_ON(idx >= MAX_RT_PRIO);
 
 	queue = array->queue + idx;
+	if (SCHED_WARN_ON(list_empty(queue)))
+		return NULL;
 	next = list_entry(queue->next, struct sched_rt_entity, run_list);
 
 	return next;
@@ -1596,7 +1598,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 
 	do {
 		rt_se = pick_next_rt_entity(rq, rt_rq);
-		BUG_ON(!rt_se);
+		if (unlikely(!rt_se))
+			return NULL;
 		rt_rq = group_rt_rq(rt_se);
 	} while (rt_rq);
 
-- 
Gitee


From e095f9d69eeecad128f7534fba1050b9fd49b953 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Tue, 11 Jul 2023 10:45:18 +0800
Subject: [PATCH 58/58] netfilter: nf_tables: incorrect error path handling 
 with NFT_MSG_NEWRULE

ANBZ: #5703

commit 1240eb93f0616b21c675416516ff3d74798fdc97 upstream.

[Fixes conflicts]
Fixes: CVE-2023-3117
Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
Signed-off-by: Xiao Long <xiaolong@openanolis.org>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/1864
---
 net/netfilter/nf_tables_api.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3676317e8880..bf19b384f940 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2730,7 +2730,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 
 	return 0;
 err2:
-	nf_tables_rule_release(&ctx, rule);
+	nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_RELEASE);
+	nf_tables_rule_destroy(&ctx, rule);
 err1:
 	for (i = 0; i < n; i++) {
 		if (info[i].ops) {
-- 
Gitee