From 8af4501bfda56bb325a3222df0c1a50c347f64d6 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:03:59 +0800 Subject: [PATCH 001/134] ext4: move inode eio simulation behind io completion mainline inclusion from mainline-5.15-rc1 commit 0904c9ae3465c7acc066a564a76b75c0af83e6c7 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0904c9ae3465c7acc066a564a76b75c0af83e6c7 Signed-off-by: Yu Changchun --------------------------- No EIO simulation is required if the buffer is uptodate, so move the simulation behind read bio completion, just like the inode/block bitmap simulation does. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210826130412.3921207-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 05ab7b65ea52..9fa1ed48b6b2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4341,8 +4341,6 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; - if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) - goto simulate_eio; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -4429,8 +4427,8 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); blk_finish_plug(&plug); wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { - simulate_eio: if (ret_block) *ret_block = block; brelse(bh); -- Gitee From bde0f2d3354a044f4aa7bbe5506566679dcd9054 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:04:00 +0800 Subject: [PATCH 002/134] ext4: make the updating inode data procedure atomic mainline inclusion from mainline-5.15-rc1 commit baaae979b112642a41b71c71c599d875c067d257 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=baaae979b112642a41b71c71c599d875c067d257 Signed-off-by: Yu Changchun --------------------------- Currently, ext4_do_update_inode() returns an error before filling in the rest of the inode data if we fail to set the inode blocks in ext4_inode_blocks_set(). In theory this error should never happen, since sb->s_maxbytes should not have allowed it; we already initialize sb->s_maxbytes according to the huge_file feature in ext4_fill_super(). So even though it could only happen due to filesystem corruption, we'd better return only after we finish updating the inode, because otherwise we may leave an uninitialized buffer that could be read later in "errors=continue" mode. This patch makes the inode-updating procedure atomic: it calls EXT4_ERROR_INODE() after dropping i_raw_lock when something bad has happened, makes sure the on-disk inode is consistent, and also drops a BUG_ON and does some small cleanups.
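For illustration, here is a minimal user-space sketch of the locking pattern adopted above (an editorial example with hypothetical names, not the kernel code): the whole raw copy is filled under the lock even when a bad field is found, the error is only remembered, and it is reported after the lock is dropped, so a reader can never observe a half-initialized buffer.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct raw_record {
	char payload[32];
	unsigned int checksum;
};

static pthread_spinlock_t raw_lock;

/* Fill every field even if the source is bad, so the record is always
 * internally consistent; just remember that an error happened. */
static int fill_raw_record(struct raw_record *dst, const char *src)
{
	int err = 0;

	if (strlen(src) >= sizeof(dst->payload))
		err = -1;			/* remember, but keep going */
	snprintf(dst->payload, sizeof(dst->payload), "%s", src);
	dst->checksum = (unsigned int)strlen(dst->payload);
	return err;
}

static int update_record(struct raw_record *rec, const char *src)
{
	int err;

	pthread_spin_lock(&raw_lock);
	err = fill_raw_record(rec, src);	/* fully written either way */
	pthread_spin_unlock(&raw_lock);
	if (err)				/* report only after unlock */
		fprintf(stderr, "update_record: corrupted input\n");
	return err;
}

int main(void)
{
	struct raw_record rec;

	pthread_spin_init(&raw_lock, PTHREAD_PROCESS_PRIVATE);
	return update_record(&rec, "a string far too long for the payload field") ? 1 : 0;
}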
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210826130412.3921207-4-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9fa1ed48b6b2..c5e0bd992b17 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4937,8 +4937,14 @@ static int ext4_inode_blocks_set(handle_t *handle, ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } + + /* + * This should never happen since sb->s_maxbytes should not have + * allowed this, sb->s_maxbytes was set according to the huge_file + * feature in ext4_fill_super(). + */ if (!ext4_has_feature_huge_file(sb)) - return -EFBIG; + return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* @@ -5045,16 +5051,14 @@ static int ext4_do_update_inode(handle_t *handle, spin_lock(&ei->i_raw_lock); - /* For fields not tracked in the in-memory inode, - * initialise them to zero for new inodes. */ + /* + * For fields not tracked in the in-memory inode, initialise them + * to zero for new inodes. + */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); err = ext4_inode_blocks_set(handle, raw_inode, ei); - if (err) { - spin_unlock(&ei->i_raw_lock); - goto out_brelse; - } raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); @@ -5063,10 +5067,11 @@ static int ext4_do_update_inode(handle_t *handle, if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); -/* - * Fix up interoperability with old kernels. Otherwise, old inodes get - * re-used with the upper 16 bits of the uid/gid intact - */ + /* + * Fix up interoperability with old kernels. Otherwise, + * old inodes get re-used with the upper 16 bits of the + * uid/gid intact. 
+ */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); @@ -5135,8 +5140,9 @@ static int ext4_do_update_inode(handle_t *handle, } } - BUG_ON(!ext4_has_feature_project(inode->i_sb) && - i_projid != EXT4_DEF_PROJID); + if (i_projid != EXT4_DEF_PROJID && + !ext4_has_feature_project(inode->i_sb)) + err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) @@ -5144,6 +5150,11 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); + if (err) { + EXT4_ERROR_INODE(inode, "corrupted inode contents"); + goto out_brelse; + } + if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); @@ -5151,13 +5162,13 @@ static int ext4_do_update_inode(handle_t *handle, BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) - goto out_brelse; + goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) - goto out_brelse; + goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); @@ -5167,9 +5178,10 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); +out_error: + ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); - ext4_std_error(inode->i_sb, err); return err; } -- Gitee From a9adbf4104f9d01d90c6e0e2652c3e3f7c957caf Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:04:01 +0800 Subject: [PATCH 003/134] ext4: factor out ext4_fill_raw_inode() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Factor out ext4_fill_raw_inode() from ext4_do_update_inode(), which is used to fill the in-mem inode contents into the inode table buffer, in preparation for initializing the exclusive inode buffer without reading the block in __ext4_get_inode_loc(). Signed-off-by: Zhang Yi Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 85 +++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c5e0bd992b17..cbc811ec1ef9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4919,9 +4919,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, return ERR_PTR(ret); } -static int ext4_inode_blocks_set(handle_t *handle, - struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) +static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); @@ -5028,37 +5027,16 @@ static void ext4_update_other_inodes_time(struct super_block *sb, rcu_read_unlock(); } -/* - * Post the struct inode info into an on-disk inode location in the - * buffer-cache. This gobbles the caller's reference to the - * buffer_head in the inode location struct. - * - * The caller must have write access to iloc->bh.
- */ -static int ext4_do_update_inode(handle_t *handle, - struct inode *inode, - struct ext4_iloc *iloc) +static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { - struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); - struct buffer_head *bh = iloc->bh; - struct super_block *sb = inode->i_sb; - int err = 0, block; - int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; projid_t i_projid; + int block; + int err; - spin_lock(&ei->i_raw_lock); - - /* - * For fields not tracked in the in-memory inode, initialise them - * to zero for new inodes. - */ - if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) - memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - - err = ext4_inode_blocks_set(handle, raw_inode, ei); + err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); @@ -5100,16 +5078,8 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) { - ext4_isize_set(raw_inode, ei->i_disksize); - need_datasync = 1; - } - if (ei->i_disksize > 0x7fffffffULL) { - if (!ext4_has_feature_large_file(sb) || - EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) - set_large_file = 1; - } + ext4_isize_set(raw_inode, ei->i_disksize); + raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { @@ -5149,6 +5119,45 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); + return err; +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; + int err; + int need_datasync = 0, set_large_file = 0; + + spin_lock(&ei->i_raw_lock); + + /* + * For fields not tracked in the in-memory inode, initialise them + * to zero for new inodes. + */ + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + + if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) + need_datasync = 1; + if (ei->i_disksize > 0x7fffffffULL) { + if (!ext4_has_feature_large_file(sb) || + EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; + } + + err = ext4_fill_raw_inode(inode, raw_inode); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); -- Gitee From 321e36ab1c002398f545cff00bb9e572afa52366 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:04:02 +0800 Subject: [PATCH 004/134] ext4: move ext4_fill_raw_inode() related functions maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- In preparation for calling ext4_fill_raw_inode() in __ext4_get_inode_loc(), move three related functions before __ext4_get_inode_loc(), no logical change. 
Signed-off-by: Zhang Yi Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 293 ++++++++++++++++++++++++------------------------ 1 file changed, 147 insertions(+), 146 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbc811ec1ef9..1eefbc464964 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4303,6 +4303,153 @@ int ext4_truncate(struct inode *inode) return err; } +static inline u64 ext4_inode_peek_iversion(const struct inode *inode) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return inode_peek_iversion_raw(inode); + else + return inode_peek_iversion(inode); +} + +static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + struct inode *inode = &(ei->vfs_inode); + u64 i_blocks = READ_ONCE(inode->i_blocks); + struct super_block *sb = inode->i_sb; + + if (i_blocks <= ~0U) { + /* + * i_blocks can be represented in a 32 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = 0; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + return 0; + } + + /* + * This should never happen since sb->s_maxbytes should not have + * allowed this, sb->s_maxbytes was set according to the huge_file + * feature in ext4_fill_super(). + */ + if (!ext4_has_feature_huge_file(sb)) + return -EFSCORRUPTED; + + if (i_blocks <= 0xffffffffffffULL) { + /* + * i_blocks can be represented in a 48 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + } else { + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); + /* i_block is stored in file system block size */ + i_blocks = i_blocks >> (inode->i_blkbits - 9); + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + } + return 0; +} + +static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + uid_t i_uid; + gid_t i_gid; + projid_t i_projid; + int block; + int err; + + err = ext4_inode_blocks_set(raw_inode, ei); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); + i_projid = from_kprojid(&init_user_ns, ei->i_projid); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); + /* + * Fix up interoperability with old kernels. Otherwise, + * old inodes get re-used with the upper 16 bits of the + * uid/gid intact. 
+ */ + if (ei->i_dtime && list_empty(&ei->i_orphan)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(i_gid)); + } + } else { + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); + ext4_isize_set(raw_inode, ei->i_disksize); + + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else if (!ext4_has_inline_data(inode)) { + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + } + + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + u64 ivers = ext4_inode_peek_iversion(inode); + + raw_inode->i_disk_version = cpu_to_le32(ivers); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(ivers >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } + } + + if (i_projid != EXT4_DEF_PROJID && + !ext4_has_feature_project(inode->i_sb)) + err = err ?: -EFSCORRUPTED; + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + + ext4_inode_csum_set(inode, raw_inode, ei); + return err; +} + /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. 
If 'in_mem' is true, we have all @@ -4597,13 +4744,6 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) else inode_set_iversion_queried(inode, val); } -static inline u64 ext4_inode_peek_iversion(const struct inode *inode) -{ - if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return inode_peek_iversion_raw(inode); - else - return inode_peek_iversion(inode); -} struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, @@ -4919,50 +5059,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, return ERR_PTR(ret); } -static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) -{ - struct inode *inode = &(ei->vfs_inode); - u64 i_blocks = READ_ONCE(inode->i_blocks); - struct super_block *sb = inode->i_sb; - - if (i_blocks <= ~0U) { - /* - * i_blocks can be represented in a 32 bit variable - * as multiple of 512 bytes - */ - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = 0; - ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); - return 0; - } - - /* - * This should never happen since sb->s_maxbytes should not have - * allowed this, sb->s_maxbytes was set according to the huge_file - * feature in ext4_fill_super(). - */ - if (!ext4_has_feature_huge_file(sb)) - return -EFSCORRUPTED; - - if (i_blocks <= 0xffffffffffffULL) { - /* - * i_blocks can be represented in a 48 bit variable - * as multiple of 512 bytes - */ - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); - } else { - ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); - /* i_block is stored in file system block size */ - i_blocks = i_blocks >> (inode->i_blkbits - 9); - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - } - return 0; -} - static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, @@ -5027,101 +5123,6 @@ static void ext4_update_other_inodes_time(struct super_block *sb, rcu_read_unlock(); } -static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - uid_t i_uid; - gid_t i_gid; - projid_t i_projid; - int block; - int err; - - err = ext4_inode_blocks_set(raw_inode, ei); - - raw_inode->i_mode = cpu_to_le16(inode->i_mode); - i_uid = i_uid_read(inode); - i_gid = i_gid_read(inode); - i_projid = from_kprojid(&init_user_ns, ei->i_projid); - if (!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); - /* - * Fix up interoperability with old kernels. Otherwise, - * old inodes get re-used with the upper 16 bits of the - * uid/gid intact. 
- */ - if (ei->i_dtime && list_empty(&ei->i_orphan)) { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } else { - raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(i_uid)); - raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(i_gid)); - } - } else { - raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); - raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } - raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); - EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) - raw_inode->i_file_acl_high = - cpu_to_le16(ei->i_file_acl >> 32); - raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - ext4_isize_set(raw_inode, ei->i_disksize); - - raw_inode->i_generation = cpu_to_le32(inode->i_generation); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - raw_inode->i_block[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - raw_inode->i_block[1] = 0; - } else { - raw_inode->i_block[0] = 0; - raw_inode->i_block[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - raw_inode->i_block[2] = 0; - } - } else if (!ext4_has_inline_data(inode)) { - for (block = 0; block < EXT4_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - } - - if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { - u64 ivers = ext4_inode_peek_iversion(inode); - - raw_inode->i_disk_version = cpu_to_le32(ivers); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(ivers >> 32); - raw_inode->i_extra_isize = - cpu_to_le16(ei->i_extra_isize); - } - } - - if (i_projid != EXT4_DEF_PROJID && - !ext4_has_feature_project(inode->i_sb)) - err = err ?: -EFSCORRUPTED; - - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) - raw_inode->i_projid = cpu_to_le32(i_projid); - - ext4_inode_csum_set(inode, raw_inode, ei); - return err; -} - /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the -- Gitee From c5dcb0a6f485a315337721e85e8dc321adac2588 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:04:03 +0800 Subject: [PATCH 005/134] ext4: prevent getting empty inode buffer maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- In ext4_get_inode_loc(), we may skip the read I/O and get a zeroed-out yet uptodate inode buffer when the inode monopolizes an inode block, for performance reasons. In most cases, ext4_mark_iloc_dirty() will fill the inode buffer afterwards and make it fine, but we could miss this call if something bad happens in between. Then __ext4_get_inode_loc_noinmem() may get an empty inode buffer and trigger an ext4 error. For example, if we remove a nonexistent xattr on inode A, ext4_xattr_set_handle() will return ENODATA before invoking ext4_mark_iloc_dirty(), leaving an uptodate but zeroed buffer. We will then get a checksum error message in ext4_iget() when getting the inode again:
EXT4-fs error (device sda): ext4_lookup:1784: inode #131074: comm cat: iget: checksum invalid Even worse, if we allocate another inode B in the same inode block, writing back inode B will corrupt inode A on disk. So this patch initializes the inode buffer by filling in the in-memory inode contents whenever we skip the read I/O, ensuring that the buffer is really uptodate. Signed-off-by: Zhang Yi Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1eefbc464964..16bc65609496 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4452,12 +4452,12 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode /* * ext4_get_inode_loc returns with an extra refcount against the inode's - * underlying buffer_head on success. If 'in_mem' is true, we have all - * data in memory that is needed to recreate the on-disk version of this - * inode. + * underlying buffer_head on success. If we pass 'inode' and it does not + * have in-inode xattr, we have all inode data in memory that is needed + * to recreate the on-disk version of this inode. */ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, - struct ext4_iloc *iloc, int in_mem, + struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; @@ -4502,7 +4502,7 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, * is the only valid inode in the block, we need not read the * block. */ - if (in_mem) { + if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; @@ -4530,8 +4530,13 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, } brelse(bitmap_bh); if (i == start + inodes_per_block) { + struct ext4_inode *raw_inode = + (struct ext4_inode *) (bh->b_data + iloc->offset); + /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); + if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) + ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; @@ -4593,7 +4598,7 @@ static int __ext4_get_inode_loc_noinmem(struct inode *inode, ext4_fsblk_t err_blk; int ret; - ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0, + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) @@ -4608,9 +4613,8 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) ext4_fsblk_t err_blk; int ret; - /* We have all inode data except xattrs in memory here.
*/ - ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, - !ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk); + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, + &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, @@ -4623,7 +4627,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { - return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL); + return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) -- Gitee From d6aac746628fb1cfc72c0b95e39c3a87c5f8841c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 19 Oct 2021 18:04:04 +0800 Subject: [PATCH 006/134] blk-mq: fix divide by zero crash in tg_may_dispatch() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- If blk-throttle is enabled and I/O is issued before blk_throtl_register_queue() is done, a divide-by-zero crash will be triggered in tg_may_dispatch() because 'throtl_slice' is uninitialized. Thus introduce a new flag QUEUE_FLAG_THROTL_INIT_DONE. It will be set after blk_throtl_register_queue() is done, and will be checked before applying any config. Signed-off-by: Yu Kuai Reviewed-by: Hou Tao Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-sysfs.c | 7 +++++++ block/blk-throttle.c | 37 ++++++++++++++++++++++++++++++++++++- include/linux/blkdev.h | 1 + 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b513f1683af0..66765740902b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -910,6 +910,9 @@ int blk_register_queue(struct gendisk *disk) blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); + spin_lock_irq(&q->queue_lock); + blk_queue_flag_set(QUEUE_FLAG_THROTL_INIT_DONE, q); + spin_unlock_irq(&q->queue_lock); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&q->kobj, KOBJ_ADD); @@ -942,6 +945,10 @@ void blk_unregister_queue(struct gendisk *disk) if (!blk_queue_registered(q)) return; + spin_lock_irq(&q->queue_lock); + blk_queue_flag_clear(QUEUE_FLAG_THROTL_INIT_DONE, q); + spin_unlock_irq(&q->queue_lock); + /* * Since sysfs_remove_dir() prevents adding new directory entries * before removal of existing entries starts, protect against diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c53a254171a2..4087dcc13f7d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -11,6 +11,7 @@ #include #include #include +#include <linux/delay.h> #include "blk.h" #include "blk-cgroup-rwstat.h" @@ -1456,6 +1457,31 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) } } +static inline int throtl_check_init_done(struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_THROTL_INIT_DONE, &q->queue_flags)) + return 0; + + return blk_queue_dying(q) ? -ENODEV : -EBUSY; +} + +/* + * If throtl_check_init_done() return -EBUSY, we should retry after a short + * msleep(), since that throttle init will be completed in blk_register_queue() + * soon.
+ */ +static inline int throtl_restart_syscall_when_busy(int errno) +{ + int ret = errno; + + if (ret == -EBUSY) { + msleep(10); + ret = restart_syscall(); + } + + return ret; +} + static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { @@ -1469,6 +1495,10 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, if (ret) return ret; + ret = throtl_check_init_done(ctx.disk->queue); + if (ret) + goto out_finish; + ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; @@ -1486,6 +1516,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, ret = 0; out_finish: blkg_conf_finish(&ctx); + ret = throtl_restart_syscall_when_busy(ret); return ret ?: nbytes; } @@ -1661,8 +1692,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, if (ret) return ret; - tg = blkg_to_tg(ctx.blkg); + ret = throtl_check_init_done(ctx.disk->queue); + if (ret) + goto out_finish; + tg = blkg_to_tg(ctx.blkg); v[0] = tg->bps_conf[READ][index]; v[1] = tg->bps_conf[WRITE][index]; v[2] = tg->iops_conf[READ][index]; @@ -1758,6 +1792,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, ret = 0; out_finish: blkg_conf_finish(&ctx); + ret = throtl_restart_syscall_when_busy(ret); return ret ?: nbytes; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8aae375864b6..7517381b5e15 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -593,6 +593,7 @@ struct request_queue { /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ +#define QUEUE_FLAG_THROTL_INIT_DONE 2 /* io throttle can be online */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -- Gitee From 7817c5defcc35d05ea67a4eb1dbd993f7dca7923 Mon Sep 17 00:00:00 2001 From: Yutian Yang Date: Tue, 19 Oct 2021 18:04:06 +0800 Subject: [PATCH 007/134] memcg: charge fs_context and legacy_fs_context mainline inclusion from mainline-v5.15-rc1 commit bb902cb47cf93b33cd92b3b7a4019330a03ef57f issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bb902cb47cf93b33cd92b3b7a4019330a03ef57f Signed-off-by: Yu Changchun -------------------------------- This patch adds accounting flags to the fs_context and legacy_fs_context allocation sites so that the kernel can correctly charge these objects. We have written a PoC to demonstrate the effect of the missing-charging bugs. The PoC takes around 1,200MB of unaccounted memory, while it is charged for only 362MB of memory usage. We evaluate the PoC on QEMU x86_64 v5.2.90 + Linux kernel v5.10.19 + Debian buster. All the limits, including ulimits and sysctl variables, are left at their defaults. Specifically, the hard NOFILE limit and nr_open in sysctl are both 1,048,576.
/*------------------------- POC code ----------------------------*/ #define _GNU_SOURCE /* the PoC's original include/define block was lost in extraction; a standard reconstruction follows, stack size assumed */ #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <signal.h> #include <sched.h> #include <sys/syscall.h> #include <linux/mount.h> #define STACK_SIZE (1024 * 1024) #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \ } while (0) static inline int fsopen(const char *fs_name, unsigned int flags) { return syscall(__NR_fsopen, fs_name, flags); } static char thread_stack[512][STACK_SIZE]; int thread_fn(void* arg) { for (int i = 0; i < 800000; ++i) { int fsfd = fsopen("nfs", FSOPEN_CLOEXEC); if (fsfd == -1) { errExit("fsopen"); } } while(1); return 0; } int main(int argc, char *argv[]) { int thread_pid; for (int i = 0; i < 1; ++i) { thread_pid = clone(thread_fn, thread_stack[i] + STACK_SIZE, \ SIGCHLD, NULL); } while(1); return 0; } /*-------------------------- end --------------------------------*/ Link: https://lkml.kernel.org/r/1626517201-24086-1-git-send-email-nglaive@gmail.com Signed-off-by: Yutian Yang Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/fs_context.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index 2834d1afa6e8..4858645ca620 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -231,7 +231,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct fs_context *fc; int ret = -ENOMEM; - fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); @@ -631,7 +631,7 @@ const struct fs_context_operations legacy_fs_context_ops = { */ static int legacy_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); if (!fc->fs_private) return -ENOMEM; fc->ops = &legacy_fs_context_ops; -- Gitee From 0e98823f9a01bca481b6900ff921da071931a159 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:07 +0800 Subject: [PATCH 008/134] memcg: enable accounting for mnt_cache entries mainline inclusion from mainline-v5.15-rc1 commit 79f6540ba88dfb383ecf057a3425e668105ca774 issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=79f6540ba88dfb383ecf057a3425e668105ca774 Signed-off-by: Yu Changchun -------------------------------- Patch series "memcg accounting from OpenVZ", v7. OpenVZ has used memory accounting for 20+ years, since the v2.2.x linux kernels. Initially we used our own accounting subsystem, then partially committed it to upstream, and a few years ago switched to cgroups v1. Now we're rebasing again, revising our old patches and trying to push them upstream. We try to protect the host system from any misuse of kernel memory allocation triggered by untrusted users inside the containers.
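For illustration, the whole series boils down to two idioms, shown together in the minimal module-style sketch below (an editorial example with hypothetical "demo" names, not part of the patch-set): pass SLAB_ACCOUNT when creating a slab cache, and use GFP_KERNEL_ACCOUNT instead of GFP_KERNEL for direct allocations; either way the memory is charged to the allocating task's memory cgroup.

#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cachep;

static int __init demo_init(void)
{
	void *obj;

	/* objects from this cache are billed to the caller's memcg */
	demo_cachep = kmem_cache_create("demo_cache", 64, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
	if (!demo_cachep)
		return -ENOMEM;

	/* one-off allocations use GFP_KERNEL_ACCOUNT instead */
	obj = kzalloc(128, GFP_KERNEL_ACCOUNT);
	if (!obj) {
		kmem_cache_destroy(demo_cachep);
		return -ENOMEM;
	}
	kfree(obj);
	return 0;
}

static void __exit demo_exit(void)
{
	kmem_cache_destroy(demo_cachep);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");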
Patch-set is addressed mostly to cgroups maintainers and the cgroups@ mailing list, though I would be very grateful for any comments from maintainers of affected subsystems or other people added in cc:

Compared to the upstream, we additionally account for the following kernel objects:
- network devices and their Tx/Rx queues
- ipv4/v6 addresses and routing-related objects
- inet_bind_bucket cache objects
- VLAN group arrays
- ipv6/sit: ip_tunnel_prl
- scm_fp_list objects used by SCM_RIGHTS messages of Unix sockets
- nsproxy and namespace objects themselves
- IPC objects: semaphores, message queues and shared memory segments
- mounts
- pollfd and select bits arrays
- signals and posix timers
- file locks
- fasync_struct used by the file lease code and driver's fasync queues
- tty objects
- per-mm LDT

We have incorrect/incomplete/obsolete accounting for a few other kernel objects: sk_filter, af_packets, netlink and xt_counters for iptables. They require rework and will probably be dropped altogether. We are also going to add accounting for nft; however, it is not ready yet. We have not tested performance on upstream; however, our performance team compared our current RHEL7-based production kernel against the corresponding original RHEL7 kernel and reports that it performs at least no worse. This patch (of 10): The kernel allocates ~400 bytes of 'struct mount' for any new mount. Creating a new mount namespace clones most of the parent mounts, and this can be repeated many times. Additionally, each mount allocates up to PATH_MAX=4096 bytes for mnt->mnt_devname. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/045db11f-4a45-7c9b-2664-5b32c2b44943@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Acked-by: Christian Brauner Cc: Tejun Heo Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Yutian Yang Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J.
Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Serge Hallyn Cc: Thomas Gleixner Cc: Zefan Li Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/namespace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 046b084136c5..7f1f89db511f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -183,7 +183,8 @@ static struct mount *alloc_vfsmnt(const char *name) goto out_free_cache; if (name) { - mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup_const(name, + GFP_KERNEL_ACCOUNT); if (!mnt->mnt_devname) goto out_free_id; } @@ -3840,7 +3841,7 @@ void __init mnt_init(void) int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), -- Gitee From 047b4e2ffcfcbf652ef2f5ccb31a998b4b436d08 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:08 +0800 Subject: [PATCH 009/134] memcg: enable accounting for fasync_cache mainline inclusion from mainline-v5.15-rc1 commit 839d68206de869b8cb4272c5ea10da2ef7ec34cb issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=839d68206de869b8cb4272c5ea10da2ef7ec34cb Signed-off-by: Yu Changchun -------------------------------- fasync_struct is used by almost all character device drivers to set up the fasync queue, and for regular files by the file lease code. This structure is quite small but long-living and it can be assigned for any open file. It makes sense to account for its allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/1b408625-d71c-0b26-b0b6-9baf00f93e69@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. 
Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/fcntl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 71b43538fa44..3eebcbead8bf 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1042,7 +1042,8 @@ static int __init fcntl_init(void) __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", - sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + sizeof(struct fasync_struct), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } -- Gitee From b5ff04ca7493cdf85e18c13bc4a429ab6e5d88b2 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:09 +0800 Subject: [PATCH 010/134] memcg: enable accounting for new namesapces and struct nsproxy mainline inclusion from mainline-v5.15-rc1 commit 30acd0bdfb86548172168a0cc71d455944de0683 issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=30acd0bdfb86548172168a0cc71d455944de0683 Signed-off-by: Yu Changchun -------------------------------- Container admin can create new namespaces and force kernel to allocate up to several pages of memory for the namespaces and its associated structures. Net and uts namespaces have enabled accounting for such allocations. It makes sense to account for rest ones to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/5525bcbf-533e-da27-79b7-158686c64e13@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Serge Hallyn Acked-by: Christian Brauner Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. 
Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: kernel/time/namespace.c Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/namespace.c | 2 +- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 4 ++-- kernel/user_namespace.c | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 7f1f89db511f..6e76f2a72cfc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3283,7 +3283,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); diff --git a/ipc/namespace.c b/ipc/namespace.c index 24e7b45320f7..c94c05846141 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -42,7 +42,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 812a61afd538..12c5110466bc 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) struct cgroup_namespace *new_ns; int ret; - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 12dd41b39a7f..3d5a5faf91b5 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -573,6 +573,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) int __init nsproxy_cache_init(void) { - nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); + nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT); return 0; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ef8733e2a476..52c017feabcb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -457,7 +457,7 @@ const struct proc_ns_operations pidns_for_children_operations = { static __init int pid_namespaces_init(void) { - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index afc65e6be33e..00c20f7fdc02 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL); + ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; kref_init(&ns->kref); - ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; diff --git 
a/kernel/user_namespace.c b/kernel/user_namespace.c index ce396ea4de60..2c15bf6680c3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1378,7 +1378,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { - user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); -- Gitee From 4361a3fb6e8229294b1fd2febb195cecb4afff07 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:10 +0800 Subject: [PATCH 011/134] memcg: enable accounting for signals mainline inclusion from mainline-v5.15-rc1 commit 5f58c39819ff78ca5ddbba2b3cd8ff4779b19bb5 issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5f58c39819ff78ca5ddbba2b3cd8ff4779b19bb5 Signed-off-by: Yu Changchun -------------------------------- When a user sends a signal to another process, it forces the kernel to allocate memory for 'struct sigqueue' objects. The number of signals is limited by the RLIMIT_SIGPENDING resource limit, but even the default settings allow each user to consume up to several megabytes of memory. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/e34e958c-e785-712e-a62a-2c7b66c646c7@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index ef8f2a28d37c..b125bc896138 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4595,7 +4595,7 @@ void __init signals_init(void) { siginfo_buildtime_checks(); - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB -- Gitee From 2e9dc96e1251320298b4e9d85b17e87268d6b492 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:11 +0800 Subject: [PATCH 012/134] memcg: enable accounting for posix_timers_cache slab mainline inclusion from mainline-v5.15-rc1 commit c509723ec27e925bb91a20682c448e95d4bc8c9f issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c509723ec27e925bb91a20682c448e95d4bc8c9f Signed-off-by: Yu Changchun -------------------------------- A program may create multiple interval timers using timer_create(). For each timer the kernel preallocates a "queued real-time signal". Consequently, the number of timers is limited by the RLIMIT_SIGPENDING resource limit. The allocated object is quite small, ~250 bytes, but even the default signal limits allow consuming up to 100 megabytes per user.
It makes sense to account for them to limit the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/57795560-025c-267c-6b1a-dea852d95530@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Thomas Gleixner Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/time/posix-timers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b..7363f81dc31a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); -- Gitee From 8dc576ac3d68be1a90ab9b633fc3e3a5f6844096 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:12 +0800 Subject: [PATCH 013/134] memcg: enable accounting for ldt_struct objects mainline inclusion from mainline-v5.15-rc1 commit ec403e2ae0dfc85996aad6e944a98a16e6dfcc6d issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ec403e2ae0dfc85996aad6e944a98a16e6dfcc6d Signed-off-by: Yu Changchun -------------------------------- Each task can request its own LDT and force the kernel to allocate up to 64Kb of memory per mm. There are legitimate workloads with hundreds of processes and there can be hundreds of workloads running on large machines. The unaccounted memory can cause isolation issues between the workloads, particularly on highly utilized machines. It makes sense to account for these objects to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/38010594-50fe-c06d-7cb0-d1f77ca422f3@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Borislav Petkov Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J.
Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- arch/x86/kernel/ldt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b8aee71840ae..7694c541e3d8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) if (num_entries > LDT_ENTRIES) return NULL; - new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; @@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) * than PAGE_SIZE. */ if (alloc_size > PAGE_SIZE) - new_ldt->entries = vzalloc(alloc_size); + new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else - new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); -- Gitee From 7b79af940842ef215779b0d8011784af73ff8729 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 19 Oct 2021 18:04:15 +0800 Subject: [PATCH 014/134] kyber: introduce kyber_depth_updated() mainline inclusion from mainline-5.12-rc1 commit ffa772cfe9356ce94d3061335c2681f60e7c1c5b category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ffa772cfe9356ce94d3061335c2681f60e7c1c5b Signed-off-by: Yu Changchun ------------------------------------------------- Hang occurs when user changes the scheduler queue depth, by writing to the 'nr_requests' sysfs file of that device. The details of the environment that we found the problem are as follows: an eMMC block device total driver tags: 16 default queue_depth: 32 kqd->async_depth initialized in kyber_init_sched() with queue_depth=32 Then we change queue_depth to 256, by writing to the 'nr_requests' sysfs file. But kqd->async_depth don't be updated after queue_depth changes. Now the value of async depth is too small for queue_depth=256, this may cause hang. This patch introduces kyber_depth_updated(), so that kyber can update async depth when queue depth changes. Signed-off-by: Yang Yang Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe Signed-off-by: Yu Kuai Reviewed-by: Hou Tao Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/kyber-iosched.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 7f9ef773bf44..d2648356e430 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -353,19 +353,9 @@ static void kyber_timer_fn(struct timer_list *t) } } -static unsigned int kyber_sched_tags_shift(struct request_queue *q) -{ - /* - * All of the hardware queues have the same depth, so we can just grab - * the shift of the first one. 
- */ - return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift; -} - static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) { struct kyber_queue_data *kqd; - unsigned int shift; int ret = -ENOMEM; int i; @@ -400,9 +390,6 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) kqd->latency_targets[i] = kyber_latency_targets[i]; } - shift = kyber_sched_tags_shift(q); - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; - return kqd; err_buckets: @@ -458,9 +445,19 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) INIT_LIST_HEAD(&kcq->rq_list[i]); } -static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags->sb.shift; + + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); +} + +static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ struct kyber_hctx_data *khd; int i; @@ -502,8 +499,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; - sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags, - kqd->async_depth); + kyber_depth_updated(hctx); return 0; @@ -1023,6 +1019,7 @@ static struct elevator_type kyber_sched = { .completed_request = kyber_completed_request, .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, + .depth_updated = kyber_depth_updated, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = kyber_queue_debugfs_attrs, -- Gitee From 18bf7fed5aff9bcf8aa461261dba0567862d5aec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Oct 2021 18:04:16 +0800 Subject: [PATCH 015/134] switch file_open_root() to struct path mainline inclusion from mainline-5.14-rc1 commit ffb37ca3bd16ce6ea2df2f87fde9a31e94ebb54b category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ffb37ca3bd16ce6ea2df2f87fde9a31e94ebb54b Signed-off-by: Yu Changchun --------------------------- ... and provide file_open_root_mnt(), using the root of given mount. Signed-off-by: Al Viro Conflicts: Documentation/filesystems/porting.rst [ Non-bugfix 14e43bf4356126("vfs: don't unnecessarily clone write access for writable fd") is not applied. ] Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Yu Changchun --- Documentation/filesystems/porting.rst | 9 +++++++++ arch/um/drivers/mconsole_kern.c | 2 +- fs/coredump.c | 4 ++-- fs/fhandle.c | 2 +- fs/internal.h | 2 +- fs/kernel_read_file.c | 2 +- fs/namei.c | 8 +++----- fs/open.c | 4 ++-- fs/proc/proc_sysctl.c | 2 +- include/linux/fs.h | 9 ++++++++- kernel/usermode_driver.c | 2 +- 11 files changed, 30 insertions(+), 16 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 867036aa90b8..b00fb6313b58 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -865,3 +865,12 @@ no matter what. Everything is handled by the caller. clone_private_mount() returns a longterm mount now, so the proper destructor of its result is kern_unmount() or kern_unmount_array(). 
+ +--- + +**mandatory** + +Calling conventions for file_open_root() changed; now it takes struct path * +instead of passing mount and dentry separately. For callers that used to +pass a <mnt->mnt_root, mnt> pair (i.e. the root of given mount), a new helper +is provided - file_open_root_mnt(). In-tree users adjusted. diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index a2e680f7d39f..6a22ead31c5b 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -140,7 +140,7 @@ void mconsole_proc(struct mc_request *req) mconsole_reply(req, "Proc not available", 1, 0); goto out; } - file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0); + file = file_open_root_mnt(mnt, ptr, O_RDONLY, 0); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file)); diff --git a/fs/coredump.c b/fs/coredump.c index c56a3bdce7cd..335c98787e66 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -755,8 +755,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); - cprm.file = file_open_root(root.dentry, root.mnt, - cn.corename, open_flags, 0600); + cprm.file = file_open_root(&root, cn.corename, + open_flags, 0600); path_put(&root); } else { cprm.file = filp_open(cn.corename, open_flags, 0600); diff --git a/fs/fhandle.c b/fs/fhandle.c index 01263ffbc4c0..718defdf1e0e 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -229,7 +229,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); + file = file_open_root(&path, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/internal.h b/fs/internal.h index 5155f6ce95c7..0d4f7e4e2f3a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -128,7 +128,7 @@ struct open_flags { }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); -extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, +extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index c84d87f558cb..1b07550485b9 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -160,7 +160,7 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset, get_fs_root(init_task.fs, &root); task_unlock(&init_task); - file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); + file = file_open_root(&root, path, O_RDONLY, 0); path_put(&root); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/fs/namei.c b/fs/namei.c index 4c9d0c36545d..130aa5694f48 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3393,7 +3393,7 @@ struct file *do_filp_open(int dfd, struct filename *pathname, return filp; } -struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; @@ -3401,16 +3401,14 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, struct filename *filename; int flags = op->lookup_flags | LOOKUP_ROOT; - nd.root.mnt = mnt; - nd.root.dentry = dentry; - - if (d_is_symlink(dentry) && op->intent &
LOOKUP_OPEN) + if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); + nd.root = *root; set_nameidata(&nd, -1, filename); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) diff --git a/fs/open.c b/fs/open.c index 3aaaad47d9ca..7ce64c08bb40 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1149,7 +1149,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) } EXPORT_SYMBOL(filp_open); -struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; @@ -1157,7 +1157,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); - return do_file_open_root(dentry, mnt, filename, &op); + return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 070d2df8ab9c..ffed75f833b7 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1803,7 +1803,7 @@ static int process_sysctl_arg(char *param, char *val, panic("%s: Failed to allocate path for %s\n", __func__, param); strreplace(path, '.', '/'); - file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); if (IS_ERR(file)) { err = PTR_ERR(file); if (err == -ENOENT) diff --git a/include/linux/fs.h b/include/linux/fs.h index c4e366a3226e..3de2e6c714aa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2527,8 +2528,14 @@ extern long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); -extern struct file *file_open_root(struct dentry *, struct vfsmount *, +extern struct file *file_open_root(const struct path *, const char *, int, umode_t); +static inline struct file *file_open_root_mnt(struct vfsmount *mnt, + const char *name, int flags, umode_t mode) +{ + return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, + name, flags, mode); +} extern struct file * dentry_open(const struct path *, int, const struct cred *); extern struct file * open_with_fake_path(const struct path *, int, struct inode*, const struct cred *); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index bb7bb3b478ab..9dae1f648713 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -26,7 +26,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na if (IS_ERR(mnt)) return mnt; - file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700); + file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); if (IS_ERR(file)) { mntput(mnt); return ERR_CAST(file); -- Gitee From 1c3315ab81dc4ae9599ed00ee3a574c1fccb35f1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Oct 2021 18:04:17 +0800 Subject: [PATCH 016/134] take LOOKUP_{ROOT,ROOT_GRABBED,JUMPED} out of LOOKUP_... 
space mainline inclusion from mainline-5.14-rc1 commit bcba1e7d0d520adba895d9e0800a056f734b0a6a category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bcba1e7d0d520adba895d9e0800a056f734b0a6a Signed-off-by: Yu Changchun --------------------------- Separate field in nameidata (nd->state) holding the flags that should be internal-only - that way we both get some spare bits in LOOKUP_... and get simpler rules for nd->root lifetime, since we can set the replacement of LOOKUP_ROOT (ND_ROOT_PRESET) at the same time we set nd->root. Signed-off-by: Al Viro Conflicts: fs/namei.c [ Bugfix 7d01ef7585c0("Make sure nd->path.mnt and nd->path.dentry are always valid pointers") is not applied, the problem it fixes does not exist here. Feature 6c6ec2b0a3e0("fs: add support for LOOKUP_CACHED") is not applied. ] Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- Documentation/filesystems/path-lookup.rst | 6 +-- fs/namei.c | 54 +++++++++++++---------- fs/nfs/nfstrace.h | 4 -- include/linux/namei.h | 3 -- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/Documentation/filesystems/path-lookup.rst b/Documentation/filesystems/path-lookup.rst index c482e1619e77..ede67f705787 100644 --- a/Documentation/filesystems/path-lookup.rst +++ b/Documentation/filesystems/path-lookup.rst @@ -1321,18 +1321,18 @@ to lookup: RCU-walk, REF-walk, and REF-walk with forced revalidation. yet. This is primarily used to tell the audit subsystem the full context of a particular access being audited. -``LOOKUP_ROOT`` indicates that the ``root`` field in the ``nameidata`` was +``ND_ROOT_PRESET`` indicates that the ``root`` field in the ``nameidata`` was provided by the caller, so it shouldn't be released when it is no longer needed. -``LOOKUP_JUMPED`` means that the current dentry was chosen not because +``ND_JUMPED`` means that the current dentry was chosen not because it had the right name but for some other reason. This happens when following "``..``", following a symlink to ``/``, crossing a mount point or accessing a "``/proc/$PID/fd/$FD``" symlink (also known as a "magic link"). In this case the filesystem has not been asked to revalidate the name (with ``d_revalidate()``). In such cases the inode may still need to be revalidated, so ``d_op->d_weak_revalidate()`` is called if -``LOOKUP_JUMPED`` is set when the look completes - which may be at the +``ND_JUMPED`` is set when the look completes - which may be at the final component or, when creating, unlinking, or renaming, at the penultimate component. Resolution-restriction flags diff --git a/fs/namei.c b/fs/namei.c index 130aa5694f48..c94a814e86b2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -504,7 +504,7 @@ struct nameidata { struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ - unsigned int flags; + unsigned int flags, state; unsigned seq, m_seq, r_seq; int last_type; unsigned depth; @@ -523,6 +523,10 @@ struct nameidata { umode_t dir_mode; } __randomize_layout; +#define ND_ROOT_PRESET 1 +#define ND_ROOT_GRABBED 2 +#define ND_JUMPED 4 + static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; @@ -531,6 +535,7 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->name = name; p->total_link_count = old ?
old->total_link_count : 0; p->saved = old; + p->state = 0; current->nameidata = p; } @@ -593,9 +598,9 @@ static void terminate_walk(struct nameidata *nd) path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); - if (nd->flags & LOOKUP_ROOT_GRABBED) { + if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); - nd->flags &= ~LOOKUP_ROOT_GRABBED; + nd->state &= ~ND_ROOT_GRABBED; } } else { nd->flags &= ~LOOKUP_RCU; @@ -651,9 +656,9 @@ static bool legitimize_root(struct nameidata *nd) if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) return false; /* Nothing to do if nd->root is zero or is managed by the VFS user. */ - if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT)) + if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } @@ -790,8 +795,9 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ - if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) - nd->root.mnt = NULL; + if (!(nd->state & ND_ROOT_PRESET)) + if (!(nd->flags & LOOKUP_IS_SCOPED)) + nd->root.mnt = NULL; if (!try_to_unlazy(nd)) return -ECHILD; } @@ -817,7 +823,7 @@ static int complete_walk(struct nameidata *nd) return -EXDEV; } - if (likely(!(nd->flags & LOOKUP_JUMPED))) + if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) @@ -855,7 +861,7 @@ static int set_root(struct nameidata *nd) } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } return 0; } @@ -888,7 +894,7 @@ static int nd_jump_root(struct nameidata *nd) path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; } @@ -916,7 +922,7 @@ int nd_jump_link(struct path *path) path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; err: @@ -1338,7 +1344,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; *seqp = read_seqcount_begin(&dentry->d_seq); *inode = dentry->d_inode; /* @@ -1383,7 +1389,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); @@ -2129,7 +2135,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) case 2: if (name[1] == '.') { type = LAST_DOTDOT; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } break; case 1: @@ -2137,7 +2143,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) } if (likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; - nd->flags &= ~LOOKUP_JUMPED; + nd->state &= ~ND_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); @@ -2207,14 +2213,15 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_RCU) rcu_read_lock(); - nd->flags = flags | LOOKUP_JUMPED; + nd->flags = flags; + nd->state |= ND_JUMPED; nd->depth = 0; nd->m_seq = 
__read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (flags & LOOKUP_ROOT) { + if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2291,7 +2298,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root_seq = nd->seq; } else { path_get(&nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } } return s; @@ -2330,7 +2337,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); - nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); @@ -2354,11 +2361,11 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); + set_nameidata(&nd, dfd, name); if (unlikely(root)) { nd.root = *root; - flags |= LOOKUP_ROOT; + nd.state = ND_ROOT_PRESET; } - set_nameidata(&nd, dfd, name); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); @@ -3399,7 +3406,7 @@ struct file *do_file_open_root(const struct path *root, struct nameidata nd; struct file *file; struct filename *filename; - int flags = op->lookup_flags | LOOKUP_ROOT; + int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); @@ -3408,8 +3415,9 @@ struct file *do_file_open_root(const struct path *root, if (IS_ERR(filename)) return ERR_CAST(filename); - nd.root = *root; set_nameidata(&nd, -1, filename); + nd.root = *root; + nd.state = ND_ROOT_PRESET; file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 5a59dcdce0b2..cb7f49723bbf 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -271,8 +271,6 @@ TRACE_DEFINE_ENUM(LOOKUP_OPEN); TRACE_DEFINE_ENUM(LOOKUP_CREATE); TRACE_DEFINE_ENUM(LOOKUP_EXCL); TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET); -TRACE_DEFINE_ENUM(LOOKUP_JUMPED); -TRACE_DEFINE_ENUM(LOOKUP_ROOT); TRACE_DEFINE_ENUM(LOOKUP_EMPTY); TRACE_DEFINE_ENUM(LOOKUP_DOWN); @@ -288,8 +286,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN); { LOOKUP_CREATE, "CREATE" }, \ { LOOKUP_EXCL, "EXCL" }, \ { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \ - { LOOKUP_JUMPED, "JUMPED" }, \ - { LOOKUP_ROOT, "ROOT" }, \ { LOOKUP_EMPTY, "EMPTY" }, \ { LOOKUP_DOWN, "DOWN" }) diff --git a/include/linux/namei.h b/include/linux/namei.h index a4bb992623c4..ca94eb5d2b16 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -36,9 +36,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* internal use only */ #define LOOKUP_PARENT 0x0010 -#define LOOKUP_JUMPED 0x1000 -#define LOOKUP_ROOT 0x2000 -#define LOOKUP_ROOT_GRABBED 0x0008 /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. 
*/ -- Gitee From baf3f0abd6b12d6044b4e8319a55bcae543614b5 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 19 Oct 2021 18:04:18 +0800 Subject: [PATCH 017/134] ext4: fix potential uninitialized access to retval in kmmpd mainline inclusion from mainline-5.14-rc5 commit b66541422824cf6cf20e9a35112e9cb5d82cdf62 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b66541422824cf6cf20e9a35112e9cb5d82cdf62 Signed-off-by: Yu Changchun ------------------------------------------------- if (!ext4_has_feature_mmp(sb)) then retval can be uninitialized before we jump to the wait_to_exit label. Fixes: 61bb4a1c417e ("ext4: fix possible UAF when remounting r/o a mmp-protected file system") Signed-off-by: Ye Bin Link: https://lore.kernel.org/r/20210713022728.2533770-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index bc364c119af6..cebea4270817 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -138,7 +138,7 @@ static int kmmpd(void *data) unsigned mmp_check_interval; unsigned long last_update_time; unsigned long diff; - int retval; + int retval = 0; mmp_block = le64_to_cpu(es->s_mmp_block); mmp = (struct mmp_struct *)(bh->b_data); -- Gitee From fd1e38dc97d4ed4a9682775d5848d540408bafb6 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 21 Oct 2021 21:05:50 +0800 Subject: [PATCH 018/134] blk-mq-sched: Fix blk_mq_sched_alloc_tags() error handling mainline inclusion from mainline-v5.14-rc1 commit b93af3055d6f32d3b0361cfdb110c9399c1241ba category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b93af3055d6f32d3b0361cfdb110c9399c1241ba Signed-off-by: Yu Changchun --------------------------- If the blk_mq_sched_alloc_tags() -> blk_mq_alloc_rqs() call fails, then we call blk_mq_sched_free_tags() -> blk_mq_free_rqs(). It is incorrect to do so, as any rqs would have already been freed in the blk_mq_alloc_rqs() call. Fix by calling blk_mq_free_rq_map() directly instead.
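For clarity, the two cleanup paths compare like this (a sketch of the call chains, relying on the behaviour described above, i.e. that blk_mq_alloc_rqs() has already freed any rqs it allocated by the time it returns an error):

    blk_mq_sched_alloc_tags()
        blk_mq_alloc_rqs()           /* fails; frees its own rqs */
        blk_mq_sched_free_tags()     /* old error path */
            blk_mq_free_rqs()        /* double free of the same rqs */
            blk_mq_free_rq_map()     /* the only cleanup still needed */

Hence the error path below drops blk_mq_sched_free_tags() entirely, calls blk_mq_free_rq_map() on its own and clears hctx->sched_tags.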
Fixes: 6917ff0b5bd41 ("blk-mq-sched: refactor scheduler initialization") Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1627378373-148090-1-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-mq-sched.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 24c08963890e..3fea30f0ea27 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -506,19 +506,6 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_put(&q->q_usage_counter); } -static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - - if (hctx->sched_tags) { - blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - blk_mq_free_rq_map(hctx->sched_tags, flags); - hctx->sched_tags = NULL; - } -} - static int blk_mq_sched_alloc_tags(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) @@ -534,8 +521,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, return -ENOMEM; ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); - if (ret) - blk_mq_sched_free_tags(set, hctx, hctx_idx); + if (ret) { + blk_mq_free_rq_map(hctx->sched_tags, set->flags); + hctx->sched_tags = NULL; + } return ret; } -- Gitee From f856741e4ea1481df28ad464481ed83fe2965fa1 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Tue, 28 Sep 2021 11:51:49 +0800 Subject: [PATCH 019/134] mm: fix page_owner initializing issue for arm32 mainline inclusion from mainline-v5.11-rc1 commit 7fb7ab6d618a4dc7ea3f3eafc92388a35b4f8894 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ------------------------------------------------- Page owner information for the pages used by page owner itself is missing on arm32 targets. The reason is that dummy_handle and failure_handle are not initialized correctly. The buddy allocator is used to initialize these two handles. However, the buddy allocator is not ready when page owner calls it. This change fixes that by initializing page owner after buddy initialization. The working flows before and after this change are: original logic: 1. allocate memory for page_ext (using memblock). 2. invoke the init callbacks of page_ext_ops, like page_owner (using the buddy allocator). 3. initialize buddy. after this change: 1. allocate memory for page_ext (using memblock). 2. initialize buddy. 3. invoke the init callbacks of page_ext_ops, like page_owner (using the buddy allocator).
With the change, failure_handle/dummy_handle get their correct values, and the page owner output, for example, now has the entry for page owner itself: Page allocated via order 2, mask 0x6202c0(GFP_USER|__GFP_NOWARN), pid 1006, ts 67278156558 ns PFN 543776 type Unmovable Block 531 type Unmovable Flags 0x0() init_page_owner+0x28/0x2f8 invoke_init_callbacks_flatmem+0x24/0x34 start_kernel+0x33c/0x5d8 Link: https://lkml.kernel.org/r/1603104925-5888-1-git-send-email-zhenhuah@codeaurora.org Signed-off-by: Zhenhua Huang Acked-by: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 7fb7ab6d618a4dc7ea3f3eafc92388a35b4f8894) Signed-off-by: Kefeng Wang Signed-off-by: Yuanzheng Song Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/page_ext.h | 8 ++++++++ init/main.c | 2 ++ mm/page_ext.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index cfce186f0c4e..aff81ba31bd8 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -44,8 +44,12 @@ static inline void page_ext_init_flatmem(void) { } extern void page_ext_init(void); +static inline void page_ext_init_flatmem_late(void) +{ +} #else extern void page_ext_init_flatmem(void); +extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } @@ -76,6 +80,10 @@ static inline void page_ext_init(void) { } +static inline void page_ext_init_flatmem_late(void) +{ +} + static inline void page_ext_init_flatmem(void) { } diff --git a/init/main.c b/init/main.c index dd26a42e80a8..63cdf49f7f82 100644 --- a/init/main.c +++ b/init/main.c @@ -828,6 +828,8 @@ static void __init mm_init(void) init_debug_pagealloc(); report_meminit(); mem_init(); + /* page_owner must be initialized after buddy is ready */ + page_ext_init_flatmem_late(); kmem_cache_init(); kmemleak_init(); pgtable_init(); diff --git a/mm/page_ext.c b/mm/page_ext.c index a3616f7a0e9e..16b161f28a31 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -99,12 +99,19 @@ static void __init invoke_init_callbacks(void) } } +#ifndef CONFIG_SPARSEMEM +void __init page_ext_init_flatmem_late(void) +{ + invoke_init_callbacks(); +} +#endif + static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } -#if !defined(CONFIG_SPARSEMEM) +#ifndef CONFIG_SPARSEMEM void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) @@ -177,7 +184,6 @@ void __init page_ext_init_flatmem(void) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); - invoke_init_callbacks(); return; fail: -- Gitee From b7e60f215e0d7160e5803db914874427a3e80d89 Mon Sep 17 00:00:00 2001 From: Liam Mark Date: Tue, 6 Jul 2021 11:33:56 +0000 Subject: [PATCH 020/134] mm/page_owner: record timestamp and pid mainline inclusion from mainline-5.11-rc1 commit 9cc7e96aa846f9086431d6c2d33ff9ab42d72b2d category: feature issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9cc7e96aa846f9086431d6c2d33ff9ab42d72b2d Signed-off-by: Yu Changchun ------------------------------------------------- Collect the time for each allocation recorded in page owner so that allocation "surges" can be measured. Record the pid for each allocation recorded in page owner so that the source of allocation "surges" can be better identified. The above is very useful when doing memory analysis.
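The two new records land in struct page_owner; a sketch of the resulting layout (earlier fields elided, see the hunk below):

    struct page_owner {
            /* ... existing fields (order, gfp_mask, stack handles) ... */
            u64 ts_nsec;    /* allocation timestamp, taken via local_clock() */
            pid_t pid;      /* pid of the allocating task (current->pid) */
    };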
On a crash for example, we can get this information from kdump (or ramdump) and parse it to figure out memory allocation problems. Please note that on x86_64 this increases the size of struct page_owner from 16 bytes to 32. Vlastimil: it's not a functionality intended for production, so unless somebody says they need to enable page_owner for debugging and this increase prevents them from fitting into available memory, let's not complicate things with making this optional. [lmark@codeaurora.org: v3] Link: https://lkml.kernel.org/r/20201210160357.27779-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20201209125153.10533-1-georgi.djakov@linaro.org Signed-off-by: Liam Mark Signed-off-by: Georgi Djakov Acked-by: Vlastimil Babka Acked-by: Joonsoo Kim Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 9cc7e96aa846f9086431d6c2d33ff9ab42d72b2d) Signed-off-by: Yongqiang Liu Reviewed-by: tong tiangen Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- Documentation/vm/page_owner.rst | 12 ++++++------ mm/page_owner.c | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 02deac76673f..4e67c2e9bbed 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -41,17 +41,17 @@ size change due to this facility. - Without page owner:: text data bss dec hex filename - 40662 1493 644 42799 a72f mm/page_alloc.o + 48392 2333 644 51369 c8a9 mm/page_alloc.o - With page owner:: text data bss dec hex filename - 40892 1493 644 43029 a815 mm/page_alloc.o - 1427 24 8 1459 5b3 mm/page_ext.o - 2722 50 0 2772 ad4 mm/page_owner.o + 48800 2445 644 51889 cab1 mm/page_alloc.o + 6574 108 29 6711 1a37 mm/page_owner.o + 1025 8 8 1041 411 mm/page_ext.o -Although, roughly, 4 KB code is added in total, page_alloc.o increase by -230 bytes and only half of it is in hotpath. Building the kernel with +Although, roughly, 8 KB code is added in total, page_alloc.o increase by +520 bytes and less than half of it is in hotpath. Building the kernel with page owner and turning it on if needed would be great option to debug kernel memory problem. 
diff --git a/mm/page_owner.c b/mm/page_owner.c index b735a8eafcdb..af464bb7fbe7 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" @@ -25,6 +26,8 @@ struct page_owner { gfp_t gfp_mask; depot_stack_handle_t handle; depot_stack_handle_t free_handle; + u64 ts_nsec; + pid_t pid; }; static bool page_owner_enabled = false; @@ -172,6 +175,8 @@ static inline void __set_page_owner_handle(struct page *page, page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; + page_owner->pid = current->pid; + page_owner->ts_nsec = local_clock(); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -236,6 +241,8 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->last_migrate_reason = old_page_owner->last_migrate_reason; new_page_owner->handle = old_page_owner->handle; + new_page_owner->pid = old_page_owner->pid; + new_page_owner->ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -349,9 +356,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg)\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n", page_owner->order, page_owner->gfp_mask, - &page_owner->gfp_mask); + &page_owner->gfp_mask, page_owner->pid, + page_owner->ts_nsec); if (ret >= count) goto err; @@ -427,8 +435,9 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n", + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, + page_owner->pid, page_owner->ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { -- Gitee From 0f69ec3839b52fb1fc968fc7186aab8b74da07ef Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Tue, 6 Jul 2021 11:33:57 +0000 Subject: [PATCH 021/134] mm/page_owner: record the timestamp of all pages during free mainline inclusion from mainline-5.13-rc1 commit 866b485262173a2b873386162b2ddcfbcb542b4a category: feature issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=866b485262173a2b873386162b2ddcfbcb542b4a Signed-off-by: Yu Changchun ------------------------------------------------- Collect the time when each allocation is freed, to help with memory analysis with kdump/ramdump. Also add the timestamp to the page_owner debugfs file and print it in dump_page(). Having another timestamp when we free the page helps for debugging page migration issues. For example, both alloc and free timestamps being the same can give hints that there is an issue with migrating memory, as opposed to a page just being dropped during migration.
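With both timestamps recorded, a page_owner report line gains a free_ts field; the format follows the snprintf in the diff below, and the values here are illustrative:

    Page allocated via order 0, mask 0x100dca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), pid 37, ts 8173444098740 ns, free_ts 8173444098740 ns

An alloc timestamp equal to the free timestamp of a supposedly migrated page is exactly the kind of hint mentioned above.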
Link: https://lkml.kernel.org/r/20210203175905.12267-1-georgi.djakov@linaro.org Signed-off-by: Georgi Djakov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 866b485262173a2b873386162b2ddcfbcb542b4a) Signed-off-by: Yongqiang Liu Reviewed-by: tong tiangen Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- Documentation/vm/page_owner.rst | 2 +- mm/page_owner.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 4e67c2e9bbed..2175465c9bf2 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -47,7 +47,7 @@ size change due to this facility. text data bss dec hex filename 48800 2445 644 51889 cab1 mm/page_alloc.o - 6574 108 29 6711 1a37 mm/page_owner.o + 6662 108 29 6799 1a8f mm/page_owner.o 1025 8 8 1041 411 mm/page_ext.o Although, roughly, 8 KB code is added in total, page_alloc.o increase by diff --git a/mm/page_owner.c b/mm/page_owner.c index af464bb7fbe7..5b93fc85dc73 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -27,6 +27,7 @@ struct page_owner { depot_stack_handle_t handle; depot_stack_handle_t free_handle; u64 ts_nsec; + u64 free_ts_nsec; pid_t pid; }; @@ -148,6 +149,7 @@ void __reset_page_owner(struct page *page, unsigned int order) struct page_ext *page_ext; depot_stack_handle_t handle = 0; struct page_owner *page_owner; + u64 free_ts_nsec = local_clock(); handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); @@ -158,6 +160,7 @@ void __reset_page_owner(struct page *page, unsigned int order) __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; + page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } } @@ -243,6 +246,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->handle = old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; + new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -356,10 +360,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, - page_owner->ts_nsec); + page_owner->ts_nsec, page_owner->free_ts_nsec); if (ret >= count) goto err; @@ -435,9 +439,9 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec); + page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { -- Gitee From 908db6a575229e548cf86607d2a019bd408f71f6 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 28 Sep 2021 11:51:51 +0800 Subject: [PATCH 022/134] mm: page_poison: print page info when corruption is caught mainline inclusion from mainline-v5.13-rc1 commit f58bd538e6a2deb2bcdfe527d9ed45643348a4e6 
category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ------------------------------------------------- When page_poison detects page corruption it's useful to see who freed a page recently to have a guess where write-after-free corruption happens. After this change the corruption report has extra page data. Example report from real corruption (includes only page_owner part): pagealloc: memory corruption e00000014cd61d10: 11 00 00 00 00 00 00 00 30 1d d2 ff ff 0f 00 60 ........0......` e00000014cd61d20: b0 1d d2 ff ff 0f 00 60 90 fe 1c 00 08 00 00 20 .......`....... ... CPU: 1 PID: 220402 Comm: cc1plus Not tainted 5.12.0-rc5-00107-g9720c6f59ecf #245 Hardware name: hp server rx3600, BIOS 04.03 04/08/2008 ... Call Trace: [] show_stack+0x90/0xc0 [] dump_stack+0x150/0x1c0 [] __kernel_unpoison_pages+0x410/0x440 [] get_page_from_freelist+0x1460/0x2ca0 [] __alloc_pages_nodemask+0x3c0/0x660 [] alloc_pages_vma+0xb0/0x500 [] __handle_mm_fault+0x1230/0x1fe0 [] handle_mm_fault+0x310/0x4e0 [] ia64_do_page_fault+0x1f0/0xb80 [] ia64_leave_kernel+0x0/0x270 page_owner tracks the page as freed page allocated via order 0, migratetype Movable, gfp_mask 0x100dca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), pid 37, ts 8173444098740 __reset_page_owner+0x40/0x200 free_pcp_prepare+0x4d0/0x600 free_unref_page+0x20/0x1c0 __put_page+0x110/0x1a0 migrate_pages+0x16d0/0x1dc0 compact_zone+0xfc0/0x1aa0 proactive_compact_node+0xd0/0x1e0 kcompactd+0x550/0x600 kthread+0x2c0/0x2e0 call_payload+0x50/0x80 Here we can see that page was freed by page migration but something managed to write to it afterwards. [slyfox@gentoo.org: s/dump_page_owner/dump_page/, per Vlastimil] Link: https://lkml.kernel.org/r/20210407230800.1086854-1-slyfox@gentoo.org Link: https://lkml.kernel.org/r/20210404141735.2152984-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f58bd538e6a2deb2bcdfe527d9ed45643348a4e6) Signed-off-by: Yuanzheng Song Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- mm/page_poison.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_poison.c b/mm/page_poison.c index a08423cb11ca..51f9cb913e0c 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -73,7 +74,7 @@ static bool single_bit_flip(unsigned char a, unsigned char b) return error && !(error & (error - 1)); } -static void check_poison_mem(unsigned char *mem, size_t bytes) +static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes) { static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); unsigned char *start; @@ -101,6 +102,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, end - start + 1, 1); dump_stack(); + dump_page(page, "pagealloc: corrupted page details"); } static void unpoison_page(struct page *page) @@ -114,7 +116,7 @@ static void unpoison_page(struct page *page) * that is freed to buddy. Thus no extra check is done to * see if a page was poisoned.
*/ - check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE); + check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE); kasan_enable_current(); kunmap_atomic(addr); } -- Gitee From f23a7b4f94a206cdfda24b6cd5029216f111323d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 21 Oct 2021 21:05:55 +0800 Subject: [PATCH 023/134] mm: vmscan: fix missing psi annotation for node_reclaim() mainline inclusion from mainline-5.14-rc7 commit 57f29762cdd4687a02f245d1b1e78de046388eac category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=57f29762cdd4687a02f245d1b1e78de046388eac Signed-off-by: Yu Changchun ------------------------------------------------ In a debugging session the other day, Rik noticed that node_reclaim() was missing memstall annotations. This means we'll miss pressure and lost productivity resulting from reclaim on an overloaded local NUMA node when vm.zone_reclaim_mode is enabled. There haven't been any reports, but that's likely because vm.zone_reclaim_mode hasn't been a commonly used feature recently, and the intersection between such setups and psi users is probably nil. But secondary memory such as CXL-connected DIMMS, persistent memory etc, and the page demotion patches that handle them (https://lore.kernel.org/lkml/20210401183216.443C4443@viggo.jf.intel.com/) could soon make this a more common codepath again. Link: https://lkml.kernel.org/r/20210818152457.35846-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Rik van Riel Reviewed-by: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Jing Xiangfeng Reviewed-by: Chen Wandun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6a6dd347bcb3..ebaa10d4dcad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4174,11 +4174,13 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in .may_swap = 1, .reclaim_idx = gfp_zone(gfp_mask), }; + unsigned long pflags; trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, sc.gfp_mask); cond_resched(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP @@ -4203,6 +4205,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); + psi_memstall_leave(&pflags); trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); -- Gitee From 8f26c74c12a9d5958b76b5cad735893a8973c3af Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 21 Oct 2021 21:05:56 +0800 Subject: [PATCH 024/134] mm: slub: fix slub_debug disabling for list of slabs mainline inclusion from mainline-5.14-rc6 commit a7f1d48585b34730765dcda09ead6edc4ac16a5c category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a7f1d48585b34730765dcda09ead6edc4ac16a5c Signed-off-by: Yu Changchun ------------------------------------------------ Vijayanand Jitta reports: Consider the scenario where CONFIG_SLUB_DEBUG_ON is set and we would want to disable slub_debug for a few slabs. Using the boot parameter with the slub_debug=-,slab_name syntax doesn't work as expected, i.e. only disabling debugging for the specified list of slabs. Instead it disables debugging for all slabs, which is wrong.
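For example, with CONFIG_SLUB_DEBUG_ON=y, a command line such as (the cache name is purely illustrative):

    slub_debug=-,dentry

is expected to switch debugging off for the dentry cache only and leave every other cache at DEBUG_DEFAULT_FLAGS; before this fix, however, the global slub_debug flags were reset to 0 as well, disabling debugging everywhere.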
This patch fixes it by delaying the moment when the global slub_debug flags variable is updated. In case a "slub_debug=-,slab_name" has been passed, the global flags remain as initialized (depending on CONFIG_SLUB_DEBUG_ON enabled or disabled) and are not simply reset to 0. Link: https://lkml.kernel.org/r/8a3d992a-473a-467b-28a0-4ad2ff60ab82@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Vijayanand Jitta Reviewed-by: Vijayanand Jitta Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/slub.c Signed-off-by: Jing Xiangfeng Reviewed-by: Chen Wandun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- mm/slub.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1384dc906833..e6c532123de1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1346,12 +1346,13 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) static int __init setup_slub_debug(char *str) { slab_flags_t flags; + slab_flags_t global_flags; char *saved_str; char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; - slub_debug = DEBUG_DEFAULT_FLAGS; + global_flags = DEBUG_DEFAULT_FLAGS; if (*str++ != '=' || !*str) /* * No options specified. Switch on full debugging. */ goto out; str = parse_slub_debug_flags(str, &flags, &slab_list, true); if (!slab_list) { - slub_debug = flags; + global_flags = flags; global_slub_debug_changed = true; } else { slab_list_specified = true; @@ -1372,16 +1373,18 @@ static int __init setup_slub_debug(char *str) /* * For backwards compatibility, a single list of flags with list of - * slabs means debugging is only enabled for those slabs, so the global - * slub_debug should be 0. We can extended that to multiple lists as + * slabs means debugging is only changed for those slabs, so the global + * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as * long as there is no option specifying flags without a slab list. */ if (slab_list_specified) { if (!global_slub_debug_changed) - slub_debug = 0; + global_flags = slub_debug; slub_debug_string = saved_str; } out: + slub_debug = global_flags; if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); if ((static_branch_unlikely(&init_on_alloc) || -- Gitee From cab36ab54790cecc58a9bc3d8458823360f1f396 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Thu, 21 Oct 2021 21:06:02 +0800 Subject: [PATCH 025/134] ext4: flush s_error_work before journal destroy in ext4_fill_super mainline inclusion from mainline-v5.15-rc4 commit bb9464e08309f6befe80866f5be51778ca355ee9 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bb9464e08309f6befe80866f5be51778ca355ee9 Signed-off-by: Yu Changchun --------------------------- The error path in ext4_fill_super forgets to flush s_error_work before journal destroy, and it may trigger the following bug since flush_stashed_error_work can run concurrently with journal destroy without any protection for sbi->s_journal. [32031.740193] EXT4-fs (loop66): get root inode failed [32031.740484] EXT4-fs (loop66): mount failed [32031.759805] ------------[ cut here ]------------ [32031.759807] kernel BUG at fs/jbd2/transaction.c:373!
[32031.760075] invalid opcode: 0000 [#1] SMP PTI [32031.760336] CPU: 5 PID: 1029268 Comm: kworker/5:1 Kdump: loaded 4.18.0 [32031.765112] Call Trace: [32031.765375] ? __switch_to_asm+0x35/0x70 [32031.765635] ? __switch_to_asm+0x41/0x70 [32031.765893] ? __switch_to_asm+0x35/0x70 [32031.766148] ? __switch_to_asm+0x41/0x70 [32031.766405] ? _cond_resched+0x15/0x40 [32031.766665] jbd2__journal_start+0xf1/0x1f0 [jbd2] [32031.766934] jbd2_journal_start+0x19/0x20 [jbd2] [32031.767218] flush_stashed_error_work+0x30/0x90 [ext4] [32031.767487] process_one_work+0x195/0x390 [32031.767747] worker_thread+0x30/0x390 [32031.768007] ? process_one_work+0x390/0x390 [32031.768265] kthread+0x10d/0x130 [32031.768521] ? kthread_flush_work_fn+0x10/0x10 [32031.768778] ret_from_fork+0x35/0x40 static int start_this_handle(...) BUG_ON(journal->j_flags & JBD2_UNMOUNT); <---- Trigger this Besides, after we enable fast commit, ext4_fc_replay can add work to s_error_work but return success, so the latter journal destroy in ext4_load_journal can trigger this problem too. Fix this problem with two steps: 1. Call ext4_commit_super directly in ext4_handle_error for the case that it is called from ext4_fc_replay 2. Since it's hard to pair the init and flush for s_error_work, we'd better add an extra flush_work before journal destroy in ext4_fill_super Besides, this patch will call ext4_commit_super in ext4_handle_error for any nojournal case too. But it seems safe since the reason we call schedule_work was that we should save error info to sb through journal if available. Conversely, for the nojournal case, it seems useless to delay committing the superblock to s_error_work. Fixes: c92dc856848f ("ext4: defer saving error info from atomic context") Fixes: 2d01ddc86606 ("ext4: save error info to sb through journal if available") Cc: stable@kernel.org Signed-off-by: yangerkun Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210924093917.1953239-1-yangerkun@huawei.com Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dd0846321c2f..16e228285967 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -661,7 +661,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ - if (continue_fs) + if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_error_work); else ext4_commit_super(sb); @@ -5143,12 +5143,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { + /* flush s_error_work before journal destroy.
*/ + flush_work(&sbi->s_error_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: + /* flush s_error_work before sbi destroy */ flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); -- Gitee From d6ccfdbf351b28338ae9df32649030643d2110cf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 15 Nov 2021 19:35:45 +0800 Subject: [PATCH 026/134] block: don't call rq_qos_ops->done_bio if the bio isn't tracked mainline inclusion from mainline-v5.15-rc3 commit a647a524a46736786c95cdb553a070322ca096e3 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a647a524a46736786c95cdb553a070322ca096e3 Signed-off-by: Yu Changchun --------------------------- The rq_qos framework is only applied to request based drivers, so: 1) rq_qos_done_bio() need not be called for bio based drivers 2) rq_qos_done_bio() need not be called for bios which aren't tracked, such as bios ended from error handling code. Especially in bio_endio(): 1) the request queue is referenced via bio->bi_bdev->bd_disk->queue, which may be gone since the request queue refcount may not be held in the above two cases 2) q->rq_qos may be freed in blk_cleanup_queue() when calling into __rq_qos_done_bio() Fix the potential kernel panic by not calling rq_qos_ops->done_bio if the bio isn't tracked. This way is safe because both ioc_rqos_done_bio() and blkcg_iolatency_done_bio() are nop if the bio isn't tracked. Reported-by: Yu Kuai Cc: tj@kernel.org Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210924110704.1541818-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Conflict: block/bio.c Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 0703a208ca24..b2e0f0c94c5f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1433,7 +1433,7 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) + if (bio->bi_disk && bio_flagged(bio, BIO_TRACKED)) rq_qos_done_bio(bio->bi_disk->queue, bio); /* -- Gitee From 2dc2ee15e2b4a77c8717a24ebbbe284f9e24a44f Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Mon, 15 Nov 2021 19:35:47 +0800 Subject: [PATCH 027/134] block: fix UAF from race of ioc_release_fn() and __ioc_clear_queue() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun -------------------------- KASAN reports a use-after-free report when doing block test: [293762.535116] ================================================================== [293762.535129] BUG: KASAN: use-after-free in queued_spin_lock_slowpath+0x78/0x4c8 [293762.535133] Write of size 2 at addr ffff8000d5f12bc8 by task sh/9148 [293762.535135] [293762.535145] CPU: 1 PID: 9148 Comm: sh Kdump: loaded Tainted: G W 4.19.90-vhulk2108.6.0.h824.kasan.eulerosv2r10.aarch64 #1 [293762.535148] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [293762.535150] Call trace: [293762.535154] dump_backtrace+0x0/0x310 [293762.535158] show_stack+0x28/0x38 [293762.535165] dump_stack+0xec/0x15c [293762.535172] print_address_description+0x68/0x2d0 [293762.535177] kasan_report+0x130/0x2f0 [293762.535182] __asan_store2+0x80/0xa8 [293762.535189] queued_spin_lock_slowpath+0x78/0x4c8 [293762.535194] __ioc_clear_queue+0x158/0x160
[293762.535198] ioc_clear_queue+0x194/0x258 [293762.535202] elevator_switch_mq+0x64/0x170 [293762.535206] elevator_switch+0x140/0x270 [293762.535211] elv_iosched_store+0x1a4/0x2a0 [293762.535215] queue_attr_store+0x90/0xe0 [293762.535219] sysfs_kf_write+0xa8/0xe8 [293762.535222] kernfs_fop_write+0x1f8/0x378 [293762.535227] __vfs_write+0xe0/0x360 [293762.535233] vfs_write+0xf0/0x270 [293762.535237] ksys_write+0xdc/0x1b8 [293762.535241] __arm64_sys_write+0x50/0x60 [293762.535245] el0_svc_common+0xc8/0x320 [293762.535250] el0_svc_handler+0xf8/0x160 [293762.535253] el0_svc+0x10/0x218 [293762.535254] [293762.535258] Allocated by task 3466763: [293762.535264] kasan_kmalloc+0xe0/0x190 [293762.535269] kasan_slab_alloc+0x14/0x20 [293762.535276] kmem_cache_alloc_node+0x1b4/0x420 [293762.535280] create_task_io_context+0x40/0x210 [293762.535284] generic_make_request_checks+0xc78/0xe38 [293762.535288] generic_make_request+0xf8/0x640 [293762.535394] generic_file_direct_write+0x100/0x268 [293762.535401] __generic_file_write_iter+0x128/0x370 [293762.535467] vfs_iter_write+0x64/0x90 [293762.535489] ovl_write_iter+0x2f8/0x458 [overlay] [293762.535493] __vfs_write+0x264/0x360 [293762.535497] vfs_write+0xf0/0x270 [293762.535501] ksys_write+0xdc/0x1b8 [293762.535505] __arm64_sys_write+0x50/0x60 [293762.535509] el0_svc_common+0xc8/0x320 [293762.535387] ext4_direct_IO+0x3c8/0xe80 [ext4] [293762.535394] generic_file_direct_write+0x100/0x268 [293762.535401] __generic_file_write_iter+0x128/0x370 [293762.535452] ext4_file_write_iter+0x610/0xa80 [ext4] [293762.535457] do_iter_readv_writev+0x28c/0x390 [293762.535463] do_iter_write+0xfc/0x360 [293762.535467] vfs_iter_write+0x64/0x90 [293762.535489] ovl_write_iter+0x2f8/0x458 [overlay] [293762.535493] __vfs_write+0x264/0x360 [293762.535497] vfs_write+0xf0/0x270 [293762.535501] ksys_write+0xdc/0x1b8 [293762.535505] __arm64_sys_write+0x50/0x60 [293762.535509] el0_svc_common+0xc8/0x320 [293762.535513] el0_svc_handler+0xf8/0x160 [293762.535517] el0_svc+0x10/0x218 [293762.535521] [293762.535523] Freed by task 3466763: [293762.535528] __kasan_slab_free+0x120/0x228 [293762.535532] kasan_slab_free+0x10/0x18 [293762.535536] kmem_cache_free+0x68/0x248 [293762.535540] put_io_context+0x104/0x190 [293762.535545] put_io_context_active+0x204/0x2c8 [293762.535549] exit_io_context+0x74/0xa0 [293762.535553] do_exit+0x658/0xae0 [293762.535557] do_group_exit+0x74/0x1a8 [293762.535561] get_signal+0x21c/0xf38 [293762.535564] do_signal+0x10c/0x450 [293762.535568] do_notify_resume+0x224/0x4b0 [293762.535573] work_pending+0x8/0x10 [293762.535574] [293762.535578] The buggy address belongs to the object at ffff8000d5f12bb8 which belongs to the cache blkdev_ioc of size 136 [293762.535582] The buggy address is located 16 bytes inside of 136-byte region [ffff8000d5f12bb8, ffff8000d5f12c40) [293762.535583] The buggy address belongs to the page: [293762.535588] page:ffff7e000357c480 count:1 mapcount:0 mapping:ffff8000d8563c00 index:0x0 [293762.536201] flags: 0x7ffff0000000100(slab) [293762.536540] raw: 07ffff0000000100 ffff7e0003118588 ffff8000d8adb530 ffff8000d8563c00 [293762.536546] raw: 0000000000000000 0000000000140014 00000001ffffffff 0000000000000000 [293762.536551] page dumped because: kasan: bad access detected [293762.536552] [293762.536554] Memory state around the buggy address: [293762.536558] ffff8000d5f12a80: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fb fb [293762.536562] ffff8000d5f12b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc [293762.536566] >ffff8000d5f12b80: fc fc fc fc fc fc 
fc fb fb fb fb fb fb fb fb fb [293762.536568] ^ [293762.536572] ffff8000d5f12c00: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [293762.536576] ffff8000d5f12c80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [293762.536577] ================================================================== ioc_release_fn() will destroy icq from ioc->icq_list and __ioc_clear_queue() will destroy icq from request_queue->icq_list. However, ioc_release_fn() will hold ioc_lock first, and free the ioc at the end. Then __ioc_clear_queue() will get the ioc from the icq and hold ioc_lock, but the ioc has been released, which will result in a use-after-free. CPU0 CPU1 put_io_context elevator_switch_mq queue_work &ioc->release_work ioc_clear_queue ^^^ splice q->icq_list __ioc_clear_queue ^^^get icq from icq_list get ioc from icq->ioc ioc_release_fn spin_lock(ioc->lock) ioc_destroy_icq(icq) spin_unlock(ioc->lock) free(ioc) spin_lock(ioc->lock) <= UAF Fix by grabbing the request_queue->queue_lock in ioc_clear_queue() to avoid this race. Signed-off-by: Laibin Qiu Link: https://lore.kernel.org/lkml/1c9ad9f2-c487-c793-1ffc-5c3ec0fcc0ae@kernel.dk/ Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-ioc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 57299f860d41..1d6ba8ff5a66 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -242,9 +242,9 @@ void ioc_clear_queue(struct request_queue *q) spin_lock_irq(&q->queue_lock); list_splice_init(&q->icq_list, &icq_list); - spin_unlock_irq(&q->queue_lock); __ioc_clear_queue(&icq_list); + spin_unlock_irq(&q->queue_lock); } int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) -- Gitee From c516b426c31a1d50dddf4b3a6e1c924734897672 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 15 Nov 2021 19:35:48 +0800 Subject: [PATCH 028/134] nbd: add the check to prevent overflow in __nbd_ioctl() mainline inclusion from mainline-5.14 commit fad7cd3310db3099f95dd34312c77740fbc455e5 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fad7cd3310db3099f95dd34312c77740fbc455e5 Signed-off-by: Yu Changchun ------------------------------------------------- If the user specifies a large enough value for the NBD blocks option, it may trigger a signed integer overflow, which may lead to nbd->config->bytesize becoming a large or small value, zero in particular. UBSAN: Undefined behaviour in drivers/block/nbd.c:325:31 signed integer overflow: 1024 * 4611686155866341414 cannot be represented in type 'long long int' [...] Call trace: [...] handle_overflow+0x188/0x1dc lib/ubsan.c:192 __ubsan_handle_mul_overflow+0x34/0x44 lib/ubsan.c:213 nbd_size_set drivers/block/nbd.c:325 [inline] __nbd_ioctl drivers/block/nbd.c:1342 [inline] nbd_ioctl+0x998/0xa10 drivers/block/nbd.c:1395 __blkdev_driver_ioctl block/ioctl.c:311 [inline] [...] Although it is not a big deal, still silence the UBSAN by limiting the input value.
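For reference, check_mul_overflow() from linux/overflow.h computes the product into its third argument and returns true when the multiplication overflows, so the new check (sketched here with the values from the report above) rejects the request before nbd_size_set() runs:

    loff_t bytesize;

    /* e.g. arg = 4611686155866341414 blocks, blksize = 1024: wraps loff_t */
    if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize))
            return -EINVAL;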
Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210804021212.990223-1-libaokun1@huawei.com [axboe: dropped unlikely()] Signed-off-by: Jens Axboe Conflicts: drivers/block/nbd.c Signed-off-by: Baokun Li Reviewed-by: Hou Tao Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f7f1d9dbdc80..f4b0637bb155 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1385,6 +1385,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { struct nbd_config *config = nbd->config; + loff_t bytesize; switch (cmd) { case NBD_DISCONNECT: @@ -1407,6 +1408,8 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, div_s64(arg, config->blksize)); return 0; case NBD_SET_SIZE_BLOCKS: + if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize)) + return -EINVAL; nbd_size_set(nbd, config->blksize, arg); return 0; case NBD_SET_TIMEOUT: -- Gitee From db88ad78a216ba7092b1ae3584fb28dbbb16b1da Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Nov 2021 19:35:49 +0800 Subject: [PATCH 029/134] sched/topology: Warn when NUMA diameter > 2 mainline inclusion from mainline-v5.11-rc1 commit b5b217346de85ed1b03fdecd5c5076b34fbb2f0b category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b5b217346de85ed1b03fdecd5c5076b34fbb2f0b Signed-off-by: Yu Changchun ---------------------------------------------------------- NUMA topologies where the shortest path between some two nodes requires three or more hops (i.e. diameter > 2) end up being misrepresented in the scheduler topology structures. This is currently detected when booting a kernel with CONFIG_SCHED_DEBUG=y + sched_debug on the cmdline, although this will only yield a warning about sched_group spans not matching sched_domain spans: ERROR: groups don't span domain->span Add an explicit warning for that case, triggered regardless of CONFIG_SCHED_DEBUG, and decorate it with an appropriate comment. 
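For reference, the "diameter" is the maximum hop count between any two nodes, and it can be computed directly from a distance table. A standalone sketch (illustration only, not part of the patch) using the smallest diameter=3 table quoted in the comment below, with the REMOTE_DISTANCE entries (20) taken as direct links:

    #include <stdio.h>

    #define N 4

    /* Distance table of the smallest diameter=3 topology (a 0-1-2-3 line). */
    static const int dist[N][N] = {
        { 10, 20, 30, 40 },
        { 20, 10, 20, 30 },
        { 30, 20, 10, 20 },
        { 40, 30, 20, 10 },
    };

    int main(void)
    {
        int hops[N][N], i, j, k, diameter = 0;

        /* direct neighbours are one hop apart; everything else starts "far" */
        for (i = 0; i < N; i++)
            for (j = 0; j < N; j++)
                hops[i][j] = dist[i][j] == 20 ? 1 : (i == j ? 0 : N);

        /* Floyd-Warshall over hop counts */
        for (k = 0; k < N; k++)
            for (i = 0; i < N; i++)
                for (j = 0; j < N; j++)
                    if (hops[i][k] + hops[k][j] < hops[i][j])
                        hops[i][j] = hops[i][k] + hops[k][j];

        for (i = 0; i < N; i++)
            for (j = 0; j < N; j++)
                if (hops[i][j] > diameter)
                    diameter = hops[i][j];

        printf("diameter = %d\n", diameter);  /* prints 3, so the new WARN fires */
        return 0;
    }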
The topology described in the comment can be booted up on QEMU by appending the following to your usual QEMU incantation: -smp cores=4 \ -numa node,cpus=0,nodeid=0 -numa node,cpus=1,nodeid=1, \ -numa node,cpus=2,nodeid=2, -numa node,cpus=3,nodeid=3, \ -numa dist,src=0,dst=1,val=20, -numa dist,src=0,dst=2,val=30, \ -numa dist,src=0,dst=3,val=40, -numa dist,src=1,dst=2,val=20, \ -numa dist,src=1,dst=3,val=30, -numa dist,src=2,dst=3,val=20 A somewhat more realistic topology (6-node mesh) with the same affliction can be conjured with: -smp cores=6 \ -numa node,cpus=0,nodeid=0 -numa node,cpus=1,nodeid=1, \ -numa node,cpus=2,nodeid=2, -numa node,cpus=3,nodeid=3, \ -numa node,cpus=4,nodeid=4, -numa node,cpus=5,nodeid=5, \ -numa dist,src=0,dst=1,val=20, -numa dist,src=0,dst=2,val=30, \ -numa dist,src=0,dst=3,val=40, -numa dist,src=0,dst=4,val=30, \ -numa dist,src=0,dst=5,val=20, \ -numa dist,src=1,dst=2,val=20, -numa dist,src=1,dst=3,val=30, \ -numa dist,src=1,dst=4,val=20, -numa dist,src=1,dst=5,val=30, \ -numa dist,src=2,dst=3,val=20, -numa dist,src=2,dst=4,val=30, \ -numa dist,src=2,dst=5,val=40, \ -numa dist,src=3,dst=4,val=20, -numa dist,src=3,dst=5,val=30, \ -numa dist,src=4,dst=5,val=20 Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/lkml/jhjtux5edo2.mognet@arm.com Signed-off-by: Jialin Zhang Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/sched/topology.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dd7770226086..796f6c62e59a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -674,6 +674,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + int numa_distance = 0; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -705,6 +706,38 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + for (tmp = sd; tmp; tmp = tmp->parent) + numa_distance += !!(tmp->flags & SD_NUMA); + + /* + * FIXME: Diameter >=3 is misrepresented. + * + * Smallest diameter=3 topology is: + * + * node 0 1 2 3 + * 0: 10 20 30 40 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 40 30 20 10 + * + * 0 --- 1 --- 2 --- 3 + * + * NUMA-3 0-3 N/A N/A 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-2 0-2 0-3 0-3 1-3 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} + * + * NUMA-1 0-1 0-2 1-3 2-3 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} + * + * NUMA-0 0 1 2 3 + * + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the + * group span isn't a subset of the domain span. 
+ */ + WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n"); + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); -- Gitee From 55aaafa474e541bd879834cd2128d73a99d5f1ba Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 15 Nov 2021 19:35:50 +0800 Subject: [PATCH 030/134] sched/topology: fix the issue groups don't span domain->span for NUMA diameter > 2 mainline inclusion from mainline-v5.13-rc1 commit 585b6d2723dc927ebc4ad884c4e879e4da8bc21f category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=585b6d2723dc927ebc4ad884c4e879e4da8bc21f Signed-off-by: Yu Changchun ---------------------------------------------------------- As long as NUMA diameter > 2, building sched_domain by sibling's child domain will definitely create a sched_domain with sched_group which will span out of the sched_domain: +------+ +------+ +-------+ +------+ | node | 12 |node | 20 | node | 12 |node | | 0 +---------+1 +--------+ 2 +-------+3 | +------+ +------+ +-------+ +------+ domain0 node0 node1 node2 node3 domain1 node0+1 node0+1 node2+3 node2+3 + domain2 node0+1+2 | group: node0+1 | group:node2+3 <-------------------+ when node2 is added into the domain2 of node0, the kernel uses the child domain of node2's domain2, which is domain1(node2+3). Node 3 is outside the span of the domain including node0+1+2. This will make load_balance() run based on screwed avg_load and group_type in the sched_group spanning out of the sched_domain, and it also makes select_task_rq_fair() pick an idle CPU outside the sched_domain. Real servers which suffer from this problem include Kunpeng920 and 8-node Sun Fire X4600-M2, at least. Here we move to use the *child* domain of the *child* domain of node2's domain2 as the newly added sched_group. At the same time, we re-use the lower level sgc directly. +------+ +------+ +-------+ +------+ | node | 12 |node | 20 | node | 12 |node | | 0 +---------+1 +--------+ 2 +-------+3 | +------+ +------+ +-------+ +------+ domain0 node0 node1 +- node2 node3 | domain1 node0+1 node0+1 | node2+3 node2+3 | domain2 node0+1+2 | group: node0+1 | group:node2 <-------------------+ While the lower level sgc is re-used, this patch only changes the remote sched_groups for those sched_domains playing grandchild trick, therefore, sgc->next_update is still safe since it's only touched by CPUs that have the group span as local group. And sgc->imbalance is also safe because sd_parent remains the same in load_balance and LB only tries other CPUs from the local group. Moreover, since local groups are not touched, they are still getting roughly equal size in a TL. And should_we_balance() only matters with local groups, so the pull probability of those groups is still roughly equal.
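The descent to the proper descendant can be pictured with plain bitmasks. A rough standalone sketch of the idea (simplified: it omits the kernel's extra walk past degenerate equal-span domains), using node2's hierarchy and node0's domain2 from the diagram above:

    #include <stdio.h>

    struct domain {
        unsigned int span;      /* one bit per node */
        struct domain *child;
    };

    /* Walk down until the child's span no longer leaks out of sd's span. */
    static struct domain *descend(struct domain *sd, struct domain *sibling)
    {
        while (sibling->child && (sibling->child->span & ~sd->span))
            sibling = sibling->child;
        return sibling;
    }

    int main(void)
    {
        /* node2's hierarchy: domain0 {node2}, domain1 {node2+3} */
        struct domain n2_d0 = { 0x4, NULL };
        struct domain n2_d1 = { 0xc, &n2_d0 };
        struct domain n2_d2 = { 0xe, &n2_d1 };  /* node2's domain2; its own span is not examined */
        /* node0's domain2 being built spans node0+1+2 */
        struct domain sd = { 0x7, NULL };

        /* the group is then built from the returned domain's child: {node2} */
        printf("group span = %#x\n", descend(&sd, &n2_d2)->child->span);
        return 0;
    }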
Tested by the below topology: qemu-system-aarch64 -M virt -nographic \ -smp cpus=8 \ -numa node,cpus=0-1,nodeid=0 \ -numa node,cpus=2-3,nodeid=1 \ -numa node,cpus=4-5,nodeid=2 \ -numa node,cpus=6-7,nodeid=3 \ -numa dist,src=0,dst=1,val=12 \ -numa dist,src=0,dst=2,val=20 \ -numa dist,src=0,dst=3,val=22 \ -numa dist,src=1,dst=2,val=22 \ -numa dist,src=2,dst=3,val=12 \ -numa dist,src=1,dst=3,val=24 \ -m 4G -cpu cortex-a57 -kernel arch/arm64/boot/Image w/o patch, we get lots of "groups don't span domain->span": [ 0.802139] CPU0 attaching sched-domain(s): [ 0.802193] domain-0: span=0-1 level=MC [ 0.802443] groups: 0:{ span=0 cap=1013 }, 1:{ span=1 cap=979 } [ 0.802693] domain-1: span=0-3 level=NUMA [ 0.802731] groups: 0:{ span=0-1 cap=1992 }, 2:{ span=2-3 cap=1943 } [ 0.802811] domain-2: span=0-5 level=NUMA [ 0.802829] groups: 0:{ span=0-3 cap=3935 }, 4:{ span=4-7 cap=3937 } [ 0.802881] ERROR: groups don't span domain->span [ 0.803058] domain-3: span=0-7 level=NUMA [ 0.803080] groups: 0:{ span=0-5 mask=0-1 cap=5843 }, 6:{ span=4-7 mask=6-7 cap=4077 } [ 0.804055] CPU1 attaching sched-domain(s): [ 0.804072] domain-0: span=0-1 level=MC [ 0.804096] groups: 1:{ span=1 cap=979 }, 0:{ span=0 cap=1013 } [ 0.804152] domain-1: span=0-3 level=NUMA [ 0.804170] groups: 0:{ span=0-1 cap=1992 }, 2:{ span=2-3 cap=1943 } [ 0.804219] domain-2: span=0-5 level=NUMA [ 0.804236] groups: 0:{ span=0-3 cap=3935 }, 4:{ span=4-7 cap=3937 } [ 0.804302] ERROR: groups don't span domain->span [ 0.804520] domain-3: span=0-7 level=NUMA [ 0.804546] groups: 0:{ span=0-5 mask=0-1 cap=5843 }, 6:{ span=4-7 mask=6-7 cap=4077 } [ 0.804677] CPU2 attaching sched-domain(s): [ 0.804687] domain-0: span=2-3 level=MC [ 0.804705] groups: 2:{ span=2 cap=934 }, 3:{ span=3 cap=1009 } [ 0.804754] domain-1: span=0-3 level=NUMA [ 0.804772] groups: 2:{ span=2-3 cap=1943 }, 0:{ span=0-1 cap=1992 } [ 0.804820] domain-2: span=0-5 level=NUMA [ 0.804836] groups: 2:{ span=0-3 mask=2-3 cap=3991 }, 4:{ span=0-1,4-7 mask=4-5 cap=5985 } [ 0.804944] ERROR: groups don't span domain->span [ 0.805108] domain-3: span=0-7 level=NUMA [ 0.805134] groups: 2:{ span=0-5 mask=2-3 cap=5899 }, 6:{ span=0-1,4-7 mask=6-7 cap=6125 } [ 0.805223] CPU3 attaching sched-domain(s): [ 0.805232] domain-0: span=2-3 level=MC [ 0.805249] groups: 3:{ span=3 cap=1009 }, 2:{ span=2 cap=934 } [ 0.805319] domain-1: span=0-3 level=NUMA [ 0.805336] groups: 2:{ span=2-3 cap=1943 }, 0:{ span=0-1 cap=1992 } [ 0.805383] domain-2: span=0-5 level=NUMA [ 0.805399] groups: 2:{ span=0-3 mask=2-3 cap=3991 }, 4:{ span=0-1,4-7 mask=4-5 cap=5985 } [ 0.805458] ERROR: groups don't span domain->span [ 0.805605] domain-3: span=0-7 level=NUMA [ 0.805626] groups: 2:{ span=0-5 mask=2-3 cap=5899 }, 6:{ span=0-1,4-7 mask=6-7 cap=6125 } [ 0.805712] CPU4 attaching sched-domain(s): [ 0.805721] domain-0: span=4-5 level=MC [ 0.805738] groups: 4:{ span=4 cap=984 }, 5:{ span=5 cap=924 } [ 0.805787] domain-1: span=4-7 level=NUMA [ 0.805803] groups: 4:{ span=4-5 cap=1908 }, 6:{ span=6-7 cap=2029 } [ 0.805851] domain-2: span=0-1,4-7 level=NUMA [ 0.805867] groups: 4:{ span=4-7 cap=3937 }, 0:{ span=0-3 cap=3935 } [ 0.805915] ERROR: groups don't span domain->span [ 0.806108] domain-3: span=0-7 level=NUMA [ 0.806130] groups: 4:{ span=0-1,4-7 mask=4-5 cap=5985 }, 2:{ span=0-3 mask=2-3 cap=3991 } [ 0.806214] CPU5 attaching sched-domain(s): [ 0.806222] domain-0: span=4-5 level=MC [ 0.806240] groups: 5:{ span=5 cap=924 }, 4:{ span=4 cap=984 } [ 0.806841] domain-1: span=4-7 level=NUMA [ 0.806866] groups: 4:{ span=4-5 cap=1908 }, 6:{ 
span=6-7 cap=2029 } [ 0.806934] domain-2: span=0-1,4-7 level=NUMA [ 0.806953] groups: 4:{ span=4-7 cap=3937 }, 0:{ span=0-3 cap=3935 } [ 0.807004] ERROR: groups don't span domain->span [ 0.807312] domain-3: span=0-7 level=NUMA [ 0.807386] groups: 4:{ span=0-1,4-7 mask=4-5 cap=5985 }, 2:{ span=0-3 mask=2-3 cap=3991 } [ 0.807686] CPU6 attaching sched-domain(s): [ 0.807710] domain-0: span=6-7 level=MC [ 0.807750] groups: 6:{ span=6 cap=1017 }, 7:{ span=7 cap=1012 } [ 0.807840] domain-1: span=4-7 level=NUMA [ 0.807870] groups: 6:{ span=6-7 cap=2029 }, 4:{ span=4-5 cap=1908 } [ 0.807952] domain-2: span=0-1,4-7 level=NUMA [ 0.807985] groups: 6:{ span=4-7 mask=6-7 cap=4077 }, 0:{ span=0-5 mask=0-1 cap=5843 } [ 0.808045] ERROR: groups don't span domain->span [ 0.808257] domain-3: span=0-7 level=NUMA [ 0.808571] groups: 6:{ span=0-1,4-7 mask=6-7 cap=6125 }, 2:{ span=0-5 mask=2-3 cap=5899 } [ 0.808848] CPU7 attaching sched-domain(s): [ 0.808860] domain-0: span=6-7 level=MC [ 0.808880] groups: 7:{ span=7 cap=1012 }, 6:{ span=6 cap=1017 } [ 0.808953] domain-1: span=4-7 level=NUMA [ 0.808974] groups: 6:{ span=6-7 cap=2029 }, 4:{ span=4-5 cap=1908 } [ 0.809034] domain-2: span=0-1,4-7 level=NUMA [ 0.809055] groups: 6:{ span=4-7 mask=6-7 cap=4077 }, 0:{ span=0-5 mask=0-1 cap=5843 } [ 0.809128] ERROR: groups don't span domain->span [ 0.810361] domain-3: span=0-7 level=NUMA [ 0.810400] groups: 6:{ span=0-1,4-7 mask=6-7 cap=5961 }, 2:{ span=0-5 mask=2-3 cap=5903 } w/ patch, we don't get "groups don't span domain->span" any more: [ 1.486271] CPU0 attaching sched-domain(s): [ 1.486820] domain-0: span=0-1 level=MC [ 1.500924] groups: 0:{ span=0 cap=980 }, 1:{ span=1 cap=994 } [ 1.515717] domain-1: span=0-3 level=NUMA [ 1.515903] groups: 0:{ span=0-1 cap=1974 }, 2:{ span=2-3 cap=1989 } [ 1.516989] domain-2: span=0-5 level=NUMA [ 1.517124] groups: 0:{ span=0-3 cap=3963 }, 4:{ span=4-5 cap=1949 } [ 1.517369] domain-3: span=0-7 level=NUMA [ 1.517423] groups: 0:{ span=0-5 mask=0-1 cap=5912 }, 6:{ span=4-7 mask=6-7 cap=4054 } [ 1.520027] CPU1 attaching sched-domain(s): [ 1.520097] domain-0: span=0-1 level=MC [ 1.520184] groups: 1:{ span=1 cap=994 }, 0:{ span=0 cap=980 } [ 1.520429] domain-1: span=0-3 level=NUMA [ 1.520487] groups: 0:{ span=0-1 cap=1974 }, 2:{ span=2-3 cap=1989 } [ 1.520687] domain-2: span=0-5 level=NUMA [ 1.520744] groups: 0:{ span=0-3 cap=3963 }, 4:{ span=4-5 cap=1949 } [ 1.520948] domain-3: span=0-7 level=NUMA [ 1.521038] groups: 0:{ span=0-5 mask=0-1 cap=5912 }, 6:{ span=4-7 mask=6-7 cap=4054 } [ 1.522068] CPU2 attaching sched-domain(s): [ 1.522348] domain-0: span=2-3 level=MC [ 1.522606] groups: 2:{ span=2 cap=1003 }, 3:{ span=3 cap=986 } [ 1.522832] domain-1: span=0-3 level=NUMA [ 1.522885] groups: 2:{ span=2-3 cap=1989 }, 0:{ span=0-1 cap=1974 } [ 1.523043] domain-2: span=0-5 level=NUMA [ 1.523092] groups: 2:{ span=0-3 mask=2-3 cap=4037 }, 4:{ span=4-5 cap=1949 } [ 1.523302] domain-3: span=0-7 level=NUMA [ 1.523352] groups: 2:{ span=0-5 mask=2-3 cap=5986 }, 6:{ span=0-1,4-7 mask=6-7 cap=6102 } [ 1.523748] CPU3 attaching sched-domain(s): [ 1.523774] domain-0: span=2-3 level=MC [ 1.523825] groups: 3:{ span=3 cap=986 }, 2:{ span=2 cap=1003 } [ 1.524009] domain-1: span=0-3 level=NUMA [ 1.524086] groups: 2:{ span=2-3 cap=1989 }, 0:{ span=0-1 cap=1974 } [ 1.524281] domain-2: span=0-5 level=NUMA [ 1.524331] groups: 2:{ span=0-3 mask=2-3 cap=4037 }, 4:{ span=4-5 cap=1949 } [ 1.524534] domain-3: span=0-7 level=NUMA [ 1.524586] groups: 2:{ span=0-5 mask=2-3 cap=5986 }, 6:{ span=0-1,4-7 mask=6-7 cap=6102 
} [ 1.524847] CPU4 attaching sched-domain(s): [ 1.524873] domain-0: span=4-5 level=MC [ 1.524954] groups: 4:{ span=4 cap=958 }, 5:{ span=5 cap=991 } [ 1.525105] domain-1: span=4-7 level=NUMA [ 1.525153] groups: 4:{ span=4-5 cap=1949 }, 6:{ span=6-7 cap=2006 } [ 1.525368] domain-2: span=0-1,4-7 level=NUMA [ 1.525428] groups: 4:{ span=4-7 cap=3955 }, 0:{ span=0-1 cap=1974 } [ 1.532726] domain-3: span=0-7 level=NUMA [ 1.532811] groups: 4:{ span=0-1,4-7 mask=4-5 cap=6003 }, 2:{ span=0-3 mask=2-3 cap=4037 } [ 1.534125] CPU5 attaching sched-domain(s): [ 1.534159] domain-0: span=4-5 level=MC [ 1.534303] groups: 5:{ span=5 cap=991 }, 4:{ span=4 cap=958 } [ 1.534490] domain-1: span=4-7 level=NUMA [ 1.534572] groups: 4:{ span=4-5 cap=1949 }, 6:{ span=6-7 cap=2006 } [ 1.534734] domain-2: span=0-1,4-7 level=NUMA [ 1.534783] groups: 4:{ span=4-7 cap=3955 }, 0:{ span=0-1 cap=1974 } [ 1.536057] domain-3: span=0-7 level=NUMA [ 1.536430] groups: 4:{ span=0-1,4-7 mask=4-5 cap=6003 }, 2:{ span=0-3 mask=2-3 cap=3896 } [ 1.536815] CPU6 attaching sched-domain(s): [ 1.536846] domain-0: span=6-7 level=MC [ 1.536934] groups: 6:{ span=6 cap=1005 }, 7:{ span=7 cap=1001 } [ 1.537144] domain-1: span=4-7 level=NUMA [ 1.537262] groups: 6:{ span=6-7 cap=2006 }, 4:{ span=4-5 cap=1949 } [ 1.537553] domain-2: span=0-1,4-7 level=NUMA [ 1.537613] groups: 6:{ span=4-7 mask=6-7 cap=4054 }, 0:{ span=0-1 cap=1805 } [ 1.537872] domain-3: span=0-7 level=NUMA [ 1.537998] groups: 6:{ span=0-1,4-7 mask=6-7 cap=6102 }, 2:{ span=0-5 mask=2-3 cap=5845 } [ 1.538448] CPU7 attaching sched-domain(s): [ 1.538505] domain-0: span=6-7 level=MC [ 1.538586] groups: 7:{ span=7 cap=1001 }, 6:{ span=6 cap=1005 } [ 1.538746] domain-1: span=4-7 level=NUMA [ 1.538798] groups: 6:{ span=6-7 cap=2006 }, 4:{ span=4-5 cap=1949 } [ 1.539048] domain-2: span=0-1,4-7 level=NUMA [ 1.539111] groups: 6:{ span=4-7 mask=6-7 cap=4054 }, 0:{ span=0-1 cap=1805 } [ 1.539571] domain-3: span=0-7 level=NUMA [ 1.539610] groups: 6:{ span=0-1,4-7 mask=6-7 cap=6102 }, 2:{ span=0-5 mask=2-3 cap=5845 } Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Tested-by: Meelis Roos Link: https://lkml.kernel.org/r/20210224030944.15232-1-song.bao.hua@hisilicon.com Signed-off-by: Jialin Zhang Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/sched/topology.c | 91 +++++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 30 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 796f6c62e59a..afe95ede9861 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -709,35 +709,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) for (tmp = sd; tmp; tmp = tmp->parent) numa_distance += !!(tmp->flags & SD_NUMA); - /* - * FIXME: Diameter >=3 is misrepresented. - * - * Smallest diameter=3 topology is: - * - * node 0 1 2 3 - * 0: 10 20 30 40 - * 1: 20 10 20 30 - * 2: 30 20 10 20 - * 3: 40 30 20 10 - * - * 0 --- 1 --- 2 --- 3 - * - * NUMA-3 0-3 N/A N/A 0-3 - * groups: {0-2},{1-3} {1-3},{0-2} - * - * NUMA-2 0-2 0-3 0-3 1-3 - * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} - * - * NUMA-1 0-1 0-2 1-3 2-3 - * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} - * - * NUMA-0 0 1 2 3 - * - * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the - * group span isn't a subset of the domain span. 
- */ - WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n"); - sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); @@ -968,6 +939,31 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } +static struct sched_domain * +find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) +{ + /* + * The proper descendant would be the one whose child won't span out + * of sd + */ + while (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), + sched_domain_span(sd))) + sibling = sibling->child; + + /* + * As we are referencing sgc across different topology level, we need + * to go down to skip those sched_domains which don't contribute to + * scheduling because they will be degenerated in cpu_attach_domain + */ + while (sibling->child && + cpumask_equal(sched_domain_span(sibling->child), + sched_domain_span(sibling))) + sibling = sibling->child; + + return sibling; +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -1001,6 +997,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; + /* + * Usually we build sched_group by sibling's child sched_domain + * But for machines whose NUMA diameter are 3 or above, we move + * to build sched_group by sibling's proper descendant's child + * domain because sibling's child sched_domain will span out of + * the sched_domain being built as below. + * + * Smallest diameter=3 topology is: + * + * node 0 1 2 3 + * 0: 10 20 30 40 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 40 30 20 10 + * + * 0 --- 1 --- 2 --- 3 + * + * NUMA-3 0-3 N/A N/A 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-2 0-2 0-3 0-3 1-3 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} + * + * NUMA-1 0-1 0-2 1-3 2-3 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} + * + * NUMA-0 0 1 2 3 + * + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the + * group span isn't a subset of the domain span. + */ + if (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), span)) + sibling = find_descended_sibling(sd, sibling); + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; @@ -1008,7 +1039,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); - init_overlap_sched_group(sd, sg); + init_overlap_sched_group(sibling, sg); if (!first) first = sg; -- Gitee From f0fb7c6695a9bd98aa6a08fd0269206472c0aa0d Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Nov 2021 19:35:51 +0800 Subject: [PATCH 031/134] sched/topology: Make sched_init_numa() use a set for the deduplicating sort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.12-rc1 commit 620a6dc40754dc218f5b6389b5d335e9a107fd29 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=620a6dc40754dc218f5b6389b5d335e9a107fd29 Signed-off-by: Yu Changchun ---------------------------------------------------------- The deduplicating sort in sched_init_numa() assumes that the first line in the distance table contains all unique values in the entire table. I've been trying to pen what this exactly means for the topology, but it's not straightforward. 
For instance, topology.c uses this example: node 0 1 2 3 0: 10 20 20 30 1: 20 10 20 20 2: 20 20 10 20 3: 30 20 20 10 0 ----- 1 | / | | / | | / | 2 ----- 3 Which works out just fine. However, if we swap nodes 0 and 1: 1 ----- 0 | / | | / | | / | 2 ----- 3 we get this distance table: node 0 1 2 3 0: 10 20 20 20 1: 20 10 20 30 2: 20 20 10 20 3: 20 30 20 10 Which breaks the deduplicating sort (non-representative first line). In this case this would just be a renumbering exercise, but it so happens that we can have a deduplicating sort that goes through the whole table in O(n²) at the extra cost of a temporary memory allocation (i.e. any form of set). The ACPI spec (SLIT) mentions distances are encoded on 8 bits. Following this, implement the set as a 256-bits bitmap. Should this not be satisfactory (i.e. we want to support 32-bit values), then we'll have to go for some other sparse set implementation. This has the added benefit of letting us allocate just the right amount of memory for sched_domains_numa_distance[], rather than an arbitrary (nr_node_ids + 1). Note: DT binding equivalent (distance-map) decodes distances as 32-bit values. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210122123943.1217-2-valentin.schneider@arm.com Signed-off-by: Jialin Zhang Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/topology.h | 1 + kernel/sched/topology.c | 98 +++++++++++++++++++--------------------- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/include/linux/topology.h b/include/linux/topology.h index ad03df1cc266..7634cd737061 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -48,6 +48,7 @@ int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 +#define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index afe95ede9861..84cf39842efe 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1613,66 +1613,57 @@ static void init_numa_topology_type(void) } } +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) + void sched_init_numa(void) { - int next_distance, curr_distance = node_distance(0, 0); struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* Includes NUMA identity node at level 0. */ - sched_domains_numa_distance[level++] = curr_distance; - sched_domains_numa_levels = level; + unsigned long *distance_map; + int nr_levels = 0; + int i, j; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. 
*/ - next_distance = curr_distance; + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); + if (!distance_map) + return; + + bitmap_zero(distance_map, NR_DISTANCE_VALUES); for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + int distance = node_distance(i, j); - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); - - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { + sched_numa_warn("Invalid distance value range"); + return; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + bitmap_set(distance_map, distance, 1); } + } + /* + * We can now figure out how many unique distance values there are and + * allocate memory accordingly. + */ + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!sched_domains_numa_distance) { + bitmap_free(distance_map); + return; + } + + for (i = 0, j = 0; i < nr_levels; i++, j++) { + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); + sched_domains_numa_distance[i] = j; } + bitmap_free(distance_map); + /* - * 'level' contains the number of unique distances + * 'nr_levels' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1681,15 +1672,15 @@ void sched_init_numa(void) /* * Here, we should temporarily reset sched_domains_numa_levels to 0. * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be + * the array will contain less then 'nr_levels' members. This could be * dangerous when we use it to iterate array sched_domains_numa_masks[][] * in other functions. * - * We reset it to 'level' at the end of this function. + * We reset it to 'nr_levels' at the end of this function. */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -1697,7 +1688,7 @@ void sched_init_numa(void) * Now for each level, construct a mask per node which contains all * CPUs of nodes that are that many hops away from us. 
*/ - for (i = 0; i < level; i++) { + for (i = 0; i < nr_levels; i++) { sched_domains_numa_masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); if (!sched_domains_numa_masks[i]) @@ -1705,12 +1696,17 @@ void sched_init_numa(void) for (j = 0; j < nr_node_ids; j++) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + int k; + if (!mask) return; sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + sched_numa_warn("Node-distance not symmetric"); + if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -1722,7 +1718,7 @@ void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level + 1) * + tl = kzalloc((i + nr_levels) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1745,7 +1741,7 @@ void sched_init_numa(void) /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 1; j < level; i++, j++) { + for (j = 1; j < nr_levels; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1757,8 +1753,8 @@ void sched_init_numa(void) sched_domain_topology = tl; - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + sched_domains_numa_levels = nr_levels; + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); } -- Gitee From e29a80f77a3fbcb052939eae904ed48ca0a786e7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Nov 2021 19:35:52 +0800 Subject: [PATCH 032/134] ia64: ensure proper NUMA distance and possible map initialization mainline inclusion from mainline-v5.13-rc1 commit b22a8f7b4bde4e4ab73b64908ffd5d90ecdcdbfd category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b22a8f7b4bde4e4ab73b64908ffd5d90ecdcdbfd Signed-off-by: Yu Changchun ---------------------------------------------------------- John Paul reported a warning about bogus NUMA distance values spurred by commit: 620a6dc40754 ("sched/topology: Make sched_init_numa() use a set for the deduplicating sort") In this case, the afflicted machine comes up with a reported 256 possible nodes, all of which are 0 distance away from one another. This was previously silently ignored, but is now caught by the aforementioned commit. The culprit is ia64's node_possible_map which remains unchanged from its initialization value of NODE_MASK_ALL. In John's case, the machine doesn't have any SRAT nor SLIT table, but AIUI the possible map remains untouched regardless of what ACPI tables end up being parsed. Thus, !online && possible nodes remain with a bogus distance of 0 (distances \in [0, 9] are "reserved and have no meaning" as per the ACPI spec). Follow x86 / drivers/base/arch_numa's example and set the possible map to the parsed map, which in this case seems to be the online map. 
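To make the interaction concrete: a standalone sketch (illustration only) of the O(n²) bitmap scan introduced by 620a6dc40754, run here on the swapped-node table from the earlier example; an ia64-style bogus distance of 0 would be caught by the same range check:

    #include <stdio.h>

    #define LOCAL_DISTANCE      10
    #define DISTANCE_BITS       8
    #define NR_DISTANCE_VALUES  (1 << DISTANCE_BITS)
    #define N                   4

    /* the "swapped nodes 0 and 1" table whose first row hides distance 30 */
    static const int dist[N][N] = {
        { 10, 20, 20, 20 },
        { 20, 10, 20, 30 },
        { 20, 20, 10, 20 },
        { 20, 30, 20, 10 },
    };

    int main(void)
    {
        unsigned char map[NR_DISTANCE_VALUES / 8] = { 0 };
        int i, j, nr_levels = 0;

        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                int d = dist[i][j];

                /* same range check that flags the bogus ia64 distance 0 */
                if (d < LOCAL_DISTANCE || d >= NR_DISTANCE_VALUES) {
                    printf("invalid distance %d\n", d);
                    return 1;
                }
                map[d / 8] |= 1 << (d % 8);
            }
        }
        /* whole-table scan finds 10, 20 and 30; a first-row-only sort would miss 30 */
        for (i = 0; i < NR_DISTANCE_VALUES; i++)
            if (map[i / 8] & (1 << (i % 8)))
                printf("level %d: distance %d\n", nr_levels++, i);
        return 0;
    }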
Link: http://lore.kernel.org/r/255d6b5d-194e-eb0e-ecdd-97477a534441@physik.fu-berlin.de Link: https://lkml.kernel.org/r/20210318130617.896309-1-valentin.schneider@arm.com Fixes: 620a6dc40754 ("sched/topology: Make sched_init_numa() use a set for the deduplicating sort") Signed-off-by: Valentin Schneider Reported-by: John Paul Adrian Glaubitz Tested-by: John Paul Adrian Glaubitz Tested-by: Sergei Trofimovich Cc: "Peter Zijlstra (Intel)" Cc: Ingo Molnar Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Anatoly Pugachev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Jialin Zhang Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- arch/ia64/kernel/acpi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index a5636524af76..e2af6b172200 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -446,7 +446,8 @@ void __init acpi_numa_fixup(void) if (srat_num_cpus == 0) { node_set_online(0); node_cpuid[0].phys_id = hard_smp_processor_id(); - return; + slit_distance(0, 0) = LOCAL_DISTANCE; + goto out; } /* @@ -489,7 +490,7 @@ void __init acpi_numa_fixup(void) for (j = 0; j < MAX_NUMNODES; j++) slit_distance(i, j) = i == j ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return; + goto out; } memset(numa_slit, -1, sizeof(numa_slit)); @@ -514,6 +515,8 @@ void __init acpi_numa_fixup(void) printk("\n"); } #endif +out: + node_possible_map = node_online_map; } #endif /* CONFIG_ACPI_NUMA */ -- Gitee From 501cfcd826bc0979d940d35886ff3031f9b6656e Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Mon, 15 Nov 2021 19:35:53 +0800 Subject: [PATCH 033/134] block: fix memory leak for mq shared sbitmap maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- If the blk_mq_sched_alloc_tags() -> blk_mq_alloc_rqs() call fails, then we call blk_mq_free_rq_map(). But if BLK_MQ_F_TAG_HCTX_SHARED has been set in hctx->flags, tags->bitmap_tags and tags->breserved_tags will not be released, causing a memory leak. Fixes: 31044c2e5b8de ("blk-mq-sched: Fix blk_mq_sched_alloc_tags() error handling") Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-mq-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 3fea30f0ea27..a3266541bd06 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -522,7 +522,7 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); if (ret) { - blk_mq_free_rq_map(hctx->sched_tags, set->flags); + blk_mq_free_rq_map(hctx->sched_tags, flags); hctx->sched_tags = NULL; } -- Gitee From e30e422336856649e063bec70b16edae0984c342 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Nov 2021 19:35:55 +0800 Subject: [PATCH 034/134] fs: add vfs_parse_fs_param_source() helper mainline inclusion from mainline-5.14-rc2 commit d1d488d813703618f0dd93f0e4c4a05928114aa8 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d1d488d813703618f0dd93f0e4c4a05928114aa8 Signed-off-by: Yu Changchun --------------------------- Add a simple helper that filesystems can use in their parameter parser to parse the "source" parameter.
A few places open-coded this function and that already caused a bug in the cgroup v1 parser that we fixed. Let's make it harder to get this wrong by introducing a helper which performs all necessary checks. Link: https://syzkaller.appspot.com/bug?id=6312526aba5beae046fdae8f00399f87aab48b12 Cc: Christoph Hellwig Cc: Alexander Viro Cc: Dmitry Vyukov Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds Conflicts: include/linux/fs_context.h Signed-off-by: yangerkun Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/fs_context.c | 54 +++++++++++++++++++++++++------------- include/linux/fs_context.h | 2 ++ kernel/cgroup/cgroup-v1.c | 14 ++++------ 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index 4858645ca620..b7e43a780a62 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -79,6 +79,35 @@ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) return -ENOPARAM; } +/** + * vfs_parse_fs_param_source - Handle setting "source" via parameter + * @fc: The filesystem context to modify + * @param: The parameter + * + * This is a simple helper for filesystems to verify that the "source" they + * accept is sane. + * + * Returns 0 on success, -ENOPARAM if this is not "source" parameter, and + * -EINVAL otherwise. In the event of failure, supplementary error information + * is logged. + */ +int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param) +{ + if (strcmp(param->key, "source") != 0) + return -ENOPARAM; + + if (param->type != fs_value_is_string) + return invalf(fc, "Non-string source"); + + if (fc->source) + return invalf(fc, "Multiple sources"); + + fc->source = param->string; + param->string = NULL; + return 0; +} +EXPORT_SYMBOL(vfs_parse_fs_param_source); + /** * vfs_parse_fs_param - Add a single parameter to a superblock config * @fc: The filesystem context to modify @@ -122,15 +151,9 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) /* If the filesystem doesn't take any arguments, give it the * default handling of source. 
*/ - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Legacy: Non-string source"); - if (fc->source) - return invalf(fc, "VFS: Legacy: Multiple sources"); - fc->source = param->string; - param->string = NULL; - return 0; - } + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 5b44b0195a28..6b54982fc5f3 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -139,6 +139,8 @@ extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, extern int generic_parse_monolithic(struct fs_context *fc, void *data); extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); +extern int vfs_parse_fs_param_source(struct fs_context *fc, + struct fs_parameter *param); extern void fc_drop_locked(struct fs_context *fc); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 812dd4ed0129..88b72815aa9d 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -915,15 +915,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "Non-string source"); - if (fc->source) - return invalf(fc, "Multiple sources not supported"); - fc->source = param->string; - param->string = NULL; - return 0; - } + int ret; + + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; -- Gitee From 848a2ff9d89acf7f7bc31e3c8ee418365d4accdc Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 15 Nov 2021 19:35:56 +0800 Subject: [PATCH 035/134] ramfs: fix mount source show for ramfs maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- ramfs_parse_param() does not parse the "source" key and converts -ENOPARAM to 0. This skips vfs_parse_fs_param_source() in vfs_parse_fs_param(), so the mount source for ramfs always shows up as "none". Fix it by parsing "source" in ramfs_parse_param() the way cgroup1_parse_param() does.
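The calling convention that the hunks above (and the ramfs fix below) rely on can be modeled standalone; in this sketch (hypothetical stand-ins, not the kernel API) the helper claims only the "source" key and reports -ENOPARAM for everything else, so the caller can fall through to its own handling:

    #include <stdio.h>
    #include <string.h>

    #define ENOPARAM 1000           /* stand-in for the kernel errno value */

    static const char *source;      /* models fc->source */

    /* Models vfs_parse_fs_param_source(): only ever claims the "source" key. */
    static int parse_source(const char *key, const char *val)
    {
        if (strcmp(key, "source") != 0)
            return -ENOPARAM;
        if (source)
            return -1;              /* "Multiple sources": -EINVAL in the kernel */
        source = val;
        return 0;
    }

    /* Models a parser like ramfs_parse_param() after fs_parse() said -ENOPARAM. */
    static int parse_param(const char *key, const char *val)
    {
        int ret = parse_source(key, val);

        if (ret != -ENOPARAM)
            return ret;             /* handled (0) or rejected (error) */
        return 0;                   /* ramfs ignores genuinely unknown keys */
    }

    int main(void)
    {
        parse_param("source", "mytmpfs");
        printf("mount source = %s\n", source ? source : "none");
        return 0;
    }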
Link: https://lkml.kernel.org/r/20210924091756.1906118-1-yangerkun@huawei.com Signed-off-by: yangerkun Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ramfs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index ee179a81b3da..0163dcff9318 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -193,17 +193,20 @@ static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param) int opt; opt = fs_parse(fc, ramfs_fs_parameters, param, &result); - if (opt < 0) { + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; /* * We might like to report bad mount options here; * but traditionally ramfs has ignored all mount options, * and as it is used as a !CONFIG_SHMEM simple substitute * for tmpfs, better continue to ignore other mount options. */ - if (opt == -ENOPARAM) - opt = 0; - return opt; + return 0; } + if (opt < 0) + return opt; switch (opt) { case Opt_mode: -- Gitee From 5e8716e337144cac62d2f2daa9be8b423c717b03 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 15 Nov 2021 19:36:00 +0800 Subject: [PATCH 036/134] ext4: factor out write end code of inline file mainline inclusion from mainline-5.15-rc4 commit 6984aef59814fb5c47b0e30c56e101186b5ebf8c category: perf issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6984aef59814fb5c47b0e30c56e101186b5ebf8c Signed-off-by: Yu Changchun --------------------------- The write-end procedure for inline_data files is currently folded into the common write-end functions, which makes it hard to follow. Factor it out and do some cleanup. This patch also drops ext4_da_write_inline_data_end() and switches to ext4_write_inline_data_end() instead, because the same error processing is also needed when we fail to write data into an inline entry.
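One rule the factored-out function keeps from the common path is visible in the diff below: a short copy into a page that is not uptodate counts as having copied nothing. A tiny standalone model of just that decision (illustrative, not the ext4 code):

    #include <stdio.h>

    /* A short copy into a non-uptodate page must be treated as copied == 0,
     * since the rest of the page contents are not valid. */
    static unsigned int effective_copied(unsigned int len, unsigned int copied,
                                         int page_uptodate)
    {
        if (copied < len && !page_uptodate)
            return 0;
        return copied;
    }

    int main(void)
    {
        printf("%u\n", effective_copied(4096, 4096, 1));  /* full copy: 4096 */
        printf("%u\n", effective_copied(4096, 100, 0));   /* short + stale: 0 */
        return 0;
    }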
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210716122024.1105856-4-yi.zhang@huawei.com Conflicts: fs/ext4/inline.c Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/ext4.h | 3 -- fs/ext4/inline.c | 128 +++++++++++++++++++++++++---------------------- fs/ext4/inode.c | 73 +++++++++------------------ 3 files changed, 90 insertions(+), 114 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c859ec4544bc..bb3adca53d93 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3437,9 +3437,6 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, unsigned flags, struct page **pagep, void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a96b688a0410..cb42b2245c21 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -729,40 +729,83 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret, no_expand; + handle_t *handle = ext4_journal_current_handle(); + int no_expand; void *kaddr; struct ext4_iloc iloc; + int ret = 0, ret2; if (unlikely(copied < len) && !PageUptodate(page)) - return 0; + copied = 0; - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - return ret; - } + if (likely(copied)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + unlock_page(page); + put_page(page); + ext4_std_error(inode->i_sb, ret); + goto out; + } + ext4_write_lock_xattr(inode, &no_expand); + BUG_ON(!ext4_has_inline_data(inode)); - ext4_write_lock_xattr(inode, &no_expand); - BUG_ON(!ext4_has_inline_data(inode)); + /* + * ei->i_inline_off may have changed since + * ext4_write_begin() called + * ext4_try_to_write_inline_data() + */ + (void) ext4_find_inline_data_nolock(inode); - /* - * ei->i_inline_off may have changed since ext4_write_begin() - * called ext4_try_to_write_inline_data() - */ - (void) ext4_find_inline_data_nolock(inode); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); + ext4_write_unlock_xattr(inode, &no_expand); + brelse(iloc.bh); - ext4_write_unlock_xattr(inode, &no_expand); - brelse(iloc.bh); - mark_inode_dirty(inode); + /* + * It's important to update i_size while still holding page + * lock: page writeout could otherwise come in and zero + * beyond i_size. + */ + ext4_update_inode_size(inode, pos + copied); + } + unlock_page(page); + put_page(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. 
+ */ + if (likely(copied)) + mark_inode_dirty(inode); +out: + /* + * If we didn't copy as much data as expected, we need to trim back + * size of xattr containing inline data. + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); - return copied; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret ? ret : copied; } struct buffer_head * @@ -943,43 +986,6 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, return ret; } -int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - int ret; - - ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - return ret; - } - copied = ret; - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. - */ - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); - unlock_page(page); - put_page(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - mark_inode_dirty(inode); - - return copied; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 16bc65609496..36663a118b6c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1283,23 +1283,14 @@ static int ext4_write_end(struct file *file, loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - ret = 0; - } else - copied = block_write_end(file, mapping, pos, - len, copied, page, fsdata); + + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1320,10 +1311,9 @@ static int ext4_write_end(struct file *file, * ordering of page lock and transaction start for journaling * filesystems. */ - if (i_size_changed || inline_data) + if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); -errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. 
We will have blocks allocated outside @@ -1395,7 +1385,6 @@ static int ext4_journalled_write_end(struct file *file, int partial = 0; unsigned from, to; int size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); @@ -1404,17 +1393,10 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - ret = 0; - } else if (unlikely(copied < len) && !PageUptodate(page)) { + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; ext4_journalled_zero_new_buffers(handle, page, from, to); } else { @@ -1437,13 +1419,12 @@ static int ext4_journalled_write_end(struct file *file, if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - if (size_changed || inline_data) { + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } -errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -3078,7 +3059,7 @@ static int ext4_da_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - int ret = 0, ret2; + int ret; handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; @@ -3089,6 +3070,12 @@ static int ext4_da_write_end(struct file *file, len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; @@ -3108,26 +3095,12 @@ static int ext4_da_write_end(struct file *file, * ext4_da_write_inline_data_end(). */ new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size) { - if (ext4_has_inline_data(inode) || - ext4_da_should_update_i_disksize(page, end)) - ext4_update_i_disksize(inode, new_i_size); - } - - if (write_mode != CONVERT_INLINE_DATA && - ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && - ext4_has_inline_data(inode)) - ret = ext4_da_write_inline_data_end(inode, pos, len, copied, - page); - else - ret = generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - - copied = ret; - ret2 = ext4_journal_stop(handle); - if (unlikely(ret2 && !ret)) - ret = ret2; + if (copied && new_i_size > inode->i_size && + ext4_da_should_update_i_disksize(page, end)) + ext4_update_i_disksize(inode, new_i_size); + copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = ext4_journal_stop(handle); return ret ? 
ret : copied; } -- Gitee From cf86aec227618b09fbfc366f2fdd1f33c4e0e1d1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 15 Nov 2021 19:36:01 +0800 Subject: [PATCH 037/134] ext4: drop unnecessary journal handle in delalloc write mainline inclusion from mainline-5.15-rc4 commit cc883236b79297f6266ca6f4e7f24f3fd3c736c1 category: perf issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cc883236b79297f6266ca6f4e7f24f3fd3c736c1 Signed-off-by: Yu Changchun --------------------------- After we factor out the inline data write procedure from ext4_da_write_end(), we don't need to start a journal handle for either the buffer overwrite case or the append-write case. If we need to update i_disksize, mark_inode_dirty() does start a handle and update the inode buffer. So we can just remove all the journal handle code in the delalloc write procedure. After this patch, we get a lot of performance improvement. Below is the UnixBench comparison, tested on my machine with an 'Intel Xeon Gold 5120' CPU and an NVMe SSD backend. Test cmd: ./Run -c 56 -i 3 fstime fsbuffer fsdisk Before this patch: System Benchmarks Partial Index BASELINE RESULT INDEX File Copy 1024 bufsize 2000 maxblocks 3960.0 422965.0 1068.1 File Copy 256 bufsize 500 maxblocks 1655.0 105077.0 634.9 File Copy 4096 bufsize 8000 maxblocks 5800.0 1429092.0 2464.0 ====== System Benchmarks Index Score (Partial Only) 1186.6 After this patch: System Benchmarks Partial Index BASELINE RESULT INDEX File Copy 1024 bufsize 2000 maxblocks 3960.0 732716.0 1850.3 File Copy 256 bufsize 500 maxblocks 1655.0 184940.0 1117.5 File Copy 4096 bufsize 8000 maxblocks 5800.0 2427152.0 4184.7 ====== System Benchmarks Index Score (Partial Only) 2053.0 Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210716122024.1105856-5-yi.zhang@huawei.com Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 60 +++++-------------------------------------------- 1 file changed, 5 insertions(+), 55 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 36663a118b6c..dd137a1c648f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2916,19 +2916,6 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -/* We always reserve for an inode update; the superblock could be there too */ -static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) -{ - if (likely(ext4_has_feature_large_file(inode->i_sb))) - return 1; - - if (pos + len <= 0x7fffffffULL) - return 1; - - /* We might need to update the superblock to set LARGE_FILE */ - return 2; -} - static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2937,7 +2924,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page *page; pgoff_t index; struct inode *inode = mapping->host; - handle_t *handle; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -2963,41 +2949,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, return 0; } - /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page - * is being written back. So grab it first before we start - * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS.
- */ -retry_grab: +retry: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; - unlock_page(page); - - /* - * With delayed allocation, we don't log the i_disksize update - * if there is delayed block allocation. But we still need - * to journalling the i_disksize update if writes to the end - * of file which has an already mapped buffer. - */ -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_da_write_credits(inode, pos, len)); - if (IS_ERR(handle)) { - put_page(page); - return PTR_ERR(handle); - } - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); - ext4_journal_stop(handle); - goto retry_grab; - } /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); @@ -3009,20 +2965,18 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, #endif if (ret < 0) { unlock_page(page); - ext4_journal_stop(handle); + put_page(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. + * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - - put_page(page); + goto retry; return ret; } @@ -3059,8 +3013,6 @@ static int ext4_da_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - int ret; - handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; @@ -3099,9 +3051,7 @@ static int ext4_da_write_end(struct file *file, ext4_da_should_update_i_disksize(page, end)) ext4_update_i_disksize(inode, new_i_size); - copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - ret = ext4_journal_stop(handle); - return ret ? ret : copied; + return generic_write_end(file, mapping, pos, len, copied, page, fsdata); } /* -- Gitee From 382e8f3d498541f32b002802deb556512c44b77c Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 15 Nov 2021 19:36:03 +0800 Subject: [PATCH 038/134] powerpc/powernv/pci: fix a RCU-list lock mainline inclusion from mainline-v5.12-rc1 commit c9790fb5df461c91d3fff1d864c1acb8baf5ad5c category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c9790fb5df461c91d3fff1d864c1acb8baf5ad5c Signed-off-by: Yu Changchun --------------------------- It is unsafe to traverse tbl->it_group_list without the RCU read lock. WARNING: suspicious RCU usage 5.7.0-rc4-next-20200508 #1 Not tainted ----------------------------- arch/powerpc/platforms/powernv/pci-ioda-tce.c:355 RCU-list traversed in non-reader section!! 
other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by qemu-kvm/4305: #0: c000000bc3fe6988 (&container->group_lock){++++}-{3:3}, at: vfio_fops_unl_ioctl+0x108/0x410 [vfio] #1: c00800000fcc7400 (&vfio.iommu_drivers_lock){+.+.}-{3:3}, at: vfio_fops_unl_ioctl+0x148/0x410 [vfio] #2: c000000bc3fe4d68 (&container->lock){+.+.}-{3:3}, at: tce_iommu_attach_group+0x3c/0x4f0 [vfio_iommu_spapr_tce] stack backtrace: CPU: 4 PID: 4305 Comm: qemu-kvm Not tainted 5.7.0-rc4-next-20200508 #1 Call Trace: [c0000010f29afa60] [c0000000007154c8] dump_stack+0xfc/0x174 (unreliable) [c0000010f29afab0] [c0000000001d8ff0] lockdep_rcu_suspicious+0x140/0x164 [c0000010f29afb30] [c0000000000dae2c] pnv_pci_unlink_table_and_group+0x11c/0x200 [c0000010f29afb70] [c0000000000d4a34] pnv_pci_ioda2_unset_window+0xc4/0x190 [c0000010f29afbf0] [c0000000000d4b4c] pnv_ioda2_take_ownership+0x4c/0xd0 [c0000010f29afc30] [c00800000fd60ee0] tce_iommu_attach_group+0x2c8/0x4f0 [vfio_iommu_spapr_tce] [c0000010f29afcd0] [c00800000fcc11a0] vfio_fops_unl_ioctl+0x238/0x410 [vfio] [c0000010f29afd50] [c0000000005430a8] ksys_ioctl+0xd8/0x130 [c0000010f29afda0] [c000000000543128] sys_ioctl+0x28/0x40 [c0000010f29afdc0] [c000000000038af4] system_call_exception+0x114/0x1e0 [c0000010f29afe20] [c00000000000c8f0] system_call_common+0xf0/0x278 Signed-off-by: Qian Cai Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200510051347.1906-1-cai@lca.pw Signed-off-by: Chen Lifu Reviewed-by: Zhang Jianhua Reviewed-by: He Ying Reviewed-by: Yihang Xu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 5218f5da2737..30551bbd7988 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -380,6 +380,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, /* Remove link to a group from table's list of attached groups */ found = false; + + rcu_read_lock(); list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { if (tgl->table_group == table_group) { list_del_rcu(&tgl->next); @@ -388,6 +390,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, break; } } + rcu_read_unlock(); + if (WARN_ON(!found)) return; -- Gitee From f75e5a6455f0749885ee4cf500c5fa29d5ba50a8 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Mon, 15 Nov 2021 19:36:04 +0800 Subject: [PATCH 039/134] powerpc/numa: Update cpu_cpu_map on CPU online/offline mainline inclusion from mainline-v5.15-rc1 commit 9a245d0e1f006bc7ccf0285d0d520ed304d00c4a category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9a245d0e1f006bc7ccf0285d0d520ed304d00c4a Signed-off-by: Yu Changchun --------------------------- cpu_cpu_map holds all the CPUs in the DIE. However in PowerPC, when onlining/offlining of CPUs, this mask doesn't get updated. This mask is however updated when CPUs are added/removed. So when both operations like online/offline of CPUs and adding/removing of CPUs are done simultaneously, then cpumaps end up broken. 
WARNING: CPU: 13 PID: 1142 at kernel/sched/topology.c:898 build_sched_domains+0xd48/0x1720 Modules linked in: rpadlpar_io rpaphp mptcp_diag xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag bonding tls nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set rfkill nf_tables nfnetlink pseries_rng xts vmx_crypto uio_pdrv_genirq uio binfmt_misc ip_tables xfs libcrc32c dm_service_time sd_mod t10_pi sg ibmvfc scsi_transport_fc ibmveth dm_multipath dm_mirror dm_region_hash dm_log dm_mod fuse CPU: 13 PID: 1142 Comm: kworker/13:2 Not tainted 5.13.0-rc6+ #28 Workqueue: events cpuset_hotplug_workfn NIP: c0000000001caac8 LR: c0000000001caac4 CTR: 00000000007088ec REGS: c00000005596f220 TRAP: 0700 Not tainted (5.13.0-rc6+) MSR: 8000000000029033 CR: 48828222 XER: 00000009 CFAR: c0000000001ea698 IRQMASK: 0 GPR00: c0000000001caac4 c00000005596f4c0 c000000001c4a400 0000000000000036 GPR04: 00000000fffdffff c00000005596f1d0 0000000000000027 c0000018cfd07f90 GPR08: 0000000000000023 0000000000000001 0000000000000027 c0000018fe68ffe8 GPR12: 0000000000008000 c00000001e9d1880 c00000013a047200 0000000000000800 GPR16: c000000001d3c7d0 0000000000000240 0000000000000048 c000000010aacd18 GPR20: 0000000000000001 c000000010aacc18 c00000013a047c00 c000000139ec2400 GPR24: 0000000000000280 c000000139ec2520 c000000136c1b400 c000000001c93060 GPR28: c00000013a047c20 c000000001d3c6c0 c000000001c978a0 000000000000000d NIP [c0000000001caac8] build_sched_domains+0xd48/0x1720 LR [c0000000001caac4] build_sched_domains+0xd44/0x1720 Call Trace: [c00000005596f4c0] [c0000000001caac4] build_sched_domains+0xd44/0x1720 (unreliable) [c00000005596f670] [c0000000001cc5ec] partition_sched_domains_locked+0x3ac/0x4b0 [c00000005596f710] [c0000000002804e4] rebuild_sched_domains_locked+0x404/0x9e0 [c00000005596f810] [c000000000283e60] rebuild_sched_domains+0x40/0x70 [c00000005596f840] [c000000000284124] cpuset_hotplug_workfn+0x294/0xf10 [c00000005596fc60] [c000000000175040] process_one_work+0x290/0x590 [c00000005596fd00] [c0000000001753c8] worker_thread+0x88/0x620 [c00000005596fda0] [c000000000181704] kthread+0x194/0x1a0 [c00000005596fe10] [c00000000000ccec] ret_from_kernel_thread+0x5c/0x70 Instruction dump: 485af049 60000000 2fa30800 409e0028 80fe0000 e89a00f8 e86100e8 38da0120 7f88e378 7ce53b78 4801fb91 60000000 <0fe00000> 39000000 38e00000 38c00000 Fix this by updating cpu_cpu_map aka cpumask_of_node() on every CPU online/offline. 
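As a hedged illustration of the invariant this change restores (a
minimal user-space model with made-up masks and a fixed CPU-to-node
table, not the kernel's cpumask API): the node mask must be updated on
every online/offline event, or a later topology rebuild sees CPUs that
are no longer there.

    #include <stdio.h>

    #define MAX_CPUS 8

    /* bit i set => CPU i is currently online in that node */
    static unsigned int node_mask[2];
    static const int cpu_node[MAX_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };

    static void online_cpu(int cpu)
    {
        node_mask[cpu_node[cpu]] |= 1u << cpu;
    }

    static void offline_cpu(int cpu)
    {
        /* without this update the mask goes stale */
        node_mask[cpu_node[cpu]] &= ~(1u << cpu);
    }

    int main(void)
    {
        online_cpu(4);
        online_cpu(5);
        offline_cpu(5);
        printf("node 1 mask: 0x%x\n", node_mask[1]); /* 0x10: CPU 4 only */
        return 0;
    }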
Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-5-srikar@linux.vnet.ibm.com Signed-off-by: Chen Lifu Reviewed-by: Zhang Jianhua Reviewed-by: Linruizhe Reviewed-by: He Ying Reviewed-by: Yihang Xu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- arch/powerpc/include/asm/topology.h | 13 +++++++++++++ arch/powerpc/kernel/smp.c | 3 +++ arch/powerpc/mm/numa.c | 7 ++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 3beeb030cd78..424635b24b52 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -65,6 +65,11 @@ static inline int early_cpu_to_node(int cpu) int of_drconf_to_nid_single(struct drmem_lmb *lmb); +extern void map_cpu_to_node(int cpu, int node); +#ifdef CONFIG_HOTPLUG_CPU +extern void unmap_cpu_from_node(unsigned long cpu); +#endif /* CONFIG_HOTPLUG_CPU */ + #else static inline int early_cpu_to_node(int cpu) { return 0; } @@ -93,6 +98,14 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) return first_online_node; } + +#ifdef CONFIG_SMP +static inline void map_cpu_to_node(int cpu, int node) {} +#ifdef CONFIG_HOTPLUG_CPU +static inline void unmap_cpu_from_node(unsigned long cpu) {} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ + #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 452cbf98bfd7..b7bed52a5a73 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1300,6 +1300,8 @@ static void remove_cpu_from_masks(int cpu) struct cpumask *(*mask_fn)(int) = cpu_sibling_mask; int i; + unmap_cpu_from_node(cpu); + if (shared_caches) mask_fn = cpu_l2_cache_mask; @@ -1384,6 +1386,7 @@ static void add_cpu_to_masks(int cpu) * This CPU will not be in the online mask yet so we need to manually * add it to it's own thread sibling mask. 
*/ + map_cpu_to_node(cpu, cpu_to_node(cpu)); cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(cpu)); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 094a1076fd1f..23bfe00811ae 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -137,7 +137,7 @@ static void reset_numa_cpu_lookup_table(void) numa_cpu_lookup_table[cpu] = -1; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); @@ -148,7 +148,7 @@ static void map_cpu_to_node(int cpu, int node) } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; @@ -598,9 +598,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu) static int ppc_numa_cpu_dead(unsigned int cpu) { -#ifdef CONFIG_HOTPLUG_CPU - unmap_cpu_from_node(cpu); -#endif return 0; } -- Gitee From c9ed5be0f1a10e6b2c40330f2af4f99b4d286983 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 Nov 2021 19:36:05 +0800 Subject: [PATCH 040/134] mm, vmscan: guarantee drop_slab_node() termination mainline inclusion from mainline-v5.15-rc1 commit 1399af7e54896c774d67f1c1acc491b07149421d category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1399af7e54896c774d67f1c1acc491b07149421d Signed-off-by: Yu Changchun ----------------------------------------------- drop_slab_node() is called as part of echo 2>/proc/sys/vm/drop_caches operation. It iterates over all memcgs and calls shrink_slab() which in turn iterates over all slab shrinkers. Freed objects are counted and as long as the total number of freed objects from all memcgs and shrinkers is higher than 10, drop_slab_node() loops for another full memcgs*shrinkers iteration. This arbitrary constant threshold of 10 can result in effectively an infinite loop on a system with large number of memcgs and/or parallel activity that allocates new objects. This has been reported previously by Chunxin Zang [1] and recently by our customer. The previous report [1] has resulted in commit 069c411de40a ("mm/vmscan: fix infinite loop in drop_slab_node") which added a check for signals allowing the user to terminate the command writing to drop_caches. At the time it was also considered to make the threshold grow with each iteration to guarantee termination, but such patch hasn't been formally proposed yet. This patch implements the dynamically growing threshold. At first iteration it's enough to free one object to continue, and this threshold effectively doubles with each iteration. Our customer's feedback was positive. There is always a risk that this change will result on some system in a previously terminating drop_caches operation to terminate sooner and free fewer objects. Ideally the semantics would guarantee freeing all freeable objects that existed at the moment of starting the operation, while not looping forever for newly allocated objects, but that's not feasible to track. In the less ideal solution based on thresholds, arguably the termination guarantee is more important than the exhaustiveness guarantee. If there are reports of large regression wrt being exhaustive, we can tune how fast the threshold grows. 
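To see why the loop now terminates, here is a minimal user-space model
of the new loop condition (the constant per-pass reclaim rate is an
assumption made purely for illustration):

    #include <stdio.h>

    int main(void)
    {
        unsigned long freed_per_pass = 1000; /* old "> 10" check never ends */
        unsigned long freed;
        int shift = 0, passes = 0;

        do {
            freed = freed_per_pass;
            passes++;
        } while ((freed >> shift++) > 1);

        /* threshold doubles each pass: 1000 >> 9 == 1, so 10 passes */
        printf("terminated after %d passes\n", passes);
        return 0;
    }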
[1] https://lore.kernel.org/lkml/20200909152047.27905-1-zangchunxin@bytedance.com/T/#u

[vbabka@suse.cz: avoid undefined shift behaviour]
  Link: https://lkml.kernel.org/r/2f034e6f-a753-550a-f374-e4e23899d3d5@suse.cz

Link: https://lkml.kernel.org/r/20210818152239.25502-1-vbabka@suse.cz
Signed-off-by: Vlastimil Babka
Reported-by: Chunxin Zang
Cc: Muchun Song
Cc: Chris Down
Cc: Michal Hocko
Cc: Matthew Wilcox
Cc: Vlastimil Babka
Cc: Kefeng Wang
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Liu Shixin
Reviewed-by: Kefeng Wang
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 mm/vmscan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ebaa10d4dcad..9f292132ed88 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -689,6 +689,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 void drop_slab_node(int nid)
 {
 	unsigned long freed;
+	int shift = 0;
 
 	do {
 		struct mem_cgroup *memcg = NULL;
@@ -701,7 +702,7 @@ void drop_slab_node(int nid)
 		do {
 			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
 		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
-	} while (freed > 10);
+	} while ((freed >> shift++) > 1);
 }
 
 void drop_slab(void)

-- Gitee

From e60d685e36f102c405183d2dbac196ba658cfb7f Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Mon, 15 Nov 2021 19:36:06 +0800
Subject: [PATCH 041/134] mm: memcontrol: set the correct memcg swappiness restriction

mainline inclusion
from mainline-5.15-rc1
commit 37bc3cb9bbef86d1ddbbc789e55b588c8a2cac26
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=37bc3cb9bbef86d1ddbbc789e55b588c8a2cac26

Signed-off-by: Yu Changchun

--------------------------------

Commit c843966c556d ("mm: allow swappiness that prefers reclaiming anon
over the file workingset") extended the swappiness range so that swap
can be preferred on some systems. We should also relax the memcg
swappiness restriction accordingly, so that a memcg can be made
swap-preferred as well.
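A sketch of the relaxed bound (a hypothetical user-space mirror of the
check in mem_cgroup_swappiness_write(), not the kernel code itself):

    #include <errno.h>
    #include <stdio.h>

    static int set_swappiness(unsigned long val)
    {
        if (val > 200)      /* was: val > 100 */
            return -EINVAL;
        return 0;
    }

    int main(void)
    {
        printf("150 -> %d\n", set_swappiness(150)); /* 0: now accepted */
        printf("250 -> %d\n", set_swappiness(250)); /* -EINVAL: still rejected */
        return 0;
    }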
Link: https://lkml.kernel.org/r/d77469b90c45c49953ccbc51e54a1d465bc18f70.1627626255.git.baolin.wang@linux.alibaba.com Fixes: c843966c556d ("mm: allow swappiness that prefers reclaiming anon over the file workingset") Signed-off-by: Baolin Wang Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin Reviewed-by: Kefeng Wang Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00156aebbcad..167169b3907d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4184,7 +4184,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val > 100) + if (val > 200) return -EINVAL; if (css->parent) -- Gitee From 6a07e27e948035e4b75ab802abb26b4c33bfc760 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 3 Dec 2021 18:15:16 +0800 Subject: [PATCH 042/134] hugetlb: before freeing hugetlb page set dtor to appropriate value mainline inclusion from mainline-v5.15-rc1 commit e32d20c0c88b1cd0a44f882c4f0eb2f536363d1b category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e32d20c0c88b1cd0a44f882c4f0eb2f536363d1b Signed-off-by: Yu Changchun ---------------------------------------------------------------------- When removing a hugetlb page from the pool the ref count is set to one (as the free page has no ref count) and compound page destructor is set to NULL_COMPOUND_DTOR. Since a subsequent call to free the hugetlb page will call __free_pages for non-gigantic pages and free_gigantic_page for gigantic pages the destructor is not used. However, consider the following race with code taking a speculative reference on the page: Thread 0 Thread 1 -------- -------- remove_hugetlb_page set_page_refcounted(page); set_compound_page_dtor(page, NULL_COMPOUND_DTOR); get_page_unless_zero(page) __update_and_free_page __free_pages(page, huge_page_order(h)); /* Note that __free_pages() will simply drop the reference to the page. */ put_page(page) __put_compound_page() destroy_compound_page NULL_COMPOUND_DTOR BUG: kernel NULL pointer dereference, address: 0000000000000000 To address this race, set the dtor to the normal compound page dtor for non-gigantic pages. The dtor for gigantic pages does not matter as gigantic pages are changed from a compound page to 'just a group of pages' before freeing. Hence, the destructor is not used. 
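The heart of the fix, as a hedged mock-up (plain user space; the mock
'page' below stands in for the real compound page and its refcount):
if a speculative reference can delay the final free, the destructor
slot must still hold a valid handler when the last reference drops.

    #include <stdio.h>

    struct mock_page {
        int refcount;
        void (*dtor)(void);   /* NULL here is the crash in the changelog */
    };

    static void compound_dtor(void)
    {
        puts("freed via compound dtor");
    }

    static void put_page(struct mock_page *p)
    {
        if (--p->refcount == 0)
            p->dtor();
    }

    int main(void)
    {
        struct mock_page p = { .refcount = 1, .dtor = compound_dtor };

        p.refcount++;    /* racing get_page_unless_zero() */
        put_page(&p);    /* __free_pages() just drops a reference */
        put_page(&p);    /* last ref dropped: dtor must be valid by now */
        return 0;
    }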
Link: https://lkml.kernel.org/r/20210809184832.18342-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Michal Hocko Cc: Oscar Salvador Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Mina Almasry Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Wandun Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai Conflicts: mm/hugetlb.c [use update_and_free_page instead of remove_hugetlb_page] Signed-off-by: Yu Changchun --- mm/hugetlb.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 17679e80fbc4..4ecf9a0622df 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1353,9 +1353,26 @@ static void update_and_free_page(struct hstate *h, struct page *page) } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + /* + * Very subtle + * + * For non-gigantic pages set the destructor to the normal compound + * page dtor. This is needed in case someone takes an additional + * temporary ref to the page, and freeing is delayed until they drop + * their reference. + * + * For gigantic pages set the destructor to the null dtor. This + * destructor will never be called. Before freeing the gigantic + * page destroy_compound_gigantic_page will turn the compound page + * into a simple group of pages. After this the destructor does not + * apply. + * + * This handles the case where more than one ref is held when and + * after update_and_free_page is called. + */ set_page_refcounted(page); if (hstate_is_gigantic(h)) { + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); /* * Temporarily drop the hugetlb_lock, because * we might block in free_gigantic_page(). @@ -1365,6 +1382,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) free_gigantic_page(page, huge_page_order(h)); spin_lock(&hugetlb_lock); } else { + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); __free_pages(page, huge_page_order(h)); } } -- Gitee From 0f7a6061e38f45eb537121453a27d7fe9f428052 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 15 Nov 2021 19:39:01 +0800 Subject: [PATCH 043/134] sched/topology: Fix sched_domain_topology_level alloc in sched_init_numa() mainline inclusion from mainline-v5.12-rc1 commit 71e5f6644fb2f3304fcb310145ded234a37e7cc1 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=71e5f6644fb2f3304fcb310145ded234a37e7cc1 Signed-off-by: Yu Changchun ---------------------------------------------------------- Commit "sched/topology: Make sched_init_numa() use a set for the deduplicating sort" allocates 'i + nr_levels (level)' instead of 'i + nr_levels + 1' sched_domain_topology_level. This led to an Oops (on Arm64 juno with CONFIG_SCHED_DEBUG): sched_init_domains build_sched_domains() __free_domain_allocs() __sdt_free() { ... for_each_sd_topology(tl) ... sd = *per_cpu_ptr(sdd->sd, j); <-- ... 
  }

Signed-off-by: Dietmar Eggemann
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Tested-by: Vincent Guittot
Tested-by: Barry Song
Link: https://lkml.kernel.org/r/6000e39e-7d28-c360-9cd6-8798fd22a9bf@arm.com
Fixes: 620a6dc40754 ("sched/topology: Make sched_init_numa() use a set for the deduplicating sort")
Signed-off-by: Jialin Zhang
Reviewed-by: Cheng Jian
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 kernel/sched/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 84cf39842efe..004e9505f7ad 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1718,7 +1718,7 @@ void sched_init_numa(void)
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);
 
-	tl = kzalloc((i + nr_levels) *
+	tl = kzalloc((i + nr_levels + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;

-- Gitee

From ffcd1a14601969fed352e2f89da2432ea1111d6b Mon Sep 17 00:00:00 2001
From: Li Xinhai
Date: Mon, 15 Nov 2021 19:40:32 +0800
Subject: [PATCH 044/134] mm: rmap: explicitly reset vma->anon_vma in unlink_anon_vmas()

mainline inclusion
from mainline-v5.12-rc1
commit ee8ab1903e3d912d8f10bedbf96c3b6a1c8cbede
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ee8ab1903e3d912d8f10bedbf96c3b6a1c8cbede

Signed-off-by: Yu Changchun

-------------------------------------------------

If the vma will continue to be used after unlinking its relevant
anon_vma, we need to reset the vma->anon_vma pointer to NULL. Then,
when a fault later happens within this vma, a new anon_vma will be
prepared. This way, the vma will only be checked for reverse mapping
of pages which were faulted in after the unlink_anon_vmas call.

Currently, the mremap with MREMAP_DONTUNMAP scenario continues to use
the vma after moving its page table entries to a new vma. In all other
scenarios, the vma itself is freed after the call to unlink_anon_vmas.

Link: https://lkml.kernel.org/r/20210119075126.3513154-1-lixinhai.lxh@gmail.com
Signed-off-by: Li Xinhai
Cc: Andrea Arcangeli
Cc: Brian Geffon
Cc: Kirill A. Shutemov
Cc: Lokesh Gidra
Cc: Minchan Kim
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Nanyong Sun
Reviewed-by: tong tiangen
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 mm/rmap.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 14f84f70c557..cdf549f6f617 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -413,8 +413,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
 		list_del(&avc->same_vma);
 		anon_vma_chain_free(avc);
 	}
-	if (vma->anon_vma)
+	if (vma->anon_vma) {
 		vma->anon_vma->degree--;
+
+		/*
+		 * vma would still be needed after unlink, and anon_vma will be prepared
+		 * when handle fault.
+		 */
+		vma->anon_vma = NULL;
+	}
 	unlock_anon_vma_root(root);
 
 	/*

-- Gitee

From 9c440f6bb3ff9994fb38773aca15b25f64edb7af Mon Sep 17 00:00:00 2001
From: Zhang Yi
Date: Mon, 15 Nov 2021 19:40:33 +0800
Subject: [PATCH 045/134] quota: check block number when reading the block in quota file

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

The block number in the quota tree on disk should be smaller than the
v2_disk_dqinfo.dqi_blocks.
If the quota file was corrupted, we may be allocating an 'allocated' block and that would lead to a loop in a tree, which will probably trigger oops later. This patch adds a check for the block number in the quota tree to prevent such potential issue. Link: https://lore.kernel.org/r/20211008093821.1001186-2-yi.zhang@huawei.com Signed-off-by: Zhang Yi Cc: stable@kernel.org Signed-off-by: Jan Kara Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/quota/quota_tree.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c5562c871c8b..c459ac0cb2c7 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -488,6 +488,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + newblk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); newblk = 0; @@ -587,6 +594,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; + if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + blk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); else -- Gitee From 76b5c1db4b91ff66910b8240e68babe2d0d708b7 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 15 Nov 2021 19:40:34 +0800 Subject: [PATCH 046/134] quota: correct error number in free_dqentry() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Fix the error path in free_dqentry(), pass out the error number if the block to free is not correct. Fixes: 1ccd14b9c271 ("quota: Split off quota tree handling into a separate file") Link: https://lore.kernel.org/r/20211008093821.1001186-3-yi.zhang@huawei.com Signed-off-by: Zhang Yi Cc: stable@kernel.org Signed-off-by: Jan Kara Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/quota/quota_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c459ac0cb2c7..1a188fbdf34e 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -423,6 +423,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, quota_error(dquot->dq_sb, "Quota structure has offset to " "other block (%u) than it should (%u)", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + ret = -EIO; goto out_buf; } ret = read_blk(info, blk, buf); -- Gitee From 223ad2904adcf704fe7694af2bc068404321eacb Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 15 Nov 2021 19:40:35 +0800 Subject: [PATCH 047/134] ext4: check for out-of-order index extents in ext4_valid_extent_entries() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- After commit 5946d089379a ("ext4: check for overlapping extents in ext4_valid_extent_entries()"), we can check out the overlapping extent entry in leaf extent blocks. 
But an out-of-order extent entry in an index extent block could also
trigger bad things if the filesystem is inconsistent. So this patch
adds a check to detect out-of-order index extents and returns an
error.

Signed-off-by: Zhang Yi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20210908120850.4012324-2-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o
Reviewed-by: Yang Erkun
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ext4/extents.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aa4d74f9d162..1d09a4c295f7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -356,6 +356,9 @@ static int ext4_valid_extent_entries(struct inode *inode,
 				ext4_fsblk_t *pblk, int depth)
 {
 	unsigned short entries;
+	ext4_lblk_t lblock = 0;
+	ext4_lblk_t prev = 0;
+
 	if (eh->eh_entries == 0)
 		return 1;
 
@@ -364,31 +367,35 @@ static int ext4_valid_extent_entries(struct inode *inode,
 	if (depth == 0) {
 		/* leaf entries */
 		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
-		ext4_lblk_t lblock = 0;
-		ext4_lblk_t prev = 0;
-		int len = 0;
 		while (entries) {
 			if (!ext4_valid_extent(inode, ext))
 				return 0;
 
 			/* Check for overlapping extents */
 			lblock = le32_to_cpu(ext->ee_block);
-			len = ext4_ext_get_actual_len(ext);
 			if ((lblock <= prev) && prev) {
 				*pblk = ext4_ext_pblock(ext);
 				return 0;
 			}
+			prev = lblock + ext4_ext_get_actual_len(ext) - 1;
 			ext++;
 			entries--;
-			prev = lblock + len - 1;
 		}
 	} else {
 		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
 		while (entries) {
 			if (!ext4_valid_extent_idx(inode, ext_idx))
 				return 0;
+
+			/* Check for overlapping index extents */
+			lblock = le32_to_cpu(ext_idx->ei_block);
+			if ((lblock <= prev) && prev) {
+				*pblk = ext4_idx_pblock(ext_idx);
+				return 0;
+			}
 			ext_idx++;
 			entries--;
+			prev = lblock;
 		}
 	}
 	return 1;

-- Gitee

From 2472299e578d8e5540d9384d97f31cc8bb855b20 Mon Sep 17 00:00:00 2001
From: Zhang Yi
Date: Mon, 15 Nov 2021 19:40:36 +0800
Subject: [PATCH 048/134] ext4: check for inconsistent extents between index and leaf block

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

Now we can detect overlapping extents in leaf blocks and out-of-order
index extents in index blocks. But the .ee_block in the first extent
of a leaf block should also be equal to the .ei_block in its parent
index extent entry. This patch adds a check to verify this consistency
between the index and leaf block.
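The invariant being enforced, reduced to a hedged user-space check
(simplified structs; the field names follow the on-disk ext4 extent
format, but nothing else is taken from the real code):

    #include <stdint.h>
    #include <stdio.h>

    struct idx_entry  { uint32_t ei_block; };  /* parent index entry */
    struct leaf_entry { uint32_t ee_block; };  /* first entry of child block */

    /* The child block must start at the logical block its parent records. */
    static int child_consistent(const struct idx_entry *idx,
                                const struct leaf_entry *first)
    {
        return idx->ei_block == first->ee_block;
    }

    int main(void)
    {
        struct idx_entry  idx  = { .ei_block = 100 };
        struct leaf_entry good = { .ee_block = 100 };
        struct leaf_entry bad  = { .ee_block = 96 };

        printf("good: %d\n", child_consistent(&idx, &good)); /* 1 */
        printf("bad:  %d\n", child_consistent(&idx, &bad));  /* 0 -> corrupted */
        return 0;
    }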
Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20210908120850.4012324-3-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/extents.c | 59 +++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1d09a4c295f7..39ea4f0b240c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -353,7 +353,8 @@ static int ext4_valid_extent_idx(struct inode *inode, static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, - ext4_fsblk_t *pblk, int depth) + ext4_lblk_t lblk, ext4_fsblk_t *pblk, + int depth) { unsigned short entries; ext4_lblk_t lblock = 0; @@ -367,6 +368,14 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + + /* + * The logical block in the first entry should equal to + * the number in the index block. + */ + if (depth != ext_depth(inode) && + lblk != le32_to_cpu(ext->ee_block)) + return 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; @@ -383,6 +392,14 @@ static int ext4_valid_extent_entries(struct inode *inode, } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); + + /* + * The logical block in the first entry should equal to + * the number in the parent index block. + */ + if (depth != ext_depth(inode) && + lblk != le32_to_cpu(ext_idx->ei_block)) + return 0; while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; @@ -403,7 +420,7 @@ static int ext4_valid_extent_entries(struct inode *inode, static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, - int depth, ext4_fsblk_t pblk) + int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; @@ -429,7 +446,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } - if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) { + if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } @@ -459,7 +476,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, } #define ext4_ext_check(inode, eh, depth, pblk) \ - __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) int ext4_ext_check_inode(struct inode *inode) { @@ -492,16 +509,18 @@ static void ext4_cache_extents(struct inode *inode, static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, - struct inode *inode, ext4_fsblk_t pblk, int depth, - int flags) + struct inode *inode, struct ext4_extent_idx *idx, + int depth, int flags) { struct buffer_head *bh; int err; gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; + ext4_fsblk_t pblk; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; + pblk = ext4_idx_pblock(idx); bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -514,8 +533,8 @@ __read_extent_tree_block(const char *function, unsigned int line, } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; - err = __ext4_ext_check(function, line, inode, - ext_block_hdr(bh), depth, pblk); + err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), + depth, pblk, 
le32_to_cpu(idx->ei_block));
 	if (err)
 		goto errout;
 	set_buffer_verified(bh);
@@ -533,8 +552,8 @@ __read_extent_tree_block(const char *function, unsigned int line,
 
 }
 
-#define read_extent_tree_block(inode, pblk, depth, flags) \
-	__read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+#define read_extent_tree_block(inode, idx, depth, flags) \
+	__read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
 				 (depth), (flags))
 
 /*
@@ -584,8 +603,7 @@ int ext4_ext_precache(struct inode *inode)
 				i--;
 				continue;
 			}
-			bh = read_extent_tree_block(inode,
-						    ext4_idx_pblock(path[i].p_idx++),
+			bh = read_extent_tree_block(inode, path[i].p_idx++,
 						    depth - i - 1,
 						    EXT4_EX_FORCE_CACHE);
 			if (IS_ERR(bh)) {
@@ -890,8 +908,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_depth = i;
 		path[ppos].p_ext = NULL;
 
-		bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
-					    flags);
+		bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
 		if (IS_ERR(bh)) {
 			ret = PTR_ERR(bh);
 			goto err;
@@ -1496,7 +1513,6 @@ static int ext4_ext_search_right(struct inode *inode,
 	struct ext4_extent_header *eh;
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
-	ext4_fsblk_t block;
 	int depth;	/* Note, NOT eh_depth; depth from top of tree */
 	int ee_len;
 
@@ -1563,20 +1579,17 @@ static int ext4_ext_search_right(struct inode *inode,
 	 * follow it and find the closest allocated
 	 * block to the right */
 	ix++;
-	block = ext4_idx_pblock(ix);
 	while (++depth < path->p_depth) {
 		/* subtract from p_depth to get proper eh_depth */
-		bh = read_extent_tree_block(inode, block,
-					    path->p_depth - depth, 0);
+		bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
 		if (IS_ERR(bh))
 			return PTR_ERR(bh);
 		eh = ext_block_hdr(bh);
 		ix = EXT_FIRST_INDEX(eh);
-		block = ext4_idx_pblock(ix);
 		put_bh(bh);
 	}
 
-	bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+	bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
 	if (IS_ERR(bh))
 		return PTR_ERR(bh);
 	eh = ext_block_hdr(bh);
@@ -2955,9 +2968,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 			ext_debug(inode, "move to level %d (block %llu)\n",
 				  i + 1, ext4_idx_pblock(path[i].p_idx));
 			memset(path + i + 1, 0, sizeof(*path));
-			bh = read_extent_tree_block(inode,
-				ext4_idx_pblock(path[i].p_idx), depth - i - 1,
-				EXT4_EX_NOCACHE);
+			bh = read_extent_tree_block(inode, path[i].p_idx,
+						    depth - i - 1,
+						    EXT4_EX_NOCACHE);
 			if (IS_ERR(bh)) {
 				/* should we reset i_size? */
 				err = PTR_ERR(bh);

-- Gitee

From ca97a933d7c51e5f9b453c7a7622756868945091 Mon Sep 17 00:00:00 2001
From: Zhang Yi
Date: Mon, 15 Nov 2021 19:40:37 +0800
Subject: [PATCH 049/134] ext4: prevent partial update of the extent blocks

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

Most error paths of the current extent-updating operations do not roll
back partial updates properly when something bad happens (e.g. in
ext4_ext_insert_extent()). So we may be left with an inconsistent
extents tree if the journal has been aborted due to an IO error, which
may well lead to a BUG_ON later when we access these extent entries in
"errors=continue" mode.

This patch drops the extent buffer's verified flag before updating the
contents in ext4_ext_get_access(), and sets it again after updating in
__ext4_ext_dirty(). After this patch we can force a re-check of the
extent buffer if the extents tree update was interrupted, making sure
the extents are consistent.
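The update pattern, in a hedged user-space sketch (a bool stands in for
the buffer's verified bit, and the I/O failure flag is invented for the
example): clear the flag before mutating, set it again only after the
update completes, so an interrupted update forces re-verification.

    #include <stdbool.h>
    #include <stdio.h>

    struct buf { bool verified; int data; };

    static void update_begin(struct buf *b)
    {
        b->verified = false;    /* mirrors clear_buffer_verified() */
    }

    static int update_commit(struct buf *b, int v, bool io_ok)
    {
        if (!io_ok)
            return -1;          /* verified stays false: rechecked later */
        b->data = v;
        b->verified = true;     /* mirrors set_buffer_verified() */
        return 0;
    }

    int main(void)
    {
        struct buf b = { .verified = true, .data = 1 };

        update_begin(&b);
        if (update_commit(&b, 2, false))
            printf("aborted, verified=%d -> forced recheck\n", b.verified);
        return 0;
    }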
Signed-off-by: Zhang Yi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20210908120850.4012324-4-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o

conflict:
  fs/ext4/extents.c

Reviewed-by: Yang Erkun
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ext4/extents.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 39ea4f0b240c..618675a41efb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -136,14 +136,24 @@ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *path)
 {
+	int err = 0;
+
 	if (path->p_bh) {
 		/* path points to block */
 		BUFFER_TRACE(path->p_bh, "get_write_access");
-		return ext4_journal_get_write_access(handle, path->p_bh);
+		err = ext4_journal_get_write_access(handle, path->p_bh);
+		/*
+		 * The extent buffer's verified bit will be set again in
+		 * __ext4_ext_dirty(). We could leave an inconsistent
+		 * buffer if the extents updating procudure break off du
+		 * to some error happens, force to check it again.
+		 */
+		if (!err)
+			clear_buffer_verified(path->p_bh);
 	}
 	/* path points to leaf/index in inode body */
 	/* we use in-core data, no need to protect them */
-	return 0;
+	return err;
 }
 
 /*
@@ -164,6 +174,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
 		/* path points to block */
 		err = __ext4_handle_dirty_metadata(where, line, handle,
 						   inode, path->p_bh);
+		/* Extents updating done, re-set verified flag */
+		if (!err)
+			set_buffer_verified(path->p_bh);
 	} else {
 		/* path points to leaf/index in inode body */
 		err = ext4_mark_inode_dirty(handle, inode);

-- Gitee

From 580d4c359a5c7069636b1418d1015f4b335c1fdf Mon Sep 17 00:00:00 2001
From: Zhihao Cheng
Date: Mon, 15 Nov 2021 19:40:38 +0800
Subject: [PATCH 050/134] ubifs: Limit dumping length by size of memory which is allocated for the node

mainline inclusion
from mainline-v5.11-rc1
commit c4c0d19d39d26c5f58633f8fcca75f03b2854fc0
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c4c0d19d39d26c5f58633f8fcca75f03b2854fc0

Signed-off-by: Yu Changchun

-----------------------------------------------

To prevent out-of-bounds memory accesses in ubifs_dump_node(), the
actual dumping length should also be restricted by a second condition:
the size of the memory that was allocated for the node. This patch
handles the following situations (which may be caused by bit flips due
to hardware errors, writes that bypass ubifs, unknown bugs in ubifs,
etc.):

1. bad node_len: data is dumped according to 'ch->len', which may
   exceed the size of the memory allocated for the node.
2. bad node content: some kinds of node can record additional data,
   e.g. index nodes and orphan nodes; make sure the size of the
   additional data does not go beyond the node length.
3. node_type changes: data is read according to type A, but type B was
   expected; the node was allocated according to type B's size, and a
   node of type A is longer than one of type B.
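The clamping rule in isolation (a user-space sketch with invented
sizes; the names loosely mirror the patch): the dump length is bounded
both by what the node header claims and by the memory actually
allocated for the node.

    #include <stdio.h>

    static int min3i(int a, int b, int c)
    {
        int m = a < b ? a : b;
        return m < c ? m : c;
    }

    int main(void)
    {
        int ch_len = 4096;       /* possibly corrupted on-disk length */
        int max_node_len = 512;  /* per-type maximum */
        int node_len = 256;      /* size of the buffer we actually hold */
        int safe_len = min3i(ch_len > 0 ? ch_len : 0, max_node_len, node_len);

        printf("dump %d bytes, not %d\n", safe_len, ch_len); /* 256 */
        return 0;
    }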
Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/debug.c | 63 ++++++++++++++++++++++++++++++++++++++---------- fs/ubifs/debug.h | 3 ++- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index ebff43f8009c..1cfc2855673e 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -291,9 +291,9 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) kfree(pdent); } -void ubifs_dump_node(const struct ubifs_info *c, const void *node) +void ubifs_dump_node(const struct ubifs_info *c, const void *node, int node_len) { - int i, n; + int i, n, type, safe_len, max_node_len, min_node_len; union ubifs_key key; const struct ubifs_ch *ch = node; char key_buf[DBG_KEY_BUF_LEN]; @@ -306,10 +306,40 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) return; } + /* Skip dumping unknown type node */ + type = ch->node_type; + if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { + pr_err("node type %d was not recognized\n", type); + return; + } + spin_lock(&dbg_lock); dump_ch(node); - switch (ch->node_type) { + if (c->ranges[type].max_len == 0) { + max_node_len = min_node_len = c->ranges[type].len; + } else { + max_node_len = c->ranges[type].max_len; + min_node_len = c->ranges[type].min_len; + } + safe_len = le32_to_cpu(ch->len); + safe_len = safe_len > 0 ? safe_len : 0; + safe_len = min3(safe_len, max_node_len, node_len); + if (safe_len < min_node_len) { + pr_err("node len(%d) is too short for %s, left %d bytes:\n", + safe_len, dbg_ntype(type), + safe_len > UBIFS_CH_SZ ? + safe_len - (int)UBIFS_CH_SZ : 0); + if (safe_len > UBIFS_CH_SZ) + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1, + (void *)node + UBIFS_CH_SZ, + safe_len - UBIFS_CH_SZ, 0); + goto out_unlock; + } + if (safe_len != le32_to_cpu(ch->len)) + pr_err("\ttruncated node length %d\n", safe_len); + + switch (type) { case UBIFS_PAD_NODE: { const struct ubifs_pad_node *pad = node; @@ -453,7 +483,8 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) pr_err("\tnlen %d\n", nlen); pr_err("\tname "); - if (nlen > UBIFS_MAX_NLEN) + if (nlen > UBIFS_MAX_NLEN || + nlen > safe_len - UBIFS_DENT_NODE_SZ) pr_err("(bad name length, not printing, bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) @@ -467,7 +498,6 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) case UBIFS_DATA_NODE: { const struct ubifs_data_node *dn = node; - int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; key_read(c, &dn->key, &key); pr_err("\tkey %s\n", @@ -475,10 +505,13 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) pr_err("\tsize %u\n", le32_to_cpu(dn->size)); pr_err("\tcompr_typ %d\n", (int)le16_to_cpu(dn->compr_type)); - pr_err("\tdata size %d\n", dlen); - pr_err("\tdata:\n"); + pr_err("\tdata size %u\n", + le32_to_cpu(ch->len) - (unsigned int)UBIFS_DATA_NODE_SZ); + pr_err("\tdata (length = %d):\n", + safe_len - (int)UBIFS_DATA_NODE_SZ); print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, - (void *)&dn->data, dlen, 0); + (void *)&dn->data, + safe_len - (int)UBIFS_DATA_NODE_SZ, 0); break; } case UBIFS_TRUN_NODE: @@ -495,9 +528,12 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) case UBIFS_IDX_NODE: { const struct ubifs_idx_node *idx = node; + int max_child_cnt = (safe_len - UBIFS_IDX_NODE_SZ) / + (ubifs_idx_node_sz(c, 1) - + UBIFS_IDX_NODE_SZ); - n = 
le16_to_cpu(idx->child_cnt); - pr_err("\tchild_cnt %d\n", n); + n = min_t(int, le16_to_cpu(idx->child_cnt), max_child_cnt); + pr_err("\tchild_cnt %d\n", (int)le16_to_cpu(idx->child_cnt)); pr_err("\tlevel %d\n", (int)le16_to_cpu(idx->level)); pr_err("\tBranches:\n"); @@ -525,7 +561,7 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) le64_to_cpu(orph->cmt_no) & LLONG_MAX); pr_err("\tlast node flag %llu\n", (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); - n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; + n = (safe_len - UBIFS_ORPH_NODE_SZ) >> 3; pr_err("\t%d orphan inode numbers:\n", n); for (i = 0; i < n; i++) pr_err("\t ino %llu\n", @@ -537,9 +573,10 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) break; } default: - pr_err("node type %d was not recognized\n", - (int)ch->node_type); + pr_err("node type %d was not recognized\n", type); } + +out_unlock: spin_unlock(&dbg_lock); } diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 7763639a426b..42610fa5f3a7 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -242,7 +242,8 @@ const char *dbg_get_key_dump(const struct ubifs_info *c, const char *dbg_snprintf_key(const struct ubifs_info *c, const union ubifs_key *key, char *buffer, int len); void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode); -void ubifs_dump_node(const struct ubifs_info *c, const void *node); +void ubifs_dump_node(const struct ubifs_info *c, const void *node, + int node_len); void ubifs_dump_budget_req(const struct ubifs_budget_req *req); void ubifs_dump_lstats(const struct ubifs_lp_stats *lst); void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); -- Gitee From 1a34c8287e19dce74a3eabed6476eca2b16edd44 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:40:39 +0800 Subject: [PATCH 051/134] Revert "ubifs: Fix out-of-bounds memory access caused by abnormal value of node_len" mainline inclusion from mainline-v5.11-rc1 commit c8be097530a82e004f98378c3afc5cd35efc4f57 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c8be097530a82e004f98378c3afc5cd35efc4f57 Signed-off-by: Yu Changchun ----------------------------------------------- This reverts commit acc5af3efa30 ("ubifs: Fix out-of-bounds memory access caused by abnormal value of node_len") No need to avoid memory oob in dumping for data node alone. Later, node length will be passed into function 'ubifs_dump_node()' which replaces all node dumping places. 
Signed-off-by: Zhihao Cheng
Signed-off-by: Richard Weinberger
Reviewed-by: Zhang Xiaoxu
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ubifs/io.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index eae9cf5a57b0..d71aabd992fe 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -225,7 +225,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		     int offs, int quiet, int must_chk_crc)
 {
-	int err = -EINVAL, type, node_len, dump_node = 1;
+	int err = -EINVAL, type, node_len;
 	uint32_t crc, node_crc, magic;
 	const struct ubifs_ch *ch = buf;
 
@@ -278,22 +278,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 out_len:
 	if (!quiet)
 		ubifs_err(c, "bad node length %d", node_len);
-	if (type == UBIFS_DATA_NODE && node_len > UBIFS_DATA_NODE_SZ)
-		dump_node = 0;
 out:
 	if (!quiet) {
 		ubifs_err(c, "bad node at LEB %d:%d", lnum, offs);
-		if (dump_node) {
-			ubifs_dump_node(c, buf);
-		} else {
-			int safe_len = min3(node_len, c->leb_size - offs,
-					    (int)UBIFS_MAX_DATA_NODE_SZ);
-			pr_err("\tprevent out-of-bounds memory access\n");
-			pr_err("\ttruncated data node length %d\n", safe_len);
-			pr_err("\tcorrupted data node:\n");
-			print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
-				       buf, safe_len, 0);
-		}
+		ubifs_dump_node(c, buf);
 		dump_stack();
 	}
 	return err;

-- Gitee

From 2a1a0937b60cca64d048e33570045f817fd61068 Mon Sep 17 00:00:00 2001
From: Chengsong Ke
Date: Mon, 15 Nov 2021 19:40:40 +0800
Subject: [PATCH 052/134] ubifs: Remove the redundant return in dbg_check_nondata_nodes_order

mainline inclusion
from mainline-v5.11-rc1
commit 32f6ccc743b89bb4c51d4a868ffdb6ebda2909cf
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=32f6ccc743b89bb4c51d4a868ffdb6ebda2909cf

Signed-off-by: Yu Changchun

-----------------------------------------------

There is a redundant return in dbg_check_nondata_nodes_order() which
will never be reached. In addition, an error code should be returned
instead of zero in this branch.
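For illustration only (a hypothetical minimal reproduction, not the
ubifs source): the second return is dead code, and on the error path
the function must report failure rather than fall through to success.

    #include <stdio.h>

    static int check_order(int bad)
    {
        if (bad) {
            fprintf(stderr, "dumping second node\n");
            return -22;   /* -EINVAL */
            return 0;     /* never reached; dropped by the patch */
        }
        return 0;
    }

    int main(void)
    {
        return check_order(1) ? 1 : 0;
    }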
Signed-off-by: Chengsong Ke Signed-off-by: Richard Weinberger Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/debug.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 1cfc2855673e..e4cd67508b07 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2479,7 +2479,6 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) ubifs_msg(c, "dumping second node"); ubifs_dump_node(c, sb->node); return -EINVAL; - return 0; } static inline int chance(unsigned int n, unsigned int out_of) -- Gitee From 553e5179469fe0b5a1483ab58b54561b206620ff Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:40:41 +0800 Subject: [PATCH 053/134] ubifs: Pass node length in all node dumping callers mainline inclusion from mainline-v5.11-rc1 commit a33e30a0e023e9d1866866ca895c7789f48445e7 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a33e30a0e023e9d1866866ca895c7789f48445e7 Signed-off-by: Yu Changchun ----------------------------------------------- Function ubifs_dump_node() has been modified to avoid memory oob accessing while dumping node, node length (corresponding to the size of allocated memory for node) should be passed into all node dumping callers. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/commit.c | 4 ++-- fs/ubifs/debug.c | 30 +++++++++++++++--------------- fs/ubifs/file.c | 2 +- fs/ubifs/io.c | 23 +++++++++++------------ fs/ubifs/journal.c | 3 ++- fs/ubifs/master.c | 4 ++-- fs/ubifs/orphan.c | 6 ++++-- fs/ubifs/recovery.c | 6 +++--- fs/ubifs/replay.c | 4 ++-- fs/ubifs/sb.c | 2 +- fs/ubifs/scan.c | 4 ++-- fs/ubifs/super.c | 2 +- fs/ubifs/tnc.c | 8 ++++---- fs/ubifs/tnc_misc.c | 4 ++-- fs/ubifs/ubifs.h | 4 ++-- 15 files changed, 54 insertions(+), 52 deletions(-) diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index b5cdac9b0368..c4fc1047fc07 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -701,13 +701,13 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) out_dump: ubifs_err(c, "dumping index node (iip=%d)", i->iip); - ubifs_dump_node(c, idx); + ubifs_dump_node(c, idx, ubifs_idx_node_sz(c, c->fanout)); list_del(&i->list); kfree(i); if (!list_empty(&list)) { i = list_entry(list.prev, struct idx_node, list); ubifs_err(c, "dumping parent index node"); - ubifs_dump_node(c, &i->idx); + ubifs_dump_node(c, &i->idx, ubifs_idx_node_sz(c, c->fanout)); } out_free: while (!list_empty(&list)) { diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index e4cd67508b07..927d880bef17 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -871,7 +871,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) cond_resched(); pr_err("Dumping node at LEB %d:%d len %d\n", lnum, snod->offs, snod->len); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, c->leb_size - snod->offs); } pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); @@ -1249,7 +1249,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, ubifs_err(c, "but it should have key %s according to tnc", dbg_snprintf_key(c, &zbr1->key, key_buf, DBG_KEY_BUF_LEN)); - ubifs_dump_node(c, dent1); + ubifs_dump_node(c, dent1, UBIFS_MAX_DENT_NODE_SZ); goto out_free; } @@ -1261,7 +1261,7 
@@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, ubifs_err(c, "but it should have key %s according to tnc", dbg_snprintf_key(c, &zbr2->key, key_buf, DBG_KEY_BUF_LEN)); - ubifs_dump_node(c, dent2); + ubifs_dump_node(c, dent2, UBIFS_MAX_DENT_NODE_SZ); goto out_free; } @@ -1280,9 +1280,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); ubifs_msg(c, "first node at %d:%d\n", zbr1->lnum, zbr1->offs); - ubifs_dump_node(c, dent1); + ubifs_dump_node(c, dent1, UBIFS_MAX_DENT_NODE_SZ); ubifs_msg(c, "second node at %d:%d\n", zbr2->lnum, zbr2->offs); - ubifs_dump_node(c, dent2); + ubifs_dump_node(c, dent2, UBIFS_MAX_DENT_NODE_SZ); out_free: kfree(dent2); @@ -2147,7 +2147,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, out_dump: ubifs_msg(c, "dump of node at LEB %d:%d", zbr->lnum, zbr->offs); - ubifs_dump_node(c, node); + ubifs_dump_node(c, node, zbr->len); out_free: kfree(node); return err; @@ -2280,7 +2280,7 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) ubifs_msg(c, "dump of the inode %lu sitting in LEB %d:%d", (unsigned long)fscki->inum, zbr->lnum, zbr->offs); - ubifs_dump_node(c, ino); + ubifs_dump_node(c, ino, zbr->len); kfree(ino); return -EINVAL; } @@ -2351,12 +2351,12 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_DATA_NODE) { ubifs_err(c, "bad node type %d", sa->type); - ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); return -EINVAL; } if (sb->type != UBIFS_DATA_NODE) { ubifs_err(c, "bad node type %d", sb->type); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } @@ -2387,8 +2387,8 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) return 0; error_dump: - ubifs_dump_node(c, sa->node); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } @@ -2419,13 +2419,13 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && sa->type != UBIFS_XENT_NODE) { ubifs_err(c, "bad node type %d", sa->type); - ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); return -EINVAL; } if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE && sb->type != UBIFS_XENT_NODE) { ubifs_err(c, "bad node type %d", sb->type); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } @@ -2475,9 +2475,9 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) error_dump: ubifs_msg(c, "dumping first node"); - ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); ubifs_msg(c, "dumping second node"); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f4826b6da682..d44c8e14810c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -92,7 +92,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, dump: ubifs_err(c, "bad data node (block %u, inode %lu)", block, inode->i_ino); - ubifs_dump_node(c, dn); + ubifs_dump_node(c, dn, UBIFS_MAX_DATA_NODE_SZ); return -EINVAL; } diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 
d71aabd992fe..937bd3fdad40 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -198,6 +198,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) * ubifs_check_node - check node. * @c: UBIFS file-system description object * @buf: node to check + * @len: node length * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock * @quiet: print no messages @@ -222,8 +223,8 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) * This function returns zero in case of success and %-EUCLEAN in case of bad * CRC or magic. */ -int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet, int must_chk_crc) +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, + int lnum, int offs, int quiet, int must_chk_crc) { int err = -EINVAL, type, node_len; uint32_t crc, node_crc, magic; @@ -281,7 +282,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, out: if (!quiet) { ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); dump_stack(); } return err; @@ -718,7 +719,7 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c) int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) { struct ubifs_info *c = wbuf->c; - int err, written, n, aligned_len = ALIGN(len, 8); + int err, n, written = 0, aligned_len = ALIGN(len, 8); dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, dbg_ntype(((struct ubifs_ch *)buf)->node_type), @@ -785,8 +786,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - written = 0; - if (wbuf->used) { /* * The node is large enough and does not fit entirely within @@ -887,7 +886,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) out: ubifs_err(c, "cannot write %d bytes to LEB %d:%d, error %d", len, wbuf->lnum, wbuf->offs, err); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, written + len); dump_stack(); ubifs_dump_leb(c, wbuf->lnum); return err; @@ -930,7 +929,7 @@ int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum, err = ubifs_leb_write(c, lnum, buf, offs, buf_len); if (err) - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); return err; } @@ -1013,7 +1012,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + err = ubifs_check_node(c, buf, len, lnum, offs, 0, 0); if (err) { ubifs_err(c, "expected node type %d", type); return err; @@ -1029,7 +1028,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, out: ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); dump_stack(); return -EINVAL; } @@ -1069,7 +1068,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + err = ubifs_check_node(c, buf, len, lnum, offs, 0, 0); if (err) { ubifs_errc(c, "expected node type %d", type); return err; @@ -1087,7 +1086,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum, offs, ubi_is_mapped(c->ubi, lnum)); if (!c->probing) { - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); dump_stack(); } return -EINVAL; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 7274bd23881b..230717384a38 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ 
-1560,7 +1560,8 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn); + ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - + UBIFS_TRUN_NODE_SZ); goto out_free; } diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 911d0555b9f2..0df9a3dd0aaa 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -314,7 +314,7 @@ static int validate_master(const struct ubifs_info *c) out: ubifs_err(c, "bad master node at offset %d error %d", c->mst_offs, err); - ubifs_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node, c->mst_node_alsz); return -EINVAL; } @@ -392,7 +392,7 @@ int ubifs_read_master(struct ubifs_info *c) if (c->leb_cnt < old_leb_cnt || c->leb_cnt < UBIFS_MIN_LEB_CNT) { ubifs_err(c, "bad leb_cnt on master node"); - ubifs_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node, c->mst_node_alsz); return -EINVAL; } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 0fb61956146d..4909321d84cf 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -646,7 +646,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (snod->type != UBIFS_ORPH_NODE) { ubifs_err(c, "invalid node type %d in orphan area at %d:%d", snod->type, sleb->lnum, snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, + c->leb_size - snod->offs); err = -EINVAL; goto out_free; } @@ -674,7 +675,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (!first) { ubifs_err(c, "out of order commit number %llu in orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, + c->leb_size - snod->offs); err = -EINVAL; goto out_free; } diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index f116f7b3f9e5..f0d51dd21c9e 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -352,11 +352,11 @@ int ubifs_recover_master_node(struct ubifs_info *c) ubifs_err(c, "failed to recover master node"); if (mst1) { ubifs_err(c, "dumping first master node"); - ubifs_dump_node(c, mst1); + ubifs_dump_node(c, mst1, c->leb_size - ((void *)mst1 - buf1)); } if (mst2) { ubifs_err(c, "dumping second master node"); - ubifs_dump_node(c, mst2); + ubifs_dump_node(c, mst2, c->leb_size - ((void *)mst2 - buf2)); } vfree(buf2); vfree(buf1); @@ -469,7 +469,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, * The area after the common header size is not empty, so the common * header must be intact. Check it. 
*/ - if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + if (ubifs_check_node(c, buf, len, lnum, offs, 1, 0) != -EUCLEAN) { dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); return 0; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b2f5563d1489..6283adeb21e0 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -830,7 +830,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) out_dump: ubifs_err(c, "bad node is at LEB %d:%d", lnum, snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, c->leb_size - snod->offs); ubifs_scan_destroy(sleb); return -EINVAL; } @@ -1126,7 +1126,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) out_dump: ubifs_err(c, "log error detected while replaying the log at LEB %d:%d", lnum, offs + snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, c->leb_size - snod->offs); ubifs_scan_destroy(sleb); return -EINVAL; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index c0d3e4008d23..c160f718c288 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -503,7 +503,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) failed: ubifs_err(c, "bad superblock, error %d", err); - ubifs_dump_node(c, sup); + ubifs_dump_node(c, sup, ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size)); return -EINVAL; } diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index c69cdb5e65bc..84a9157dcc32 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -76,7 +76,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, dbg_scan("scanning %s at LEB %d:%d", dbg_ntype(ch->node_type), lnum, offs); - if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) + if (ubifs_check_node(c, buf, len, lnum, offs, quiet, 1)) return SCANNED_A_CORRUPT_NODE; if (ch->node_type == UBIFS_PAD_NODE) { @@ -90,7 +90,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, if (!quiet) { ubifs_err(c, "bad pad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, pad); + ubifs_dump_node(c, pad, len); } return SCANNED_A_BAD_PAD_NODE; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index cfd46753a685..98cbd9cac246 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -253,7 +253,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) out_invalid: ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err); - ubifs_dump_node(c, ino); + ubifs_dump_node(c, ino, UBIFS_MAX_INO_NODE_SZ); ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 894f1ab14616..414266d4d1ba 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -316,7 +316,7 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, dent); if (err) { dump_stack(); - ubifs_dump_node(c, dent); + ubifs_dump_node(c, dent, zbr->len); return err; } @@ -349,7 +349,7 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, node); if (err) { dump_stack(); - ubifs_dump_node(c, node); + ubifs_dump_node(c, node, zbr->len); return err; } @@ -1699,7 +1699,7 @@ static int validate_data_node(struct ubifs_info *c, void *buf, goto out_err; } - err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + err = ubifs_check_node(c, buf, zbr->len, zbr->lnum, zbr->offs, 0, 0); if (err) { ubifs_err(c, "expected node type %d", UBIFS_DATA_NODE); goto out; @@ -1733,7 +1733,7 @@ static int validate_data_node(struct ubifs_info *c, void 
*buf, err = -EINVAL; out: ubifs_err(c, "bad node at LEB %d:%d", zbr->lnum, zbr->offs); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, zbr->len); dump_stack(); return err; } diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index ccaf94ea5be3..5517a56b4138 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -390,7 +390,7 @@ static int read_znode(struct ubifs_info *c, struct ubifs_zbranch *zzbr, out_dump: ubifs_err(c, "bad indexing node at LEB %d:%d, error %d", lnum, offs, err); - ubifs_dump_node(c, idx); + ubifs_dump_node(c, idx, c->max_idx_node_sz); kfree(idx); return -EINVAL; } @@ -489,7 +489,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, zbr->lnum, zbr->offs); dbg_tnck(key, "looked for key "); dbg_tnck(&key1, "but found node's key "); - ubifs_dump_node(c, node); + ubifs_dump_node(c, node, zbr->len); return -EINVAL; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e7e48f3b179a..daad22263cb2 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1721,8 +1721,8 @@ int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, int offs); int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum, int offs, int hmac_offs); -int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet, int must_chk_crc); +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, + int lnum, int offs, int quiet, int must_chk_crc); void ubifs_init_node(struct ubifs_info *c, void *buf, int len, int pad); void ubifs_crc_node(struct ubifs_info *c, void *buf, int len); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); -- Gitee From b08b792e75345d7d13ac39d5145c0c73f3ebd135 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:40:42 +0800 Subject: [PATCH 054/134] ubifs: ubifs_dump_sleb: Remove unused function mainline inclusion from mainline-v5.11-rc1 commit bf6dab7a6ce79c56764623b970be10fc6edd8a68 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bf6dab7a6ce79c56764623b970be10fc6edd8a68 Signed-off-by: Yu Changchun ----------------------------------------------- Function ubifs_dump_sleb() is defined but unused, it can be removed. 
Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/debug.c | 16 ---------------- fs/ubifs/debug.h | 2 -- 2 files changed, 18 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 927d880bef17..1c376ff0ec13 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -828,22 +828,6 @@ void ubifs_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } -void ubifs_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs) -{ - struct ubifs_scan_node *snod; - - pr_err("(pid %d) start dumping scanned data from LEB %d:%d\n", - current->pid, sleb->lnum, offs); - - list_for_each_entry(snod, &sleb->nodes, list) { - cond_resched(); - pr_err("Dumping node at LEB %d:%d len %d\n", - sleb->lnum, snod->offs, snod->len); - ubifs_dump_node(c, snod->node); - } -} - void ubifs_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 42610fa5f3a7..ed966108da80 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -252,8 +252,6 @@ void ubifs_dump_lprop(const struct ubifs_info *c, void ubifs_dump_lprops(struct ubifs_info *c); void ubifs_dump_lpt_info(struct ubifs_info *c); void ubifs_dump_leb(const struct ubifs_info *c, int lnum); -void ubifs_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs); void ubifs_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, -- Gitee From f04e6c6154181e0acf0606da4968ffa9289bb7ef Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:40:43 +0800 Subject: [PATCH 055/134] ubifs: ubifs_dump_node: Dump all branches of the index node mainline inclusion from mainline-v5.11-rc1 commit b80a974b8c58164ed57b0f025a47b8f003198d9e category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b80a974b8c58164ed57b0f025a47b8f003198d9e Signed-off-by: Yu Changchun ----------------------------------------------- An index node can have up to c->fanout branches, all branches should be displayed while dumping index node. 
Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 1c376ff0ec13..89320c89cf0d 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -537,7 +537,7 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node, int node_len) pr_err("\tlevel %d\n", (int)le16_to_cpu(idx->level)); pr_err("\tBranches:\n"); - for (i = 0; i < n && i < c->fanout - 1; i++) { + for (i = 0; i < n && i < c->fanout; i++) { const struct ubifs_branch *br; br = ubifs_idx_branch(c, idx, i); -- Gitee From 7f2c55ca1934820d4b30568484436b6e8196582a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:42:59 +0800 Subject: [PATCH 056/134] nbd: don't handle response without a corresponding request message mainline inclusion from mainline-next-20211018 commit b5644a3a79bf3be5f1238db1b2f241374b27b0f0 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b5644a3a79bf3be5f1238db1b2f241374b27b0f0 Signed-off-by: Yu Changchun --------------------------- While handling a response message from server, nbd_read_stat() will try to get request by tag, and then complete the request. However, this is problematic if nbd haven't sent a corresponding request message: t1 t2 submit_bio nbd_queue_rq blk_mq_start_request recv_work nbd_read_stat blk_mq_tag_to_rq blk_mq_complete_request nbd_send_cmd Thus add a new cmd flag 'NBD_CMD_INFLIGHT', it will be set in nbd_send_cmd() and checked in nbd_read_stat(). Noted that this patch can't fix that blk_mq_tag_to_rq() might return a freed request, and this will be fixed in following patches. Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-2-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f4b0637bb155..1b3718238588 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -122,6 +122,12 @@ struct nbd_device { }; #define NBD_CMD_REQUEUED 1 +/* + * This flag will be set if nbd_queue_rq() succeed, and will be checked and + * cleared in completion. Both setting and clearing of the flag are protected + * by cmd->lock. 
+ */ +#define NBD_CMD_INFLIGHT 2 struct nbd_cmd { struct nbd_device *nbd; @@ -389,6 +395,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; mutex_unlock(&cmd->lock); @@ -718,6 +725,12 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)", + tag, cmd->status, cmd->flags); + ret = -ENOENT; + goto out; + } if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", req, cmd->cmd_cookie, nbd_handle_to_cookie(handle)); @@ -817,6 +830,7 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved) return true; mutex_lock(&cmd->lock); + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); @@ -953,7 +967,13 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) * returns EAGAIN can be retried on a different socket. */ ret = nbd_send_cmd(nbd, cmd, index); - if (ret == -EAGAIN) { + /* + * Access to this flag is protected by cmd->lock, thus it's safe to set + * the flag after nbd_send_cmd() succeed to send request to server. + */ + if (!ret) + __set_bit(NBD_CMD_INFLIGHT, &cmd->flags); + else if (ret == -EAGAIN) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Request send failed, requeueing\n"); nbd_mark_nsock_dead(nbd, nsock, 1); -- Gitee From 4438c56e17d903d94c438144d7993a641f0cb28d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:43:00 +0800 Subject: [PATCH 057/134] nbd: make sure request completion won't concurrent mainline inclusion from mainline-next-20211018 commit d14b304f558f8c8f53da3a8d0c0b671f14a9c2f4 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d14b304f558f8c8f53da3a8d0c0b671f14a9c2f4 Signed-off-by: Yu Changchun --------------------------- commit cddce0116058 ("nbd: Aovid double completion of a request") try to fix that nbd_clear_que() and recv_work() can complete a request concurrently. However, the problem still exists: t1 t2 t3 nbd_disconnect_and_put flush_workqueue recv_work blk_mq_complete_request blk_mq_complete_request_remote -> this is true WRITE_ONCE(rq->state, MQ_RQ_COMPLETE) blk_mq_raise_softirq blk_done_softirq blk_complete_reqs nbd_complete_rq blk_mq_end_request blk_mq_free_request WRITE_ONCE(rq->state, MQ_RQ_IDLE) nbd_clear_que blk_mq_tagset_busy_iter nbd_clear_req __blk_mq_free_request blk_mq_put_tag blk_mq_complete_request -> complete again There are three places where request can be completed in nbd: recv_work(), nbd_clear_que() and nbd_xmit_timeout(). Since they all hold cmd->lock before completing the request, it's easy to avoid the problem by setting and checking a cmd flag. 
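The protocol can be sketched as a hypothetical helper (not the literal driver code): whichever path clears NBD_CMD_INFLIGHT under cmd->lock owns the completion, so a request can only be completed once.

	/* hypothetical illustration of the flag protocol */
	static bool nbd_complete_once(struct nbd_cmd *cmd, blk_status_t status)
	{
		mutex_lock(&cmd->lock);
		if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
			/* another path already completed this request */
			mutex_unlock(&cmd->lock);
			return false;
		}
		cmd->status = status;
		mutex_unlock(&cmd->lock);
		blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
		return true;
	}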
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-3-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- drivers/block/nbd.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 1b3718238588..7271541558b6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -395,7 +395,11 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; - __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return BLK_EH_DONE; + } + if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; mutex_unlock(&cmd->lock); @@ -830,7 +834,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved) return true; mutex_lock(&cmd->lock); - __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return true; + } cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock);
-- Gitee From f966f133626d57a2a7889b2ff02e2edac7712b2a Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:43:01 +0800
Subject: [PATCH 058/134] nbd: check sock index in nbd_read_stat()
mainline inclusion from mainline-next-20211018 commit dbd73178da676945d8bbcf6afe731623f683ce0a category: bugfix issue: #I4PYGY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dbd73178da676945d8bbcf6afe731623f683ce0a
Signed-off-by: Yu Changchun
---------------------------
The sock on which the client sends a request in nbd_send_cmd() and the sock on which it receives the reply in nbd_read_stat() should be the same.
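A minimal sketch of the invariant (the send side is assumed, as in the existing driver, to record the socket index when the request is queued; the receive-side check is the one added by the hunk below):

	/* send side: remember which socket carried the request */
	cmd->index = index;

	/* receive side: a reply arriving on a different socket is suspicious */
	if (cmd->index != index)
		dev_err(disk_to_dev(nbd->disk),
			"Unexpected reply %d from different sock %d (expected %d)",
			tag, index, cmd->index);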
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-4-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- drivers/block/nbd.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7271541558b6..f3747cdf5dd1 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -735,6 +735,10 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) ret = -ENOENT; goto out; } + if (cmd->index != index) { + dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)", + tag, index, cmd->index); + } if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
-- Gitee From a4720fbe0b1ad1dd170a7506fe6052d21ca9de16 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:43:02 +0800
Subject: [PATCH 059/134] nbd: don't start request if nbd_queue_rq() failed
mainline inclusion from mainline-next-20211018 commit a83fdc85365586dc5c0f3ff91680e18e37a66f19 category: bugfix issue: #I4PYGY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a83fdc85365586dc5c0f3ff91680e18e37a66f19
Signed-off-by: Yu Changchun
---------------------------
commit 6a468d5990ec ("nbd: don't start req until after the dead connection logic") moved blk_mq_start_request() from nbd_queue_rq() to nbd_handle_cmd() to skip starting the request if the connection is dead. However, the request is still started in other error paths. Currently, blk_mq_end_request() will be called immediately if nbd_queue_rq() fails, so starting the request in that situation is useless. So remove blk_mq_start_request() from the error paths in nbd_handle_cmd().
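The reasoning is easiest to see in a simplified, hypothetical sketch of the queue path: when nbd_handle_cmd() fails, the caller returns an error status and the block layer ends the request immediately, so starting it first buys nothing.

	static blk_status_t nbd_queue_rq_sketch(struct blk_mq_hw_ctx *hctx,
						const struct blk_mq_queue_data *bd)
	{
		struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
		int ret = nbd_handle_cmd(cmd, hctx->queue_num);

		/*
		 * On failure the request is ended right away via
		 * blk_mq_end_request(), so blk_mq_start_request() in the
		 * error paths was pure overhead.
		 */
		return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK;
	}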
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-5-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f3747cdf5dd1..11a936151b93 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -923,7 +923,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) if (!refcount_inc_not_zero(&nbd->config_refs)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); - blk_mq_start_request(req); return -EINVAL; } config = nbd->config; @@ -932,7 +931,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted send on invalid socket\n"); nbd_config_put(nbd); - blk_mq_start_request(req); return -EINVAL; } cmd->status = BLK_STS_OK; @@ -956,7 +954,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) */ sock_shutdown(nbd); nbd_config_put(nbd); - blk_mq_start_request(req); return -EIO; } goto again; -- Gitee From a4720fbe0b1ad1dd170a7506fe6052d21ca9de16 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:43:03 +0800 Subject: [PATCH 060/134] nbd: clean up return value checking of sock_xmit() mainline inclusion from mainline-next-20211018 commit 6157a8f489909db00151a4e361903b9099b03b75 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6157a8f489909db00151a4e361903b9099b03b75 Signed-off-by: Yu Changchun --------------------------- Check if sock_xmit() return 0 is useless because it'll never return 0, comment it and remove such checkings. Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-6-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 11a936151b93..8bfade1c4ee6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -479,7 +479,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } /* - * Send or receive packet. + * Send or receive packet. Return a positive value on success and + * negtive value on failue, and never return 0. */ static int sock_xmit(struct nbd_device *nbd, int index, int send, struct iov_iter *iter, int msg_flags, int *sent) @@ -605,7 +606,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) result = sock_xmit(nbd, index, 1, &from, (type == NBD_CMD_WRITE) ? 
MSG_MORE : 0, &sent); trace_nbd_header_sent(req, handle); - if (result <= 0) { + if (result < 0) { if (was_interrupted(result)) { /* If we havne't sent anything we can just return BUSY, * however if we have sent something we need to make @@ -649,7 +650,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) skip = 0; } result = sock_xmit(nbd, index, 1, &from, flags, &sent); - if (result <= 0) { + if (result < 0) { if (was_interrupted(result)) { /* We've already sent the header, we * have no choice but to set pending and @@ -701,7 +702,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) reply.magic = 0; iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply)); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); - if (result <= 0) { + if (result < 0) { if (!nbd_disconnected(config)) dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); @@ -772,7 +773,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) rq_for_each_segment(bvec, req, iter) { iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); - if (result <= 0) { + if (result < 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); /* @@ -1218,7 +1219,7 @@ static void send_disconnects(struct nbd_device *nbd) iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); mutex_lock(&nsock->tx_lock); ret = sock_xmit(nbd, i, 1, &from, 0, NULL); - if (ret <= 0) + if (ret < 0) dev_err(disk_to_dev(nbd->disk), "Send disconnect failed %d\n", ret); mutex_unlock(&nsock->tx_lock); -- Gitee From 2eb290002f0b6598c53c4c0269e8aca35c852735 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:43:04 +0800 Subject: [PATCH 061/134] nbd: partition nbd_read_stat() into nbd_read_reply() and nbd_handle_reply() mainline inclusion from mainline-next-20211018 commit 961e9f50be9bb47835b0ac7e08d55d2d0a45e493 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=961e9f50be9bb47835b0ac7e08d55d2d0a45e493 Signed-off-by: Yu Changchun --------------------------- Prepare to fix uaf in nbd_read_stat(), no functional changes. 
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-7-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 74 +++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 8bfade1c4ee6..622ea52ebbdd 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -684,38 +684,45 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) return 0; } -/* NULL returned = something went wrong, inform userspace */ -static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) +static int nbd_read_reply(struct nbd_device *nbd, int index, + struct nbd_reply *reply) { - struct nbd_config *config = nbd->config; - int result; - struct nbd_reply reply; - struct nbd_cmd *cmd; - struct request *req = NULL; - u64 handle; - u16 hwq; - u32 tag; - struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)}; + struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)}; struct iov_iter to; - int ret = 0; + int result; - reply.magic = 0; - iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply)); + reply->magic = 0; + iov_iter_kvec(&to, READ, &iov, 1, sizeof(*reply)); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); if (result < 0) { - if (!nbd_disconnected(config)) + if (!nbd_disconnected(nbd->config)) dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); - return ERR_PTR(result); + return result; } - if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { + if (ntohl(reply->magic) != NBD_REPLY_MAGIC) { dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", - (unsigned long)ntohl(reply.magic)); - return ERR_PTR(-EPROTO); + (unsigned long)ntohl(reply->magic)); + return -EPROTO; } - memcpy(&handle, reply.handle, sizeof(handle)); + return 0; +} + +/* NULL returned = something went wrong, inform userspace */ +static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, + struct nbd_reply *reply) +{ + int result; + struct nbd_cmd *cmd; + struct request *req = NULL; + u64 handle; + u16 hwq; + u32 tag; + int ret = 0; + + memcpy(&handle, reply->handle, sizeof(handle)); tag = nbd_handle_to_tag(handle); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < nbd->tag_set.nr_hw_queues) @@ -758,9 +765,9 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) ret = -ENOENT; goto out; } - if (ntohl(reply.error)) { + if (ntohl(reply->error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", - ntohl(reply.error)); + ntohl(reply->error)); cmd->status = BLK_STS_IOERR; goto out; } @@ -769,6 +776,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; + struct iov_iter to; rq_for_each_segment(bvec, req, iter) { iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); @@ -782,7 +790,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) * and let the timeout stuff handle resubmitting * this request onto another connection. 
*/ - if (nbd_disconnected(config)) { + if (nbd_disconnected(nbd->config)) { cmd->status = BLK_STS_IOERR; goto out; } @@ -806,24 +814,30 @@ static void recv_work(struct work_struct *work) work); struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; + struct nbd_sock *nsock; struct nbd_cmd *cmd; struct request *rq; while (1) { - cmd = nbd_read_stat(nbd, args->index); - if (IS_ERR(cmd)) { - struct nbd_sock *nsock = config->socks[args->index]; + struct nbd_reply reply; - mutex_lock(&nsock->tx_lock); - nbd_mark_nsock_dead(nbd, nsock, 1); - mutex_unlock(&nsock->tx_lock); + if (nbd_read_reply(nbd, args->index, &reply)) + break; + + cmd = nbd_handle_reply(nbd, args->index, &reply); + if (IS_ERR(cmd)) break; - } rq = blk_mq_rq_from_pdu(cmd); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } + + nsock = config->socks[args->index]; + mutex_lock(&nsock->tx_lock); + nbd_mark_nsock_dead(nbd, nsock, 1); + mutex_unlock(&nsock->tx_lock); + nbd_config_put(nbd); atomic_dec(&config->recv_threads); wake_up(&config->recv_wq); -- Gitee From bf70aab1e49bbf60ea0731736dea81f307c4a3a3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:43:05 +0800 Subject: [PATCH 062/134] nbd: fix uaf in nbd_handle_reply() mainline inclusion from mainline-next-20211018 commit 52c90e0184f67eecb00b53b79bfdf75e0274f8fd category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=52c90e0184f67eecb00b53b79bfdf75e0274f8fd Signed-off-by: Yu Changchun --------------------------- There is a problem that nbd_handle_reply() might access freed request: 1) At first, a normal io is submitted and completed with scheduler: internal_tag = blk_mq_get_tag -> get tag from sched_tags blk_mq_rq_ctx_init sched_tags->rq[internal_tag] = sched_tag->static_rq[internal_tag] ... blk_mq_get_driver_tag __blk_mq_get_driver_tag -> get tag from tags tags->rq[tag] = sched_tag->static_rq[internal_tag] So, both tags->rq[tag] and sched_tags->rq[internal_tag] are pointing to the request: sched_tags->static_rq[internal_tag]. Even if the io is finished. 2) nbd server send a reply with random tag directly: recv_work nbd_handle_reply blk_mq_tag_to_rq(tags, tag) rq = tags->rq[tag] 3) if the sched_tags->static_rq is freed: blk_mq_sched_free_requests blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i) -> step 2) access rq before clearing rq mapping blk_mq_clear_rq_mapping(set, tags, hctx_idx); __free_pages() -> rq is freed here 4) Then, nbd continue to use the freed request in nbd_handle_reply Fix the problem by get 'q_usage_counter' before blk_mq_tag_to_rq(), thus request is ensured not to be freed because 'q_usage_counter' is not zero. 
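The essence of the fix, as a hypothetical standalone helper: take a reference that keeps the request pool alive before translating a tag, and drop it only once the request is no longer touched.

	static struct request *tag_to_rq_guarded(struct request_queue *q,
						 struct blk_mq_tags *tags,
						 unsigned int tag)
	{
		struct request *rq;

		/* non-zero q_usage_counter keeps the request pool alive */
		if (!percpu_ref_tryget(&q->q_usage_counter))
			return NULL;	/* queue frozen: nothing is inflight */

		rq = blk_mq_tag_to_rq(tags, tag);
		if (!rq)
			percpu_ref_put(&q->q_usage_counter);
		/* otherwise the caller puts the ref when done with rq */
		return rq;
	}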
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210916141810.2325276-1-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- drivers/block/nbd.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 622ea52ebbdd..051ba02f996f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -814,6 +814,7 @@ static void recv_work(struct work_struct *work) work); struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; + struct request_queue *q = nbd->disk->queue; struct nbd_sock *nsock; struct nbd_cmd *cmd; struct request *rq; @@ -824,13 +825,28 @@ static void recv_work(struct work_struct *work) if (nbd_read_reply(nbd, args->index, &reply)) break; + /* + * Grab .q_usage_counter so request pool won't go away, then no + * request use-after-free is possible during nbd_handle_reply(). + * If queue is frozen, there won't be any inflight requests, we + * needn't to handle the incoming garbage message. + */ + if (!percpu_ref_tryget(&q->q_usage_counter)) { + dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n", + __func__); + break; + } + cmd = nbd_handle_reply(nbd, args->index, &reply); - if (IS_ERR(cmd)) + if (IS_ERR(cmd)) { + percpu_ref_put(&q->q_usage_counter); break; + } rq = blk_mq_rq_from_pdu(cmd); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); + percpu_ref_put(&q->q_usage_counter); } nsock = config->socks[args->index];
-- Gitee From 4b423d41e21c49e97bb37de9224e85c5053090ad Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Mon, 15 Nov 2021 19:49:02 +0800
Subject: [PATCH 063/134] ext4: avoid recheck extent for EXT4_EX_FORCE_CACHE
maillist inclusion category: bugfix issue: #I4PYGY CVE: NA
Signed-off-by: Yu Changchun
---------------------------
A buffer with the verified flag set has already been checked before, so there is no need to verify it and call set_buffer_verified() again.
Signed-off-by: yangerkun Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- fs/ext4/extents.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 618675a41efb..a77e25ca6867 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -544,13 +544,16 @@ __read_extent_tree_block(const char *function, unsigned int line, if (err < 0) goto errout; } - if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) - return bh; - err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), - depth, pblk, le32_to_cpu(idx->ei_block)); - if (err) - goto errout; - set_buffer_verified(bh); + if (buffer_verified(bh)) { + if (!(flags & EXT4_EX_FORCE_CACHE)) + return bh; + } else { + err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), + depth, pblk, le32_to_cpu(idx->ei_block)); + if (err) + goto errout; + set_buffer_verified(bh); + } /* * If this is a leaf block, cache all of its entries */
-- Gitee From 20fb37766c29df27bf3b80bb39b0d5863c21643d Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Mon, 15 Nov 2021 19:49:03 +0800
Subject: [PATCH 064/134] ext4: check magic even the extent block bh is verified
maillist inclusion category: bugfix issue: #I4PYGY CVE: NA
Signed-off-by: Yu Changchun
---------------------------
Our stress testing with IO error injection can trigger the following OOB with a very low probability.
[59898.282466] BUG: KASAN: slab-out-of-bounds in ext4_find_extent+0x2e4/0x480 ... [59898.287162] Call Trace: [59898.287575] dump_stack+0x8b/0xb9 [59898.288070] print_address_description+0x73/0x280 [59898.289903] ext4_find_extent+0x2e4/0x480 [59898.290553] ext4_ext_map_blocks+0x125/0x1470 [59898.295481] ext4_map_blocks+0x5ee/0x940 [59898.315984] ext4_mpage_readpages+0x63c/0xdb0 [59898.320231] read_pages+0xe6/0x370 [59898.321589] __do_page_cache_readahead+0x233/0x2a0 [59898.321594] ondemand_readahead+0x157/0x450 [59898.321598] generic_file_read_iter+0xcb2/0x1550 [59898.328828] __vfs_read+0x233/0x360 [59898.328840] vfs_read+0xa5/0x190 [59898.330126] ksys_read+0xa5/0x150 [59898.331405] do_syscall_64+0x6d/0x1f0 [59898.331418] entry_SYSCALL_64_after_hwframe+0x44/0xa9
Digging deeper, we found it is actually a xattr block, which can happen with the following steps: 1. an extent update for file1 removes a leaf extent block (block A) 2. the idx extent block needs to be updated too 3. block A gets reallocated as a xattr block and is marked verified 4. an IO error happens for the idx block, and its buffer is released later 5. an extent lookup for file1 reads the idx block and sees block A again 6. since the buffer of block A is already verified, we use it directly, which can lead to the OOB above
Same as __ext4_xattr_check_block(), we can check the magic even when the buffer is verified to fix the problem.
Signed-off-by: yangerkun Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- fs/ext4/extents.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a77e25ca6867..74b69d83c198 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -545,6 +545,14 @@ __read_extent_tree_block(const char *function, unsigned int line, goto errout; } if (buffer_verified(bh)) { + if (unlikely(ext_block_hdr(bh)->eh_magic != EXT4_EXT_MAGIC)) { + err = -EFSCORRUPTED; + ext4_error_inode(inode, function, line, 0, + "invalid magic for verified extent block %llu", + (unsigned long long)bh->b_blocknr); + goto errout; + } + if (!(flags & EXT4_EX_FORCE_CACHE)) return bh; } else {
-- Gitee From 4e2cb35a73aa5033f461926533cd96128253ecaa Mon Sep 17 00:00:00 2001
From: Ye Bin
Date: Mon, 15 Nov 2021 19:49:04 +0800
Subject: [PATCH 065/134] scsi: scsi_debug: Fix out-of-bound read in resp_readcap16()
mainline inclusion from mainline-v5.17 commit 4e3ace0051e7e504b55d239daab8789dd89b863c category: bugfix issue: #I4PYGY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4e3ace0051e7e504b55d239daab8789dd89b863c
Signed-off-by: Yu Changchun
-----------------------------------------------
The following warning was observed running syzkaller: [ 3813.830724] sg_write: data in/out 65466/242 bytes for SCSI command 0x9e-- guessing data in; [ 3813.830724] program syz-executor not setting count and/or reply_len properly [ 3813.836956] ================================================================== [ 3813.839465] BUG: KASAN: stack-out-of-bounds in sg_copy_buffer+0x157/0x1e0 [ 3813.841773] Read of size 4096 at addr ffff8883cf80f540 by task syz-executor/1549 [ 3813.846612] Call Trace: [ 3813.846995] dump_stack+0x108/0x15f [ 3813.847524] print_address_description+0xa5/0x372 [ 3813.848243] kasan_report.cold+0x236/0x2a8 [ 3813.849439] check_memory_region+0x240/0x270 [ 3813.850094] memcpy+0x30/0x80 [ 3813.850553] sg_copy_buffer+0x157/0x1e0 [ 3813.853032] sg_copy_from_buffer+0x13/0x20 [ 3813.853660]
fill_from_dev_buffer+0x135/0x370 [ 3813.854329] resp_readcap16+0x1ac/0x280 [ 3813.856917] schedule_resp+0x41f/0x1630 [ 3813.858203] scsi_debug_queuecommand+0xb32/0x17e0 [ 3813.862699] scsi_dispatch_cmd+0x330/0x950 [ 3813.863329] scsi_request_fn+0xd8e/0x1710 [ 3813.863946] __blk_run_queue+0x10b/0x230 [ 3813.864544] blk_execute_rq_nowait+0x1d8/0x400 [ 3813.865220] sg_common_write.isra.0+0xe61/0x2420 [ 3813.871637] sg_write+0x6c8/0xef0 [ 3813.878853] __vfs_write+0xe4/0x800 [ 3813.883487] vfs_write+0x17b/0x530 [ 3813.884008] ksys_write+0x103/0x270 [ 3813.886268] __x64_sys_write+0x77/0xc0 [ 3813.886841] do_syscall_64+0x106/0x360 [ 3813.887415] entry_SYSCALL_64_after_hwframe+0x44/0xa9 This issue can be reproduced with the following syzkaller log: r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x26e1, 0x0) r1 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='fd/3\x00') open_by_handle_at(r1, &(0x7f00000003c0)=ANY=[@ANYRESHEX], 0x602000) r2 = syz_open_dev$sg(&(0x7f0000000000), 0x0, 0x40782) write$binfmt_aout(r2, &(0x7f0000000340)=ANY=[@ANYBLOB="00000000deff000000000000000000000000000000000000000000000000000047f007af9e107a41ec395f1bded7be24277a1501ff6196a83366f4e6362bc0ff2b247f68a972989b094b2da4fb3607fcf611a22dd04310d28c75039d"], 0x126) In resp_readcap16() we get "int alloc_len" value -1104926854, and then pass the huge arr_len to fill_from_dev_buffer(), but arr is only 32 bytes. This leads to OOB in sg_copy_buffer(). To solve this issue, define alloc_len as u32. Link: https://lore.kernel.org/r/20211013033913.2551004-2-yebin10@huawei.com Acked-by: Douglas Gilbert Signed-off-by: Ye Bin Signed-off-by: Martin K. Petersen Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/scsi/scsi_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index b6540b92f566..63504dc63d87 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1855,7 +1855,7 @@ static int resp_readcap16(struct scsi_cmnd *scp, { unsigned char *cmd = scp->cmnd; unsigned char arr[SDEBUG_READCAP16_ARR_SZ]; - int alloc_len; + u32 alloc_len; alloc_len = get_unaligned_be32(cmd + 10); /* following just in case virtual_gb changed */ @@ -1884,7 +1884,7 @@ static int resp_readcap16(struct scsi_cmnd *scp, } return fill_from_dev_buffer(scp, arr, - min_t(int, alloc_len, SDEBUG_READCAP16_ARR_SZ)); + min_t(u32, alloc_len, SDEBUG_READCAP16_ARR_SZ)); } #define SDEBUG_MAX_TGTPGS_ARR_SZ 1412 -- Gitee From 87bbda91aeb83a6c601ef805a8315098b2fcd8a2 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 15 Nov 2021 19:49:05 +0800 Subject: [PATCH 066/134] scsi: scsi_debug: Fix out-of-bound read in resp_report_tgtpgs() mainline inclusion from mainline-v5.17 commit f347c26836c270199de1599c3cd466bb7747caa9 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f347c26836c270199de1599c3cd466bb7747caa9 Signed-off-by: Yu Changchun ----------------------------------------------- The following issue was observed running syzkaller: BUG: KASAN: slab-out-of-bounds in memcpy include/linux/string.h:377 [inline] BUG: KASAN: slab-out-of-bounds in sg_copy_buffer+0x150/0x1c0 lib/scatterlist.c:831 Read of size 2132 at addr ffff8880aea95dc8 by task syz-executor.0/9815 CPU: 0 PID: 9815 Comm: syz-executor.0 Not tainted 4.19.202-00874-gfc0fe04215a9 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xe4/0x14a lib/dump_stack.c:118 print_address_description+0x73/0x280 mm/kasan/report.c:253 kasan_report_error mm/kasan/report.c:352 [inline] kasan_report+0x272/0x370 mm/kasan/report.c:410 memcpy+0x1f/0x50 mm/kasan/kasan.c:302 memcpy include/linux/string.h:377 [inline] sg_copy_buffer+0x150/0x1c0 lib/scatterlist.c:831 fill_from_dev_buffer+0x14f/0x340 drivers/scsi/scsi_debug.c:1021 resp_report_tgtpgs+0x5aa/0x770 drivers/scsi/scsi_debug.c:1772 schedule_resp+0x464/0x12f0 drivers/scsi/scsi_debug.c:4429 scsi_debug_queuecommand+0x467/0x1390 drivers/scsi/scsi_debug.c:5835 scsi_dispatch_cmd+0x3fc/0x9b0 drivers/scsi/scsi_lib.c:1896 scsi_request_fn+0x1042/0x1810 drivers/scsi/scsi_lib.c:2034 __blk_run_queue_uncond block/blk-core.c:464 [inline] __blk_run_queue+0x1a4/0x380 block/blk-core.c:484 blk_execute_rq_nowait+0x1c2/0x2d0 block/blk-exec.c:78 sg_common_write.isra.19+0xd74/0x1dc0 drivers/scsi/sg.c:847 sg_write.part.23+0x6e0/0xd00 drivers/scsi/sg.c:716 sg_write+0x64/0xa0 drivers/scsi/sg.c:622 __vfs_write+0xed/0x690 fs/read_write.c:485 kill_bdev:block_device:00000000e138492c vfs_write+0x184/0x4c0 fs/read_write.c:549 ksys_write+0x107/0x240 fs/read_write.c:599 do_syscall_64+0xc2/0x560 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe We get 'alen' from command its type is int. If userspace passes a large length we will get a negative 'alen'. Switch n, alen, and rlen to u32. Link: https://lore.kernel.org/r/20211013033913.2551004-3-yebin10@huawei.com Acked-by: Douglas Gilbert Signed-off-by: Ye Bin Signed-off-by: Martin K. Petersen Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/scsi/scsi_debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 63504dc63d87..3fc7c2a31c19 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1895,8 +1895,9 @@ static int resp_report_tgtpgs(struct scsi_cmnd *scp, unsigned char *cmd = scp->cmnd; unsigned char *arr; int host_no = devip->sdbg_host->shost->host_no; - int n, ret, alen, rlen; int port_group_a, port_group_b, port_a, port_b; + u32 alen, n, rlen; + int ret; alen = get_unaligned_be32(cmd + 6); arr = kzalloc(SDEBUG_MAX_TGTPGS_ARR_SZ, GFP_ATOMIC); @@ -1958,9 +1959,9 @@ static int resp_report_tgtpgs(struct scsi_cmnd *scp, * - The constructed command length * - The maximum array size */ - rlen = min_t(int, alen, n); + rlen = min(alen, n); ret = fill_from_dev_buffer(scp, arr, - min_t(int, rlen, SDEBUG_MAX_TGTPGS_ARR_SZ)); + min_t(u32, rlen, SDEBUG_MAX_TGTPGS_ARR_SZ)); kfree(arr); return ret; } -- Gitee From d04b23f6319bca4a54c288cdfdbf0a9c28dc9496 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 15 Nov 2021 19:49:06 +0800 Subject: [PATCH 067/134] PM: hibernate: Get block device exclusively in swsusp_check() mainline inclusion from mainline-v5.17 commit 39fbef4b0f77f9c89c8f014749ca533643a37c9f category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=39fbef4b0f77f9c89c8f014749ca533643a37c9f Signed-off-by: Yu Changchun ----------------------------------------------- The following kernel crash can be triggered: [ 89.266592] ------------[ cut here ]------------ [ 89.267427] kernel BUG at fs/buffer.c:3020! 
[ 89.268264] invalid opcode: 0000 [#1] SMP KASAN PTI [ 89.269116] CPU: 7 PID: 1750 Comm: kmmpd-loop0 Not tainted 5.10.0-862.14.0.6.x86_64-08610-gc932cda3cef4-dirty #20 [ 89.273169] RIP: 0010:submit_bh_wbc.isra.0+0x538/0x6d0 [ 89.277157] RSP: 0018:ffff888105ddfd08 EFLAGS: 00010246 [ 89.278093] RAX: 0000000000000005 RBX: ffff888124231498 RCX: ffffffffb2772612 [ 89.279332] RDX: 1ffff11024846293 RSI: 0000000000000008 RDI: ffff888124231498 [ 89.280591] RBP: ffff8881248cc000 R08: 0000000000000001 R09: ffffed1024846294 [ 89.281851] R10: ffff88812423149f R11: ffffed1024846293 R12: 0000000000003800 [ 89.283095] R13: 0000000000000001 R14: 0000000000000000 R15: ffff8881161f7000 [ 89.284342] FS: 0000000000000000(0000) GS:ffff88839b5c0000(0000) knlGS:0000000000000000 [ 89.285711] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 89.286701] CR2: 00007f166ebc01a0 CR3: 0000000435c0e000 CR4: 00000000000006e0 [ 89.287919] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 89.289138] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 89.290368] Call Trace: [ 89.290842] write_mmp_block+0x2ca/0x510 [ 89.292218] kmmpd+0x433/0x9a0 [ 89.294902] kthread+0x2dd/0x3e0 [ 89.296268] ret_from_fork+0x22/0x30 [ 89.296906] Modules linked in: by running the following commands: 1. mkfs.ext4 -O mmp /dev/sda -b 1024 2. mount /dev/sda /home/test 3. echo "/dev/sda" > /sys/power/resume That happens because swsusp_check() calls set_blocksize() on the target partition which confuses the file system: Thread1 Thread2 mount /dev/sda /home/test get s_mmp_bh --> has mapped flag start kmmpd thread echo "/dev/sda" > /sys/power/resume resume_store software_resume swsusp_check set_blocksize truncate_inode_pages_range truncate_cleanup_page block_invalidatepage discard_buffer --> clean mapped flag write_mmp_block submit_bh submit_bh_wbc BUG_ON(!buffer_mapped(bh)) To address this issue, modify swsusp_check() to open the target block device with exclusive access. Signed-off-by: Ye Bin [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- kernel/power/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 72e33054a2e1..c9126606fa6f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1521,9 +1521,10 @@ int swsusp_read(unsigned int *flags_p) int swsusp_check(void) { int error; + void *holder; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ, NULL); + FMODE_READ | FMODE_EXCL, &holder); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1545,7 +1546,7 @@ int swsusp_check(void) put: if (error) - blkdev_put(hib_resume_bdev, FMODE_READ); + blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL); else pr_debug("Image signature found, resuming\n"); } else {
-- Gitee From ef9d20b69f202dc9003424ebe9fdb10df41ed52f Mon Sep 17 00:00:00 2001
From: Ye Bin
Date: Mon, 15 Nov 2021 19:49:07 +0800
Subject: [PATCH 068/134] nbd: Fix use-after-free in pid_show
mainline inclusion from mainline-v5.16 commit 0c98057be9efa32de78dbc4685fc73da9d71faa1 category: bugfix issue: #I4PYGY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c98057be9efa32de78dbc4685fc73da9d71faa1
Signed-off-by: Yu Changchun
-----------------------------------------------
I got an issue as follows: [ 263.886511] BUG: KASAN: use-after-free in pid_show+0x11f/0x13f [ 263.888359] Read of size 4 at addr ffff8880bf0648c0 by task cat/746 [ 263.890479] CPU: 0 PID: 746 Comm: cat Not tainted 4.19.90-dirty #140 [ 263.893162] Call Trace: [ 263.893509] dump_stack+0x108/0x15f [ 263.893999] print_address_description+0xa5/0x372 [ 263.894641] kasan_report.cold+0x236/0x2a8 [ 263.895696] __asan_report_load4_noabort+0x25/0x30 [ 263.896365] pid_show+0x11f/0x13f [ 263.897422] dev_attr_show+0x48/0x90 [ 263.898361] sysfs_kf_seq_show+0x24d/0x4b0 [ 263.899479] kernfs_seq_show+0x14e/0x1b0 [ 263.900029] seq_read+0x43f/0x1150 [ 263.900499] kernfs_fop_read+0xc7/0x5a0 [ 263.903764] vfs_read+0x113/0x350 [ 263.904231] ksys_read+0x103/0x270 [ 263.905230] __x64_sys_read+0x77/0xc0 [ 263.906284] do_syscall_64+0x106/0x360 [ 263.906797] entry_SYSCALL_64_after_hwframe+0x44/0xa9
Reproduce this issue as follows: 1. nbd-server 8000 /tmp/disk 2. nbd-client localhost 8000 /dev/nbd1 3. cat /sys/block/nbd1/pid This triggers the use-after-free in pid_show(). The reason is that after step '2' the nbd-client process has already exited, so its task_struct has already been freed. To solve this issue, revert part of commit 6521d39a64b3's change and remove the now-useless 'task_recv' member of nbd_device.
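The shape of the fix, sketched from the hunks below: cache the plain pid number when the device is started, and let the sysfs handler print that cached integer instead of dereferencing a task_struct that may already be gone.

	/* nbd_start_device(): a plain integer is safe to keep around */
	nbd->pid = task_pid_nr(current);

	static ssize_t pid_show(struct device *dev,
				struct device_attribute *attr, char *buf)
	{
		struct gendisk *disk = dev_to_disk(dev);
		struct nbd_device *nbd = (struct nbd_device *)disk->private_data;

		/* no task_struct dereference, so no use-after-free */
		return sprintf(buf, "%d\n", nbd->pid);
	}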
Fixes: 6521d39a64b3 ("nbd: Remove variable 'pid'") Signed-off-by: Ye Bin Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20211020073959.2679255-1-yebin10@huawei.com Signed-off-by: Jens Axboe conflicts: drivers/block/nbd.c Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 051ba02f996f..f3896f1c27f4 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -114,11 +114,11 @@ struct nbd_device { struct workqueue_struct *recv_workq; struct list_head list; - struct task_struct *task_recv; struct task_struct *task_setup; struct completion *destroy_complete; unsigned long flags; + pid_t pid; /* pid of nbd-client, if attached */ }; #define NBD_CMD_REQUEUED 1 @@ -214,7 +214,7 @@ static ssize_t pid_show(struct device *dev, struct gendisk *disk = dev_to_disk(dev); struct nbd_device *nbd = (struct nbd_device *)disk->private_data; - return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); + return sprintf(buf, "%d\n", nbd->pid); } static const struct device_attribute pid_attr = { @@ -333,7 +333,7 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, struct nbd_config *config = nbd->config; config->blksize = blocksize; config->bytesize = blocksize * nr_blocks; - if (nbd->task_recv != NULL) + if (nbd->pid) nbd_size_update(nbd, false); } @@ -1284,7 +1284,7 @@ static void nbd_config_put(struct nbd_device *nbd) if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags)) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - nbd->task_recv = NULL; + nbd->pid = 0; nbd_clear_sock(nbd); if (config->num_connections) { int i; @@ -1319,7 +1319,7 @@ static int nbd_start_device(struct nbd_device *nbd) int num_connections = config->num_connections; int error = 0, i; - if (nbd->task_recv) + if (nbd->pid) return -EBUSY; if (!config->socks) return -EINVAL; @@ -1338,7 +1338,7 @@ static int nbd_start_device(struct nbd_device *nbd) } blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); - nbd->task_recv = current; + nbd->pid = task_pid_nr(current); nbd_parse_flags(nbd); @@ -1612,8 +1612,8 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) { struct nbd_device *nbd = s->private; - if (nbd->task_recv) - seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv)); + if (nbd->pid) + seq_printf(s, "recv: %d\n", nbd->pid); return 0; } @@ -2188,7 +2188,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) mutex_lock(&nbd->config_lock); config = nbd->config; if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || - !nbd->task_recv) { + !nbd->pid) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); ret = -EINVAL; -- Gitee From f6278454bd9d4d1218614d1ea809a753fc05d607 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Mon, 15 Nov 2021 19:49:08 +0800 Subject: [PATCH 069/134] Fix NULL pointer dereference in handling for passthrough commands maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- In passthrough path. If the command size for ioctl request from userspace is 0. The original process will get cmd_len from cmd->cmnd, but It has not been assigned at this time. So it will trigger a NULL pointer BUG. 
------------[ cut here ]------------ BUG: kernel NULL pointer dereference, address: 0000000000000000 PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page RIP: 0010:scsi_queue_rq+0xcb2/0x12b0 Call Trace: blk_mq_dispatch_rq_list+0x541/0xe90 __blk_mq_sched_dispatch_requests+0x1fe/0x2b0 blk_mq_sched_dispatch_requests+0xbf/0x130 __blk_mq_run_hw_queue+0x15b/0x230 __blk_mq_delay_run_hw_queue+0x18f/0x320 blk_mq_run_hw_queue+0x252/0x280 blk_mq_sched_insert_request+0x228/0x260 blk_execute_rq+0x111/0x160 sg_io+0x51a/0x740 scsi_cmd_ioctl+0x533/0x910 scsi_cmd_blk_ioctl+0xa1/0xb0 cdrom_ioctl+0x3f/0x2510 sr_block_ioctl+0x142/0x180 blkdev_ioctl+0x398/0x450 block_ioctl+0x6d/0x80 __se_sys_ioctl+0xd1/0x140 __x64_sys_ioctl+0x3f/0x50 do_syscall_64+0x37/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9
We can trigger the above BUG with the ioctl sequence below. ------------[ cut here ]------------ sg_io_hdr_t *addr; addr = malloc(sizeof(sg_io_hdr_t)); memset(addr, 0, sizeof(sg_io_hdr_t)); addr->interface_id = 'S'; fd = open("/dev/sr0", O_RDONLY); // open a CD-ROM dev ioctl(fd, SG_IO, addr); // all zero sg_io_hdr_t will trigger this bug
Fixes: 2ceda20f0a99a ("scsi: core: Move command size detection out of the fast path") Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- drivers/scsi/scsi_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d89db29fa829..b26c922bd65d 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1193,9 +1193,9 @@ static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, } cmd->cmd_len = scsi_req(req)->cmd_len; + cmd->cmnd = scsi_req(req)->cmd; if (cmd->cmd_len == 0) cmd->cmd_len = scsi_command_size(cmd->cmnd); - cmd->cmnd = scsi_req(req)->cmd; cmd->transfersize = blk_rq_bytes(req); cmd->allowed = scsi_req(req)->retries; return BLK_STS_OK;
-- Gitee From 4fa4dba9c1cbf355fdfead50c2006e5917f6055f Mon Sep 17 00:00:00 2001
From: Laibin Qiu
Date: Mon, 15 Nov 2021 19:49:09 +0800
Subject: [PATCH 070/134] loop: fix loop_validate_block_size() can't make sense
maillist inclusion category: bugfix issue: #I4PYGY CVE: NA
Signed-off-by: Yu Changchun
---------------------------
We configure the block size of a loop device through the following call chain: lo_ioctl(..., unsigned long arg) | ^^^^ lo_simple_ioctl(..., unsigned long arg) | ^^^^ loop_set_block_size(..., unsigned long arg) | ^^^^ loop_validate_block_size(unsigned short bsize) | ^^^^^ blk_queue_logical_block_size(..., unsigned int size) { ''' limits->logical_block_size = size; ^^^^ int ''' }
loop_validate_block_size() will check the validity of bsize, but the unsigned long argument is truncated to unsigned short (e.g. arg=1049600 becomes 1024 after truncation). The block size must be within the range of 512 ~ PAGE_SIZE, so this truncation can turn an illegal block size into a legal one. The wrong block size will then raise other errors.
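The truncation itself can be demonstrated with a few lines of standalone userspace C (an illustration, not driver code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long arg = 1049600;	/* 0x100400, an illegal block size */
		unsigned short bsize = arg;	/* low 16 bits only: 0x0400 */

		printf("bsize = %hu\n", bsize);	/* prints 1024, now "legal" */
		return 0;
	}

Taking the parameter as unsigned long, as the hunk below does, keeps the full value for the range and power-of-two checks.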
Fixes: 3448914e8cc55 ("loop: Add LOOP_CONFIGURE ioctl") Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f0fa0c8e7ec6..5dd8bd480e29 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -233,7 +233,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) * @bsize: size to validate */ static int -loop_validate_block_size(unsigned short bsize) +loop_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) return -EINVAL; -- Gitee From daecdc92a6e81384346aea6558c1024fadcbfe49 Mon Sep 17 00:00:00 2001 From: Zheng Liang Date: Mon, 15 Nov 2021 19:50:16 +0800 Subject: [PATCH 071/134] block, bfq: fix UAF problem in bfqg_stats_init() mainline inclusion from mainline commit 2fc428f6b7ca80794cb9928c90d4de524366659f category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2fc428f6b7ca80794cb9928c90d4de524366659f Signed-off-by: Yu Changchun ------------------------------------------------- In bfq_pd_alloc(), the function bfqg_stats_init() init bfqg. If blkg_rwstat_init() init bfqg_stats->bytes successful and init bfqg_stats->ios failed, bfqg_stats_init() return failed, bfqg will be freed. But blkg_rwstat->cpu_cnt is not deleted from the list of percpu_counters. If we traverse the list of percpu_counters, It will have UAF problem. we should use blkg_rwstat_exit() to cleanup bfqg_stats bytes in the above scenario. Fixes: commit fd41e60331b ("bfq-iosched: stop using blkg->stat_bytes and ->stat_ios") Signed-off-by: Zheng Liang Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20211018024225.1493938-1-zhengliang6@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/bfq-cgroup.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b791e2041e49..a6bcc779c912 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) - return -ENOMEM; + goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || @@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || - bfq_stat_init(&stats->empty_time, gfp)) { - bfqg_stats_exit(stats); - return -ENOMEM; - } + bfq_stat_init(&stats->empty_time, gfp)) + goto error; #endif return 0; + +error: + bfqg_stats_exit(stats); + return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -- Gitee From 7ec45656ed1dc5ccea18512c208da6e77ddab701 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 15 Nov 2021 19:50:17 +0800 Subject: [PATCH 072/134] ext4: ensure enough credits in ext4_ext_shift_path_extents maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Like ext4_ext_rm_leaf, we can ensure that there are enough credits before every call that will consume credits. 
As part of this fix we fold the functionality of ext4_access_path()
into ext4_ext_shift_path_extents(). This change is needed as a
preparation for the next bugfix patch.

Cc: stable@kernel.org
Link: https://lore.kernel.org/r/20210903062748.4118886-3-yangerkun@huawei.com
Signed-off-by: yangerkun
Reviewed-by: Jan Kara
Signed-off-by: Theodore Ts'o
Reviewed-by: Zhang Yi
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ext4/extents.c | 49 +++++++++++++++--------------------------------
 1 file changed, 15 insertions(+), 34 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74b69d83c198..a16be67a1fe8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5014,36 +5014,6 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo);
 }
 
-/*
- * ext4_access_path:
- * Function to access the path buffer for marking it dirty.
- * It also checks if there are sufficient credits left in the journal handle
- * to update path.
- */
-static int
-ext4_access_path(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path)
-{
-	int credits, err;
-
-	if (!ext4_handle_valid(handle))
-		return 0;
-
-	/*
-	 * Check if need to extend journal credits
-	 * 3 for leaf, sb, and inode plus 2 (bmap and group
-	 * descriptor) for each block group; assume two block
-	 * groups
-	 */
-	credits = ext4_writepage_trans_blocks(inode);
-	err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
-	if (err < 0)
-		return err;
-
-	err = ext4_ext_get_access(handle, inode, path);
-	return err;
-}
-
 /*
  * ext4_ext_shift_path_extents:
  * Shift the extents of a path structure lying between path[depth].p_ext
@@ -5058,6 +5028,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 	int depth, err = 0;
 	struct ext4_extent *ex_start, *ex_last;
 	bool update = false;
+	int credits, restart_credits;
 
 	depth = path->p_depth;
 
 	while (depth >= 0) {
@@ -5067,13 +5038,23 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 			return -EFSCORRUPTED;
 
 		ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+		/* leaf + sb + inode */
+		credits = 3;
+		if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+			update = true;
+			/* extent tree + sb + inode */
+			credits = depth + 2;
+		}
 
-		err = ext4_access_path(handle, inode, path + depth);
+		restart_credits = ext4_writepage_trans_blocks(inode);
+		err = ext4_datasem_ensure_credits(handle, inode, credits,
+				restart_credits, 0);
 		if (err)
 			goto out;
 
-		if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
-			update = true;
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
 
 		while (ex_start <= ex_last) {
 			if (SHIFT == SHIFT_LEFT) {
@@ -5104,7 +5085,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 		}
 
 		/* Update index too */
-		err = ext4_access_path(handle, inode, path + depth);
+		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			goto out;
 
-- 
Gitee

From 1de907a470a804238764822c1dee2a00100e3eab Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Mon, 15 Nov 2021 19:50:18 +0800
Subject: [PATCH 073/134] ext4: refresh the ext4_ext_path struct after
 dropping i_data_sem.

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

After we drop i_data_sem, we need to reload the ext4_ext_path structure
since the extent tree can change once i_data_sem is released.
This addresses the BUG: [52117.465187] ------------[ cut here ]------------ [52117.465686] kernel BUG at fs/ext4/extents.c:1756! ... [52117.478306] Call Trace: [52117.478565] ext4_ext_shift_extents+0x3ee/0x710 [52117.479020] ext4_fallocate+0x139c/0x1b40 [52117.479405] ? __do_sys_newfstat+0x6b/0x80 [52117.479805] vfs_fallocate+0x151/0x4b0 [52117.480177] ksys_fallocate+0x4a/0xa0 [52117.480533] __x64_sys_fallocate+0x22/0x30 [52117.480930] do_syscall_64+0x35/0x80 [52117.481277] entry_SYSCALL_64_after_hwframe+0x44/0xae [52117.481769] RIP: 0033:0x7fa062f855ca Cc: stable@kernel.org Link: https://lore.kernel.org/r/20210903062748.4118886-4-yangerkun@huawei.com Signed-off-by: yangerkun Signed-off-by: Theodore Ts'o Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/extents.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a16be67a1fe8..f1548e52496c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5049,8 +5049,11 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, restart_credits = ext4_writepage_trans_blocks(inode); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); - if (err) + if (err) { + if (err > 0) + err = -EAGAIN; goto out; + } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -5124,6 +5127,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, int ret = 0, depth; struct ext4_extent *extent; ext4_lblk_t stop, *iterator, ex_start, ex_end; + ext4_lblk_t tmp = EXT_MAX_BLOCKS; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, @@ -5177,11 +5181,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * till we reach stop. In case of right shift, iterator points to stop * and it is decreased till we reach start. */ +again: if (SHIFT == SHIFT_LEFT) iterator = &start; else iterator = &stop; + if (tmp != EXT_MAX_BLOCKS) + *iterator = tmp; + /* * Its safe to start updating extents. Start and stop are unsigned, so * in case of right shift if extent with 0 block is reached, iterator @@ -5210,6 +5218,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, } } + tmp = *iterator; if (SHIFT == SHIFT_LEFT) { extent = EXT_LAST_EXTENT(path[depth].p_hdr); *iterator = le32_to_cpu(extent->ee_block) + @@ -5228,6 +5237,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT); + /* iterator can be NULL which means we should break */ + if (ret == -EAGAIN) + goto again; if (ret) break; } -- Gitee From 846ad258d3d81cac41a2b9e5f91449b81d7de615 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 15 Nov 2021 19:50:20 +0800 Subject: [PATCH 074/134] blk-mq: Introduce the BLK_MQ_F_NO_SCHED_BY_DEFAULT flag mainline inclusion from mainline-5.15-rc1 commit 90b7198001f23ea37d3b46dc631bdaa2357a20b1 category: performance issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=90b7198001f23ea37d3b46dc631bdaa2357a20b1 Signed-off-by: Yu Changchun --------------------------- elevator_get_default() uses the following algorithm to select an I/O scheduler from inside add_disk(): - In case of a single hardware queue or if sharing hardware queues across multiple request queues (BLK_MQ_F_TAG_HCTX_SHARED), use mq-deadline. - Otherwise, use 'none'. 
This is a good choice for most but not for all block drivers. Make it
possible to override the selection of mq-deadline with a new flag,
namely BLK_MQ_F_NO_SCHED_BY_DEFAULT.

Cc: Christoph Hellwig
Cc: Ming Lei
Cc: Tetsuo Handa
Cc: Martijn Coenen
Cc: Jaegeuk Kim
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20210805174200.3250718-2-bvanassche@acm.org
Signed-off-by: Jens Axboe

Conflicts:
	block/elevator.c

Signed-off-by: yangerkun
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/elevator.c       | 3 +++
 include/linux/blk-mq.h | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/block/elevator.c b/block/elevator.c
index 2a525863d4e9..4ce6b22813a1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -624,6 +624,9 @@ static inline bool elv_support_iosched(struct request_queue *q)
  */
 static struct elevator_type *elevator_get_default(struct request_queue *q)
 {
+	if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
+		return NULL;
+
 	if (q->nr_hw_queues != 1)
 		return NULL;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 92bbc9a72355..eee2c8a16601 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -399,7 +399,13 @@ enum {
 	BLK_MQ_F_STACKING	= 1 << 2,
 	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
+	/* Do not allow an I/O scheduler to be configured. */
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
+	/*
+	 * Select 'none' during queue registration in case of a single hwq
+	 * or shared hwqs instead of 'mq-deadline'.
+	 */
+	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 7,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
-- 
Gitee

From df8dbd6a2d544b655f74b385a8d18f9a5f78b15d Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Mon, 15 Nov 2021 19:50:21 +0800
Subject: [PATCH 075/134] loop: Select I/O scheduler 'none' from inside
 add_disk()

mainline inclusion
from mainline-5.15-rc1
commit 2112f5c1330a671fa852051d85cb9eadc05d7eb7
category: performance
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2112f5c1330a671fa852051d85cb9eadc05d7eb7

Signed-off-by: Yu Changchun

---------------------------

We noticed that the user interface of Android devices becomes very slow
under memory pressure. This is because Android uses the zram driver on
top of the loop driver for swapping; under memory pressure the swap code
alternates reads and writes quickly, mq-deadline is the default
scheduler for loop devices, and mq-deadline delays writes by five
seconds for such a workload with default settings. Fix this by making
the kernel select I/O scheduler 'none' from inside add_disk() for loop
devices. This default can be overridden at any time from user space,
e.g. via a udev rule. This approach has an advantage compared to
changing the I/O scheduler from userspace from 'mq-deadline' into
'none', namely that synchronize_rcu() does not get called.

This patch changes the default I/O scheduler for loop devices from
'mq-deadline' into 'none'. Additionally, this patch reduces the Android
boot time on my test setup by 0.5 seconds compared to configuring the
loop I/O scheduler from user space.
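Since the message mentions overriding the default from user space, here
is a minimal hypothetical sketch of doing so through sysfs (the device
name loop0 is an assumption):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* re-enable mq-deadline on one loop device from user space */
	int fd = open("/sys/block/loop0/queue/scheduler", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, "mq-deadline", 11);
	close(fd);
	return 0;
}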
Cc: Christoph Hellwig
Cc: Ming Lei
Cc: Tetsuo Handa
Cc: Martijn Coenen
Cc: Jaegeuk Kim
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20210805174200.3250718-3-bvanassche@acm.org
Signed-off-by: Jens Axboe
Signed-off-by: yangerkun
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/block/loop.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5dd8bd480e29..84f3191f4566 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2117,7 +2117,8 @@ static int loop_add(struct loop_device **l, int i)
 	lo->tag_set.queue_depth = 128;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
+	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING |
+		BLK_MQ_F_NO_SCHED_BY_DEFAULT;
 	lo->tag_set.driver_data = lo;
 
 	err = blk_mq_alloc_tag_set(&lo->tag_set);
-- 
Gitee

From 79c8108f41abe1c508986a3aceced9884809cc8e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Mon, 15 Nov 2021 19:50:22 +0800
Subject: [PATCH 076/134] ext4: if zeroout fails fall back to splitting the
 extent node

mainline inclusion
from mainline-5.15-rc1
commit 308c57ccf4318236be75dfa251c84713e694457b
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=308c57ccf4318236be75dfa251c84713e694457b

Signed-off-by: Yu Changchun

---------------------------

If the underlying storage device is using thin-provisioning, it's
possible for a zeroout operation to return ENOSPC. Commit df22291ff0fd
("ext4: Retry block allocation if we have free blocks left") added
logic to retry block allocation since we might get free blocks after we
commit a transaction. But the ENOSPC from thin-provisioning will
confuse ext4, and lead to an infinite loop.

Since using zeroout instead of splitting the extent node is an
optimization, if it fails, we might as well fall back to splitting the
extent node.
Reported-by: yangerkun
Signed-off-by: Theodore Ts'o
Signed-off-by: yangerkun
Reviewed-by: Zhang Yi
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ext4/extents.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f1548e52496c..398df993ccd8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3612,7 +3612,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				split_map.m_len - ee_block);
 			err = ext4_ext_zeroout(inode, &zero_ex1);
 			if (err)
-				goto out;
+				goto fallback;
 			split_map.m_len = allocated;
 		}
 		if (split_map.m_lblk - ee_block + split_map.m_len <
@@ -3626,7 +3626,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 						      ext4_ext_pblock(ex));
 				err = ext4_ext_zeroout(inode, &zero_ex2);
 				if (err)
-					goto out;
+					goto fallback;
 			}
 
 			split_map.m_len += split_map.m_lblk - ee_block;
@@ -3635,6 +3635,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		}
 	}
 
+fallback:
 	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
 				flags);
 	if (err > 0)
-- 
Gitee

From d811d9768edcb4cd1ae013f2bcd27f20b979fef5 Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Mon, 15 Nov 2021 19:50:23 +0800
Subject: [PATCH 077/134] ovl: fix use after free in struct ovl_aio_req

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

Example of triggering a use-after-free in an overlay-on-ext4 setup:

aio_read
 ovl_read_iter
  vfs_iter_read
   ext4_file_read_iter
    ext4_dio_read_iter
     iomap_dio_rw -> -EIOCBQUEUED
     /*
      * Here IO is completed in a separate thread,
      * ovl_aio_cleanup_handler() frees aio_req which has iocb embedded
      */
     file_accessed(iocb->ki_filp); /**BOOM**/

Fix by introducing a refcount in ovl_aio_req, similar to aio_kiocb.
This guarantees that iocb is only freed after vfs_read/write_iter()
returns on the underlying fs.
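In sketch form, the two references and who drops them (mirroring the
read path in the diff below):

	refcount_set(&aio_req->ref, 2);	/* one ref for the submitter, one
					 * for the completion handler */
	ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
	ovl_aio_put(aio_req);		/* drop the submitter's ref; the
					 * request is freed only once the
					 * completion side has also put */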
Fixes: 2406a307ac7d ("ovl: implement async IO routines") Signed-off-by: yangerkun Link: https://lore.kernel.org/r/20210930032228.3199690-3-yangerkun@huawei.com/ Cc: # v5.6 Signed-off-by: Miklos Szeredi Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/overlayfs/file.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index f7135777cb4e..d1ae643a999a 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -17,6 +17,7 @@ struct ovl_aio_req { struct kiocb iocb; + refcount_t ref; struct kiocb *orig_iocb; struct fd fd; }; @@ -257,6 +258,14 @@ static rwf_t ovl_iocb_to_rwf(int ifl) return flags; } +static inline void ovl_aio_put(struct ovl_aio_req *aio_req) +{ + if (refcount_dec_and_test(&aio_req->ref)) { + fdput(aio_req->fd); + kmem_cache_free(ovl_aio_request_cachep, aio_req); + } +} + static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) { struct kiocb *iocb = &aio_req->iocb; @@ -273,8 +282,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) } orig_iocb->ki_pos = iocb->ki_pos; - fdput(aio_req->fd); - kmem_cache_free(ovl_aio_request_cachep, aio_req); + ovl_aio_put(aio_req); } static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) @@ -324,7 +332,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) aio_req->orig_iocb = iocb; kiocb_clone(&aio_req->iocb, iocb, real.file); aio_req->iocb.ki_complete = ovl_aio_rw_complete; + refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); + ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) ovl_aio_cleanup_handler(aio_req); } @@ -395,7 +405,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) kiocb_clone(&aio_req->iocb, iocb, real.file); aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; + refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); + ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) ovl_aio_cleanup_handler(aio_req); } -- Gitee From c3f9ad79e2f317e9a72e7e541d5b3624a7025c05 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Nov 2021 19:50:24 +0800 Subject: [PATCH 078/134] net: make free_netdev() more lenient with unregistering devices mainline inclusion from mainline-v5.11-rc4 commit c269a24ce057abfc31130960e96ab197ef6ab196 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c269a24ce057abfc31130960e96ab197ef6ab196 Signed-off-by: Yu Changchun ------------------------------------------------- There are two flavors of handling netdev registration: - ones called without holding rtnl_lock: register_netdev() and unregister_netdev(); and - those called with rtnl_lock held: register_netdevice() and unregister_netdevice(). While the semantics of the former are pretty clear, the same can't be said about the latter. The netdev_todo mechanism is utilized to perform some of the device unregistering tasks and it hooks into rtnl_unlock() so the locked variants can't actually finish the work. In general free_netdev() does not mix well with locked calls. Most drivers operating under rtnl_lock set dev->needs_free_netdev to true and expect core to make the free_netdev() call some time later. The part where this becomes most problematic is error paths. 
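The error-path shape in question, which this patch makes safe to write,
is roughly the following sketch (rtnl_lock held):

	err = register_netdevice(dev);
	if (err < 0) {
		/* safe even though the unwind may still be running:
		 * free_netdev() defers the free while the device is
		 * in NETREG_UNREGISTERING state */
		free_netdev(dev);
		goto out;
	}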
There is no way to unwind the state cleanly after a call to register_netdevice(), since unreg can't be performed fully without dropping locks. Make free_netdev() more lenient, and defer the freeing if device is being unregistered. This allows error paths to simply call free_netdev() both after register_netdevice() failed, and after a call to unregister_netdevice() but before dropping rtnl_lock. Simplify the error paths which are currently doing gymnastics around free_netdev() handling. Signed-off-by: Jakub Kicinski Signed-off-by: Xu Jia Reviewed-by: Yue Haibing Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- net/8021q/vlan.c | 4 +--- net/core/dev.c | 11 +++++++++++ net/core/rtnetlink.c | 23 ++++++----------------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 15bbfaf943fd..8b644113715e 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -284,9 +284,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) return 0; out_free_newdev: - if (new_dev->reg_state == NETREG_UNINITIALIZED || - new_dev->reg_state == NETREG_UNREGISTERED) - free_netdev(new_dev); + free_netdev(new_dev); return err; } diff --git a/net/core/dev.c b/net/core/dev.c index ae237a498569..b0c3dd268913 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10661,6 +10661,17 @@ void free_netdev(struct net_device *dev) struct napi_struct *p, *n; might_sleep(); + + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. + */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + netif_free_tx_queues(dev); netif_free_rx_queues(dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 27ffa83ffeb3..e08b506163f5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3438,26 +3438,15 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, dev->ifindex = ifm->ifi_index; - if (ops->newlink) { + if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. 
- */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED || - dev->reg_state == NETREG_UNREGISTERED) - free_netdev(dev); - goto out; - } - } else { + else err = register_netdevice(dev); - if (err < 0) { - free_netdev(dev); - goto out; - } + if (err < 0) { + free_netdev(dev); + goto out; } + err = rtnl_configure_link(dev, ifm); if (err < 0) goto out_unregister; -- Gitee From 1bb3e6b185c454190f81d6f3a8f34fcbe49f0f0c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Nov 2021 19:50:25 +0800 Subject: [PATCH 079/134] bpf: Add ambient BPF runtime context stored in current mainline inclusion from mainline-v5.15-rc1 commit c7603cfa04e7c3a435b31d065f7cbdc829428f6e category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c7603cfa04e7c3a435b31d065f7cbdc829428f6e Signed-off-by: Yu Changchun -------------------------------- b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed the problem with cgroup-local storage use in BPF by pre-allocating per-CPU array of 8 cgroup storage pointers to accommodate possible BPF program preemptions and nested executions. While this seems to work good in practice, it introduces new and unnecessary failure mode in which not all BPF programs might be executed if we fail to find an unused slot for cgroup storage, however unlikely it is. It might also not be so unlikely when/if we allow sleepable cgroup BPF programs in the future. Further, the way that cgroup storage is implemented as ambiently-available property during entire BPF program execution is a convenient way to pass extra information to BPF program and helpers without requiring user code to pass around extra arguments explicitly. So it would be good to have a generic solution that can allow implementing this without arbitrary restrictions. Ideally, such solution would work for both preemptable and sleepable BPF programs in exactly the same way. This patch introduces such solution, bpf_run_ctx. It adds one pointer field (bpf_ctx) to task_struct. This field is maintained by BPF_PROG_RUN family of macros in such a way that it always stays valid throughout BPF program execution. BPF program preemption is handled by remembering previous current->bpf_ctx value locally while executing nested BPF program and restoring old value after nested BPF program finishes. This is handled by two helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are supposed to be used before and after BPF program runs, respectively. Restoring old value of the pointer handles preemption, while bpf_run_ctx pointer being a property of current task_struct naturally solves this problem for sleepable BPF programs by "following" BPF program execution as it is scheduled in and out of CPU. It would even allow CPU migration of BPF programs, even though it's not currently allowed by BPF infra. This patch cleans up cgroup local storage handling as a first application. The design itself is generic, though, with bpf_run_ctx being an empty struct that is supposed to be embedded into a specific struct for a given BPF program type (bpf_cg_run_ctx in this case). Follow up patches are planned that will expand this mechanism for other uses within tracing BPF programs. To verify that this change doesn't revert the fix to the original cgroup storage issue, I ran the same repro as in the original report ([0]) and didn't get any problems. 
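The resulting usage pattern around program runs, in sketch form
(mirroring the BPF_PROG_RUN_ARRAY changes below):

	struct bpf_cg_run_ctx run_ctx;
	struct bpf_run_ctx *old_run_ctx;

	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
	run_ctx.prog_item = item;	/* helpers find this via current->bpf_ctx */
	ret = BPF_PROG_RUN(prog, ctx);
	bpf_reset_run_ctx(old_run_ctx);	/* restore the previous context,
					 * which handles nesting/preemption */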
Replacing bpf_reset_run_ctx(old_run_ctx) with bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so repro does work). [0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/ Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org Conflicts: include/linux/bpf.h include/linux/sched.h kernel/fork.c net/bpf/test_run.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf-cgroup.h | 54 -------------------------------------- include/linux/bpf.h | 45 ++++++++++++++++++++++--------- include/linux/sched.h | 5 ++++ kernel/bpf/helpers.c | 16 ++++------- kernel/bpf/local_storage.c | 3 --- kernel/fork.c | 3 +++ net/bpf/test_run.c | 22 ++++++++-------- 7 files changed, 57 insertions(+), 91 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 91b966978541..5120ca319ff4 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -27,19 +27,6 @@ struct task_struct; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) -#define BPF_CGROUP_STORAGE_NEST_MAX 8 - -struct bpf_cgroup_storage_info { - struct task_struct *task; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; -}; - -/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks - * to use bpf cgroup storage simultaneously. - */ -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -167,44 +154,6 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type( return BPF_CGROUP_STORAGE_SHARED; } -static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - enum bpf_cgroup_storage_type stype; - int i, err = 0; - - preempt_disable(); - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, current); - for_each_cgroup_storage_type(stype) - this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], - storage[stype]); - goto out; - } - err = -EBUSY; - WARN_ON_ONCE(1); - -out: - preempt_enable(); - return err; -} - -static inline void bpf_cgroup_storage_unset(void) -{ - int i; - - for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { - if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); - return; - } -} - struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); @@ -450,9 +399,6 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } -static inline int bpf_cgroup_storage_set( - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } -static inline void bpf_cgroup_storage_unset(void) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 474a0d852614..88245386a4b6 100644 --- 
a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1081,11 +1081,20 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); +struct bpf_run_ctx {}; + +struct bpf_cg_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_prog_array_item *prog_item; +}; + #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ migrate_disable(); \ rcu_read_lock(); \ @@ -1093,17 +1102,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, if (unlikely(check_non_null && !_array))\ goto _out; \ _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - if (!set_cg_storage) { \ - _ret &= func(_prog, ctx); \ - } else { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ - _ret &= func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ - } \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ + while ((_prog = READ_ONCE(_item->prog))) { \ + run_ctx.prog_item = _item; \ + _ret &= func(_prog, ctx); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ _out: \ rcu_read_unlock(); \ migrate_enable(); \ @@ -1137,6 +1142,8 @@ _out: \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 ret; \ u32 _ret = 1; \ u32 _cn = 0; \ @@ -1144,15 +1151,15 @@ _out: \ rcu_read_lock(); \ _array = rcu_dereference(array); \ _item = &_array->items[0]; \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ while ((_prog = READ_ONCE(_item->prog))) { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ + run_ctx.prog_item = _item; \ ret = func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ _ret &= (ret & 1); \ _cn |= (ret & 2); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ rcu_read_unlock(); \ migrate_enable(); \ if (_ret) \ @@ -1202,6 +1209,20 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx; + + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ + current->bpf_ctx = old_ctx; +} + extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/sched.h b/include/linux/sched.h index b85b26d9ccef..53198ac3d154 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -40,6 +40,7 @@ struct audit_context; struct backing_dev_info; struct bio_list; struct blk_plug; +struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1340,6 +1341,10 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void *security; #endif +#ifdef CONFIG_BPF_SYSCALL + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; +#endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0efe7c7bfe5e..bb4350de9f11 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -372,8 +372,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(struct 
bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -382,17 +380,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; void *ptr; - int i; - for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { - if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); - break; - } + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index b139247d2dd3..49ca8771d470 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -11,9 +11,6 @@ #ifdef CONFIG_CGROUP_BPF -DEFINE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ diff --git a/kernel/fork.c b/kernel/fork.c index f019852b2e61..39b1783a7613 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2072,6 +2072,9 @@ static __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif +#ifdef CONFIG_BPF_SYSCALL + p->bpf_ctx = NULL; +#endif /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index eb684f31fd69..99712d35e535 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -19,18 +19,20 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_prog_array_item item = {.prog = prog}; + struct bpf_run_ctx *old_ctx; + struct bpf_cg_run_ctx run_ctx; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; int ret = 0; u32 i; for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; + item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(item.cgroup_storage[stype])) { + item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } @@ -41,18 +43,15 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, rcu_read_lock(); migrate_disable(); time_start = ktime_get_ns(); + old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); for (i = 0; i < repeat; i++) { - ret = bpf_cgroup_storage_set(storage); - if (ret) - break; + run_ctx.prog_item = &item; if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx); - bpf_cgroup_storage_unset(); - if (signal_pending(current)) { ret = -EINTR; break; @@ -70,6 +69,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, time_start = ktime_get_ns(); } } + bpf_reset_run_ctx(old_ctx); time_spent += ktime_get_ns() - time_start; migrate_enable(); rcu_read_unlock(); @@ -78,7 +78,7 @@ 
static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 	*time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
 
 	for_each_cgroup_storage_type(stype)
-		bpf_cgroup_storage_free(storage[stype]);
+		bpf_cgroup_storage_free(item.cgroup_storage[stype]);
 
 	return ret;
 }
-- 
Gitee

From 79a5b091735baf1f3580e94fa623b9683b958fe4 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:50:26 +0800
Subject: [PATCH 080/134] nbd: add sanity check for first_minor

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

When a user passes 0x100000 as the index, nbd ends up creating the
sysfs dir "/sys/block/43:0":

nbd_dev_add
 disk->first_minor = index << part_shift
 -> default part_shift is 5, 0x100000 << 5 = 0x2000000
 device_add_disk
  blk_alloc_devt
   MKDEV(disk->major, disk->first_minor + part->partno)
   -> (0x2b << 20) | (0x2000000) = 0x2b00000
  register_disk
   device_add
    device_create_sys_dev_entry
     format_dev_t
      MAJOR(devt) -> 0x2b00000 >> 20 = 0x2b
      MINOR(devt) -> 0x2b00000 & 0xfffff = 0
     sysfs_create_link -> /sys/block/43:0

If nbd has already created a device with index 0, sysfs will then
complain about the duplicated creation. Similarly, a duplicated
creation will happen if "index << part_shift" overflows to a value that
is less than MINORMASK. Fix the problem by adding a sanity check for
first_minor.

Fixes: b0d9111a2d53 ("nbd: use an idr to keep track of nbd devices")
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/block/nbd.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f3896f1c27f4..07b06fc6f70e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1821,7 +1821,18 @@ static int nbd_dev_add(int index)
 	refcount_set(&nbd->refs, 1);
 	INIT_LIST_HEAD(&nbd->list);
 	disk->major = NBD_MAJOR;
+
+	/*
+	 * Too big index can cause duplicate creation of sysfs files/links,
+	 * because MKDEV() expect that the max first minor is MINORMASK, or
+	 * index << part_shift can overflow.
+	 */
 	disk->first_minor = index << part_shift;
+	if (disk->first_minor < index || disk->first_minor > MINORMASK) {
+		err = -EINVAL;
+		goto out_free_tags;
+	}
+
 	disk->fops = &nbd_fops;
 	disk->private_data = nbd;
 	sprintf(disk->disk_name, "nbd%d", index);
-- 
Gitee

From 426e5a4f192cce28968b64011bf7f2c34bf95b58 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 15 Nov 2021 19:50:27 +0800
Subject: [PATCH 081/134] io_uring: deduplicate failing task_work_add

mainline inclusion
from mainline-5.12-rc1
commit eab30c4d20dc761d463445e5130421863ff81505
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=eab30c4d20dc761d463445e5130421863ff81505

Signed-off-by: Yu Changchun

---------------------------

When io_req_task_work_add() fails, the request will be cancelled by
enqueueing via task_works of io-wq. Extract a function for that.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe

Conflicts:
	fs/io_uring.c
[ 355fb9e2b78e78("io_uring: remove 'twa_signal_ok' deadlock work-around")
is not applied.
] Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/io_uring.c | 46 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 104dff9c7131..82b301871224 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2056,6 +2056,16 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) return ret; } +static void io_req_task_work_add_fallback(struct io_kiocb *req, + void (*cb)(struct callback_head *)) +{ + struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq); + + init_task_work(&req->task_work, cb); + task_work_add(tsk, &req->task_work, TWA_NONE); + wake_up_process(tsk); +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; @@ -2113,14 +2123,8 @@ static void io_req_task_queue(struct io_kiocb *req) percpu_ref_get(&req->ctx->refs); ret = io_req_task_work_add(req, true); - if (unlikely(ret)) { - struct task_struct *tsk; - - init_task_work(&req->task_work, io_req_task_cancel); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_req_task_cancel); } static void io_queue_next(struct io_kiocb *req) @@ -2239,13 +2243,8 @@ static void io_free_req_deferred(struct io_kiocb *req) init_task_work(&req->task_work, io_put_req_deferred_cb); ret = io_req_task_work_add(req, true); - if (unlikely(ret)) { - struct task_struct *tsk; - - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_put_req_deferred_cb); } static inline void io_put_req_deferred(struct io_kiocb *req, int refs) @@ -3345,15 +3344,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); ret = io_req_task_work_add(req, true); - if (unlikely(ret)) { - struct task_struct *tsk; - - /* queue just for cancelation */ - init_task_work(&req->task_work, io_req_task_cancel); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_req_task_cancel); return 1; } @@ -4963,12 +4955,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, */ ret = io_req_task_work_add(req, twa_signal_ok); if (unlikely(ret)) { - struct task_struct *tsk; - WRITE_ONCE(poll->canceled, true); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); + io_req_task_work_add_fallback(req, func); } return 1; } -- Gitee From 99af00b884a4beb84425bed3c28403d63947c325 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Nov 2021 19:50:28 +0800 Subject: [PATCH 082/134] io_uring: don't take uring_lock during iowq cancel mainline inclusion from mainline-5.12-rc1 commit 792bb6eb862333658bf1bd2260133f0507e2da8d category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=792bb6eb862333658bf1bd2260133f0507e2da8d Signed-off-by: Yu Changchun --------------------------- [ 97.866748] a.out/2890 is trying to acquire lock: [ 97.867829] ffff8881046763e8 (&ctx->uring_lock){+.+.}-{3:3}, at: io_wq_submit_work+0x155/0x240 [ 97.869735] [ 97.869735] but 
task is already holding lock:
[ 97.871033] ffff88810dfe0be8 (&ctx->uring_lock){+.+.}-{3:3}, at: __x64_sys_io_uring_enter+0x3f0/0x5b0
[ 97.873074]
[ 97.873074] other info that might help us debug this:
[ 97.874520] Possible unsafe locking scenario:
[ 97.874520]
[ 97.875845]       CPU0
[ 97.876440]       ----
[ 97.877048]  lock(&ctx->uring_lock);
[ 97.877961]  lock(&ctx->uring_lock);
[ 97.878881]
[ 97.878881] *** DEADLOCK ***
[ 97.878881]
[ 97.880341] May be due to missing lock nesting notation
[ 97.880341]
[ 97.881952] 1 lock held by a.out/2890:
[ 97.882873] #0: ffff88810dfe0be8 (&ctx->uring_lock){+.+.}-{3:3}, at: __x64_sys_io_uring_enter+0x3f0/0x5b0
[ 97.885108]
[ 97.885108] stack backtrace:
[ 97.890457] Call Trace:
[ 97.891121]  dump_stack+0xac/0xe3
[ 97.891972]  __lock_acquire+0xab6/0x13a0
[ 97.892940]  lock_acquire+0x2c3/0x390
[ 97.894894]  __mutex_lock+0xae/0x9f0
[ 97.901101]  io_wq_submit_work+0x155/0x240
[ 97.902112]  io_wq_cancel_cb+0x162/0x490
[ 97.904126]  io_async_find_and_cancel+0x3b/0x140
[ 97.905247]  io_issue_sqe+0x86d/0x13e0
[ 97.909122]  __io_queue_sqe+0x10b/0x550
[ 97.913971]  io_queue_sqe+0x235/0x470
[ 97.914894]  io_submit_sqes+0xcce/0xf10
[ 97.917872]  __x64_sys_io_uring_enter+0x3fb/0x5b0
[ 97.921424]  do_syscall_64+0x2d/0x40
[ 97.922329]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

While holding uring_lock, e.g. from inline execution, an async cancel
request may attempt cancellations through io_wq_submit_work, which may
try to grab a lock. Delay it to task_work, so we do it from a clean
context and don't have to worry about locking.

Cc:  # 5.5+
Fixes: c07e6719511e ("io_uring: hold uring_lock while completing failed polled io in io_wq_submit_work()")
Reported-by: Abaci
Reported-by: Hao Xu
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe

Conflicts:
[ 5280f7e530f7("io_uring/io-wq: return 2-step work swap scheme")
is not applied. ]

Signed-off-by: Zhihao Cheng
Reviewed-by: Zhang Yi
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/io_uring.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 82b301871224..f3078708fdb8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6135,7 +6135,11 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 	/* if NO_CANCEL is set, we must still run the work */
 	if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
 	    IO_WQ_WORK_CANCEL) {
-		ret = -ECANCELED;
+		/* io-wq is going to take down one */
+		refcount_inc(&req->refs);
+		percpu_ref_get(&req->ctx->refs);
+		io_req_task_work_add_fallback(req, io_req_task_cancel);
+		return io_steal_work(req);
 	}
 
 	if (!ret) {
-- 
Gitee

From 851d3c8e596387cf48e9d62fdfc153cdd7e45e17 Mon Sep 17 00:00:00 2001
From: Wang Wensheng
Date: Mon, 15 Nov 2021 19:50:29 +0800
Subject: [PATCH 083/134] ALSA: timer: Fix use-after-free problem

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://lore.kernel.org/all/3b02dd76-d952-e38e-bc0c-c8a121919720@huawei.com/T/#u

Signed-off-by: Yu Changchun

-------------------------------------------------

When the timer instance was added to the ack_list but was not currently
being processed, the user could stop it via snd_timer_stop1() without
deleting it from the ack_list. The user could then free the timer
instance, and when it was actually processed a use-after-free occurred.

This issue could be reproduced via the snd_timer01 testcase in LTP -
running several instances of that testcase at the same time.
What I actually hit was that the timer's ack_list was corrupted and the
kernel went into an infinite loop with interrupts disabled. That could
be detected by the hardlockup detector on real hardware; on qemu, gdb
can be used to dump the ack_list once the console stops responding.

To fix this issue, we delete the timer instance from the ack_list and
the active_list unconditionally in snd_timer_stop1().

Signed-off-by: Wang Wensheng
Suggested-by: Takashi Iwai
Reviewed-by: Weilong Chen
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 sound/core/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sound/core/timer.c b/sound/core/timer.c
index c15c8314671b..963ddd9e7715 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -624,13 +624,13 @@ static int snd_timer_stop1(struct snd_timer_instance *timeri, bool stop)
 	if (!timer)
 		return -EINVAL;
 	spin_lock_irqsave(&timer->lock, flags);
+	list_del_init(&timeri->ack_list);
+	list_del_init(&timeri->active_list);
 	if (!(timeri->flags & (SNDRV_TIMER_IFLG_RUNNING |
 			       SNDRV_TIMER_IFLG_START))) {
 		result = -EBUSY;
 		goto unlock;
 	}
-	list_del_init(&timeri->ack_list);
-	list_del_init(&timeri->active_list);
 	if (timer->card && timer->card->shutdown)
 		goto unlock;
 	if (stop) {
-- 
Gitee

From 2ac73854cb405bbb631ab4ad0f3acab050494770 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:52:11 +0800
Subject: [PATCH 084/134] blk-cgroup: synchronize blkg creation against policy
 deactivation

mainline inclusion
from mainline
commit 0c9d338c8443b06da8e8d3bfce824c5ea6d3488f
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c9d338c8443b06da8e8d3bfce824c5ea6d3488f

Signed-off-by: Yu Changchun

---------------------------

Our test reports a null pointer dereference:

[ 168.534653] ==================================================================
[ 168.535614] Disabling lock debugging due to kernel taint
[ 168.536346] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 168.537274] #PF: supervisor read access in kernel mode
[ 168.537964] #PF: error_code(0x0000) - not-present page
[ 168.538667] PGD 0 P4D 0
[ 168.539025] Oops: 0000 [#1] PREEMPT SMP KASAN
[ 168.539656] CPU: 13 PID: 759 Comm: bash Tainted: G B 5.15.0-rc2-next-202100
[ 168.540954] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_0738364
[ 168.542736] RIP: 0010:bfq_pd_init+0x88/0x1e0
[ 168.543318] Code: 98 00 00 00 e8 c9 e4 5b ff 4c 8b 65 00 49 8d 7c 24 08 e8 bb e4 5b ff 4d0
[ 168.545803] RSP: 0018:ffff88817095f9c0 EFLAGS: 00010002
[ 168.546497] RAX: 0000000000000001 RBX: ffff888101a1c000 RCX: 0000000000000000
[ 168.547438] RDX: 0000000000000003 RSI: 0000000000000002 RDI: ffff888106553428
[ 168.548402] RBP: ffff888106553400 R08: ffffffff961bcaf4 R09: 0000000000000001
[ 168.549365] R10: ffffffffa2e16c27 R11: fffffbfff45c2d84 R12: 0000000000000000
[ 168.550291] R13: ffff888101a1c098 R14: ffff88810c7a08c8 R15: ffffffffa55541a0
[ 168.551221] FS: 00007fac75227700(0000) GS:ffff88839ba80000(0000) knlGS:0000000000000000
[ 168.552278] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 168.553040] CR2: 0000000000000008 CR3: 0000000165ce7000 CR4: 00000000000006e0
[ 168.554000] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 168.554929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 168.555888] Call Trace:
[ 168.556221]
[ 168.556510]  blkg_create+0x1c0/0x8c0
[ 168.556989]  blkg_conf_prep+0x574/0x650
[ 168.557502]  ? stack_trace_save+0x99/0xd0
[ 168.558033]  ? blkcg_conf_open_bdev+0x1b0/0x1b0
[ 168.558629]  tg_set_conf.constprop.0+0xb9/0x280
[ 168.559231]  ? kasan_set_track+0x29/0x40
[ 168.559758]  ? kasan_set_free_info+0x30/0x60
[ 168.560344]  ? tg_set_limit+0xae0/0xae0
[ 168.560853]  ? do_sys_openat2+0x33b/0x640
[ 168.561383]  ? do_sys_open+0xa2/0x100
[ 168.561877]  ? __x64_sys_open+0x4e/0x60
[ 168.562383]  ? __kasan_check_write+0x20/0x30
[ 168.562951]  ? copyin+0x48/0x70
[ 168.563390]  ? _copy_from_iter+0x234/0x9e0
[ 168.563948]  tg_set_conf_u64+0x17/0x20
[ 168.564467]  cgroup_file_write+0x1ad/0x380
[ 168.565014]  ? cgroup_file_poll+0x80/0x80
[ 168.565568]  ? __mutex_lock_slowpath+0x30/0x30
[ 168.566165]  ? pgd_free+0x100/0x160
[ 168.566649]  kernfs_fop_write_iter+0x21d/0x340
[ 168.567246]  ? cgroup_file_poll+0x80/0x80
[ 168.567796]  new_sync_write+0x29f/0x3c0
[ 168.568314]  ? new_sync_read+0x410/0x410
[ 168.568840]  ? __handle_mm_fault+0x1c97/0x2d80
[ 168.569425]  ? copy_page_range+0x2b10/0x2b10
[ 168.570007]  ? _raw_read_lock_bh+0xa0/0xa0
[ 168.570622]  vfs_write+0x46e/0x630
[ 168.571091]  ksys_write+0xcd/0x1e0
[ 168.571563]  ? __x64_sys_read+0x60/0x60
[ 168.572081]  ? __kasan_check_write+0x20/0x30
[ 168.572659]  ? do_user_addr_fault+0x446/0xff0
[ 168.573264]  __x64_sys_write+0x46/0x60
[ 168.573774]  do_syscall_64+0x35/0x80
[ 168.574264]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[ 168.574960] RIP: 0033:0x7fac74915130
[ 168.575456] Code: 73 01 c3 48 8b 0d 58 ed 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 444
[ 168.577969] RSP: 002b:00007ffc3080e288 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 168.578986] RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007fac74915130
[ 168.579937] RDX: 0000000000000009 RSI: 000056007669f080 RDI: 0000000000000001
[ 168.580884] RBP: 000056007669f080 R08: 000000000000000a R09: 00007fac75227700
[ 168.581841] R10: 000056007655c8f0 R11: 0000000000000246 R12: 0000000000000009
[ 168.582796] R13: 0000000000000001 R14: 00007fac74be55e0 R15: 00007fac74be08c0
[ 168.583757]
[ 168.584063] Modules linked in:
[ 168.584494] CR2: 0000000000000008
[ 168.584964] ---[ end trace 2475611ad0f77a1a ]---

This is because blkg_alloc() is called from blkg_conf_prep() without
holding 'q->queue_lock', and the elevator is exited before
blkg_create():

thread 1                            thread 2
blkg_conf_prep
 spin_lock_irq(&q->queue_lock);
 blkg_lookup_check -> return NULL
 spin_unlock_irq(&q->queue_lock);
 blkg_alloc
  blkcg_policy_enabled -> true
  pd = ->pd_alloc_fn
  blkg->pd[i] = pd
                                    blk_mq_exit_sched
                                     bfq_exit_queue
                                      blkcg_deactivate_policy
                                       spin_lock_irq(&q->queue_lock);
                                       __clear_bit(pol->plid, q->blkcg_pols);
                                       spin_unlock_irq(&q->queue_lock);
                                     q->elevator = NULL;
 spin_lock_irq(&q->queue_lock);
 blkg_create
  if (blkg->pd[i])
   ->pd_init_fn -> q->elevator is NULL
 spin_unlock_irq(&q->queue_lock);

Because blkcg_deactivate_policy() requires the queue to be frozen, we
can grab q_usage_counter to synchronize blkg_conf_prep() against
blkcg_deactivate_policy().

Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Signed-off-by: Yu Kuai
Acked-by: Tejun Heo
Link: https://lore.kernel.org/r/20211020014036.2141723-1-yukuai3@huawei.com
Signed-off-by: Jens Axboe

Conflict:
	block/blk-cgroup.c
	- commit ed6cddefdfd3 ("block: convert the rest of block to
	  bdev_get_queue") is not backported.
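In sketch form, the synchronization the fix relies on (mirroring the
hunk below): a queue reference keeps q_usage_counter elevated, so
blkcg_deactivate_policy(), which needs the queue frozen, cannot run
concurrently with the prep work.

	ret = blk_queue_enter(q, 0);
	if (ret)
		return ret;
	/* ... look up / create the blkg under q->queue_lock ... */
	blk_queue_exit(q);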
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/blk-cgroup.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5b19665bc486..37a5dbd2c4e4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -620,6 +620,14 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 
 	q = disk->queue;
 
+	/*
+	 * blkcg_deactivate_policy() requires queue to be frozen, we can grab
+	 * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
+	 */
+	ret = blk_queue_enter(q, 0);
+	if (ret)
+		return ret;
+
 	rcu_read_lock();
 	spin_lock_irq(&q->queue_lock);
 
@@ -689,6 +697,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		goto success;
 	}
 success:
+	blk_queue_exit(q);
 	ctx->disk = disk;
 	ctx->blkg = blkg;
 	ctx->body = input;
@@ -701,6 +710,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	rcu_read_unlock();
 fail:
 	put_disk_and_module(disk);
+	blk_queue_exit(q);
 	/*
 	 * If queue was bypassing, we should retry.  Do so after a
 	 * short msleep().  It isn't strictly necessary but queue
-- 
Gitee

From 1249d6447661fab8737fc872a500116c622b349a Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:52:12 +0800
Subject: [PATCH 085/134] blk-cgroup: fix missing put device in error path
 from blkg_conf_pref()

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

If blk_queue_enter() fails because the queue is dying,
blkdev_put_no_open() is needed because blkcg_conf_open_bdev()
succeeded.

Fixes: 0c9d338c8443 ("blk-cgroup: synchronize blkg creation against policy deactivation")
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/blk-cgroup.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 37a5dbd2c4e4..40f15807efec 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -626,7 +626,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	 */
 	ret = blk_queue_enter(q, 0);
 	if (ret)
-		return ret;
+		goto fail;
 
 	rcu_read_lock();
 	spin_lock_irq(&q->queue_lock);
@@ -662,13 +662,13 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 			new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
 			if (unlikely(!new_blkg)) {
 				ret = -ENOMEM;
-				goto fail;
+				goto fail_exit_queue;
 			}
 
 			if (radix_tree_preload(GFP_KERNEL)) {
 				blkg_free(new_blkg);
 				ret = -ENOMEM;
-				goto fail;
+				goto fail_exit_queue;
 			}
 
 			rcu_read_lock();
@@ -708,9 +708,10 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 fail_unlock:
 	spin_unlock_irq(&q->queue_lock);
 	rcu_read_unlock();
+fail_exit_queue:
+	blk_queue_exit(q);
 fail:
 	put_disk_and_module(disk);
 	/*
 	 * If queue was bypassing, we should retry.  Do so after a
 	 * short msleep().  It isn't strictly necessary but queue
-- 
Gitee

From 0a75f7dd4ebd75263f33680d7dd6c8451f48757b Mon Sep 17 00:00:00 2001
From: Zhihao Cheng
Date: Mon, 15 Nov 2021 19:52:13 +0800
Subject: [PATCH 086/134] ubifs: rename_whiteout: Fix double free for
 whiteout_ui->data

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

'whiteout_ui->data' will be freed twice if the space budget fails for a
rename whiteout operation, as in the following process:

rename_whiteout
  dev = kmalloc
  whiteout_ui->data = dev
  kfree(whiteout_ui->data)  // Free first time
  iput(whiteout)
    ubifs_free_inode
      kfree(ui->data)       // Double free!

KASAN reports:
==================================================================
BUG: KASAN: double-free or invalid-free in ubifs_free_inode+0x4f/0x70

Call Trace:
  kfree+0x117/0x490
  ubifs_free_inode+0x4f/0x70 [ubifs]
  i_callback+0x30/0x60
  rcu_do_batch+0x366/0xac0
  __do_softirq+0x133/0x57f

Allocated by task 1506:
  kmem_cache_alloc_trace+0x3c2/0x7a0
  do_rename+0x9b7/0x1150 [ubifs]
  ubifs_rename+0x106/0x1f0 [ubifs]
  do_syscall_64+0x35/0x80

Freed by task 1506:
  kfree+0x117/0x490
  do_rename.cold+0x53/0x8a [ubifs]
  ubifs_rename+0x106/0x1f0 [ubifs]
  do_syscall_64+0x35/0x80

The buggy address belongs to the object at ffff88810238bed8 which
belongs to the cache kmalloc-8 of size 8
==================================================================

Let ubifs_free_inode() free 'whiteout_ui->data'. BTW, delete the unused
assignment 'whiteout_ui->data_len = 0'; the process 'ubifs_evict_inode()
-> ubifs_jnl_delete_inode() -> ubifs_jnl_write_inode()' doesn't need it
(because 'inc_nlink(whiteout)' won't be executed by 'goto out_release',
and the nlink of the whiteout inode is 0).

Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT")
Signed-off-by: Zhihao Cheng
Reviewed-by: Zhang Yi
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ubifs/dir.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ad90a3a64293..48dbd91d8b0e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1423,8 +1423,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 		err = ubifs_budget_space(c, &wht_req);
 		if (err) {
-			kfree(whiteout_ui->data);
-			whiteout_ui->data_len = 0;
 			iput(whiteout);
 			goto out_release;
 		}
-- 
Gitee

From 5c176c8ffe591fd09fed83d7cb9a42e8cd14d72d Mon Sep 17 00:00:00 2001
From: Zhihao Cheng
Date: Mon, 15 Nov 2021 19:52:14 +0800
Subject: [PATCH 087/134] ubifs: Fix deadlock in concurrent rename whiteout
 and inode writeback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

The following hung tasks:

[ 77.028764] task:kworker/u8:4 state:D stack: 0 pid: 132
[ 77.028820] Call Trace:
[ 77.029027] schedule+0x8c/0x1b0
[ 77.029067] mutex_lock+0x50/0x60
[ 77.029074] ubifs_write_inode+0x68/0x1f0 [ubifs]
[ 77.029117] __writeback_single_inode+0x43c/0x570
[ 77.029128] writeback_sb_inodes+0x259/0x740
[ 77.029148] wb_writeback+0x107/0x4d0
[ 77.029163] wb_workfn+0x162/0x7b0

[ 92.390442] task:aa state:D stack: 0 pid: 1506
[ 92.390448] Call Trace:
[ 92.390458] schedule+0x8c/0x1b0
[ 92.390461] wb_wait_for_completion+0x82/0xd0
[ 92.390469] __writeback_inodes_sb_nr+0xb2/0x110
[ 92.390472] writeback_inodes_sb_nr+0x14/0x20
[ 92.390476] ubifs_budget_space+0x705/0xdd0 [ubifs]
[ 92.390503] do_rename.cold+0x7f/0x187 [ubifs]
[ 92.390549] ubifs_rename+0x8b/0x180 [ubifs]
[ 92.390571] vfs_rename+0xdb2/0x1170
vfs_rename+0xdb2/0x1170 [ 92.390580] do_renameat2+0x554/0x770, are caused by concurrent rename whiteout and inode writeback processes: rename_whiteout(Thread 1) wb_workfn(Thread2) ubifs_rename do_rename lock_4_inodes (Hold ui_mutex) ubifs_budget_space make_free_space shrink_liability __writeback_inodes_sb_nr bdi_split_work_to_wbs (Queue new wb work) wb_do_writeback(wb work) __writeback_single_inode ubifs_write_inode LOCK(ui_mutex) ↑ wb_wait_for_completion (Wait wb work) <-- deadlock! Reproducer (detailed program in [Link]): 1. SYS_renameat2("/mp/dir/file", "/mp/dir/whiteout", RENAME_WHITEOUT) 2. Consume all free space before the kernel (mdelay) does the budget for the whiteout Fix it by doing the whiteout space budget before locking the ubifs inodes. BTW, it also fixes the wrong goto tag 'out_release' in the whiteout budget error handling path (it should at least recover the dir i_size and unlock the 4 ubifs inodes). Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214733 Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 48dbd91d8b0e..e0b2dbc36517 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1322,6 +1322,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_WHITEOUT) { union ubifs_dev_desc *dev = NULL; + struct ubifs_budget_req wht_req; dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) { @@ -1343,6 +1344,20 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); + + memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); + wht_req.dirtied_ino = 1; + wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8); + /* + * To avoid deadlock between space budget (holds ui_mutex and + * waits wb work) and writeback work(waits ui_mutex), do space + * budget before ubifs inodes locked. + */ + err = ubifs_budget_space(c, &wht_req); + if (err) { + iput(whiteout); + goto out_release; + } } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); @@ -1417,16 +1432,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); if (unlink && IS_SYNC(new_inode)) sync = 1; - } - - if (whiteout) { - struct ubifs_budget_req wht_req = { .dirtied_ino = 1, .dirtied_ino_d = \ ALIGN(ubifs_inode(whiteout)->data_len, 8) }; - - err = ubifs_budget_space(c, &wht_req); - if (err) { - iput(whiteout); - goto out_release; - } - inc_nlink(whiteout); mark_inode_dirty(whiteout); -- Gitee From a7932d34e546c17056949d064c43d1aba34f2477 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:15 +0800 Subject: [PATCH 088/134] ubifs: Fix wrong number of inodes locked by ui_mutex in ubifs_inode comment maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Since 9ec64962afb1702f75b("ubifs: Implement RENAME_EXCHANGE") and 9e0a1fff8db56eaaebb("ubifs: Implement RENAME_WHITEOUT") were applied, ubifs_rename locks and changes 4 ubifs inodes; correct the comment for ui_mutex in ubifs_inode.
Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/ubifs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index daad22263cb2..9a4a3191ed07 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -372,7 +372,7 @@ struct ubifs_gced_idx_leb { * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS * operation. This way UBIFS makes sure the inode fields are consistent. For - * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * example, in 'ubifs_rename()' we change 4 inodes simultaneously, and * write-back must not write any of them before we have finished. * * The second reason is budgeting - UBIFS has to budget all operations. If an -- Gitee From 7b9e731eedbefc98a85c752f411026cfb400ca75 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:16 +0800 Subject: [PATCH 089/134] ubifs: Add missing iput if do_tmpfile() failed in rename whiteout maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- The whiteout inode should be put when do_tmpfile() fails after the inode has been initialized. Otherwise we will get the following warning during umount: UBIFS error (ubi0:0 pid 1494): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth == 0, in fs/ubifs/super.c:1930 VFS: Busy inodes after unmount of ubifs. Self-destruct in 5 seconds. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e0b2dbc36517..9888858bb9e5 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1332,6 +1332,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); if (err) { + if (whiteout) + iput(whiteout); kfree(dev); goto out_release; } -- Gitee From 2c7f9ab9ec94878c05d9ee9a9d37068639b16d39 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:17 +0800 Subject: [PATCH 090/134] ubifs: Rename whiteout atomically maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Currently, rename whiteout has 3 steps: 1. create tmpfile (which associates the old dentry to the tmpfile inode) for whiteout, and store the tmpfile to disk 2. link whiteout, associate the whiteout inode to the old dentry again and store the old dentry, old inode, new dentry on disk 3. writeback the dirty whiteout inode to disk A sudden power-cut or an error (e.g. ENOSPC returned by budget, memory allocation failure) during the above steps may cause various problems: Problem 1: ENOSPC returned by the whiteout space budget (before step 2), the old dentry will disappear after the rename syscall, and the whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, RENAME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, the whiteout inode with 'nlink=1' is not stored on disk, the whiteout dentry (old dentry) is written on disk, and the whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on the whiteout file).
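For reference, the whiteout rename in the sketches above can be driven from user space with a minimal program like the one below (a sketch only; the mount point path and the fallback RENAME_WHITEOUT definition are assumptions, not taken from the original reproducer):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef RENAME_WHITEOUT
	#define RENAME_WHITEOUT (1 << 2)	/* value from include/uapi/linux/fs.h */
	#endif

	int main(void)
	{
		/* assumes a UBIFS mount at /mp with an existing dir/file */
		long ret = syscall(SYS_renameat2, AT_FDCWD, "/mp/dir/file",
				   AT_FDCWD, "/mp/dir/whiteout", RENAME_WHITEOUT);

		if (ret)
			perror("renameat2");
		return ret ? 1 : 0;
	}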
Now, we use the following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Relying on ubifs_jnl_rename() to write the in-mem inode on disk finishes the rename whiteout atomically, which avoids intermediate disk states caused by sudden power-cuts or errors. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 144 +++++++++++++++++++++++++++++---------------- fs/ubifs/journal.c | 52 +++++++++++++--- 2 files changed, 136 insertions(+), 60 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 9888858bb9e5..5d64d829eda1 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -348,8 +348,56 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -static int do_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err; + umode_t mode = S_IFCHR | WHITEOUT_MODE; + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm; + + /* + * Create an inode('nlink = 1') for whiteout without updating journal, + * let ubifs_jnl_rename() store it on flash to complete rename whiteout + * atomically. + */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + return ERR_PTR(err); + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + /* The dir size is updated by do_rename. */ + insert_inode_hash(inode); + + return inode; + +out_inode: + make_bad_inode(inode); + iput(inode); +out_free: + fscrypt_free_filename(&nm); + ubifs_err(c, "cannot create whiteout file, error %d", err); + return ERR_PTR(err); +} + +static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, + umode_t mode) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; @@ -391,25 +439,13 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, } ui = ubifs_inode(inode); - if (whiteout) { - init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); - ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); - } - err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) goto out_inode; mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - - if (whiteout) { - mark_inode_dirty(inode); - drop_nlink(inode); - *whiteout = inode; - } else { - d_tmpfile(dentry, inode); - } + d_tmpfile(dentry, inode); ubifs_assert(c, ui->dirty); instantiated = 1; @@ -440,12 +476,6 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, return err; } -static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode) -{ - return do_tmpfile(dir, dentry, mode, NULL); -} - /** * vfs_dent_type - get VFS directory entry type.
* @type: UBIFS directory entry type @@ -1262,17 +1292,19 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct ubifs_budget_req wht_req; struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; /* - * Budget request settings: deletion direntry, new direntry, removing - * the old inode, and changing old and new parent directory inodes. + * Budget request settings: + * req: deletion direntry, new direntry, removing the old inode, + * and changing old and new parent directory inodes. * - * However, this operation also marks the target inode as dirty and - * does not write it, so we allocate budget for the target inode - * separately. + * wht_req: new whiteout inode for RENAME_WHITEOUT. + * + * ino_req: marks the target inode as dirty and does not write it. */ dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", @@ -1322,7 +1354,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_WHITEOUT) { union ubifs_dev_desc *dev = NULL; - struct ubifs_budget_req wht_req; dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) { @@ -1330,26 +1361,26 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_release; } - err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); - if (err) { - if (whiteout) - iput(whiteout); + /* + * The whiteout inode without dentry is pinned in memory, + * umount won't happen during rename process because we + * got parent dentry. + */ + whiteout = create_whiteout(old_dir, old_dentry); + if (IS_ERR(whiteout)) { + err = PTR_ERR(whiteout); kfree(dev); goto out_release; } - spin_lock(&whiteout->i_lock); - whiteout->i_state |= I_LINKABLE; - spin_unlock(&whiteout->i_lock); - whiteout_ui = ubifs_inode(whiteout); whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); - wht_req.dirtied_ino = 1; - wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8); + wht_req.new_ino = 1; + wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8); /* * To avoid deadlock between space budget (holds ui_mutex and * waits wb work) and writeback work(waits ui_mutex), do space @@ -1357,6 +1388,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, */ err = ubifs_budget_space(c, &wht_req); if (err) { + /* + * Whiteout inode can not be written on flash by + * ubifs_jnl_write_inode(), because it's neither + * dirty nor zero-nlink. + */ iput(whiteout); goto out_release; } @@ -1431,17 +1467,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); if (unlink && IS_SYNC(new_inode)) sync = 1; - } - - if (whiteout) { - inc_nlink(whiteout); - mark_inode_dirty(whiteout); - - spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; - spin_unlock(&whiteout->i_lock); - - iput(whiteout); + /* + * S_SYNC flag of whiteout inherits from the old_dir, and we + * have already checked the old dir inode. So there is no need + * to check whiteout. 
+ */ } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, @@ -1452,6 +1482,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &req); + if (whiteout) { + ubifs_release_budget(c, &wht_req); + iput(whiteout); + } + mutex_lock(&old_inode_ui->ui_mutex); release = old_inode_ui->dirty; mark_inode_dirty_sync(old_inode); @@ -1460,11 +1495,16 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + /* + * Rename finished here. Although old inode cannot be updated + * on flash, old ctime is not a big problem, don't return err + * code to userspace. + */ + old_inode->i_sb->s_op->write_inode(old_inode, NULL); fscrypt_free_filename(&old_nm); fscrypt_free_filename(&new_nm); - return err; + return 0; out_cancel: if (unlink) { @@ -1485,11 +1525,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, inc_nlink(old_dir); } } + unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { - drop_nlink(whiteout); + ubifs_release_budget(c, &wht_req); iput(whiteout); } - unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); out_release: ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 230717384a38..72586512e51f 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1207,9 +1207,9 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the re-name operation which may involve writing up - * to 4 inodes and 2 directory entries. It marks the written inodes as clean - * and returns zero on success. In case of failure, a negative error code is - * returned. + * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes) + * and 2 directory entries. It marks the written inodes as clean and returns + * zero on success. In case of failure, a negative error code is returned. 
*/ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *old_inode, @@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; - int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0; + int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0; int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); - struct ubifs_inode *new_ui; + struct ubifs_inode *new_ui, *whiteout_ui; u8 hash_old_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_inode[UBIFS_HASH_ARR_SZ]; + u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ]; u8 hash_dent1[UBIFS_HASH_ARR_SZ]; u8 hash_dent2[UBIFS_HASH_ARR_SZ]; @@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } else ilen = 0; + if (whiteout) { + whiteout_ui = ubifs_inode(whiteout); + ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex)); + ubifs_assert(c, whiteout->i_nlink == 1); + ubifs_assert(c, !whiteout_ui->dirty); + wlen = UBIFS_INO_NODE_SZ; + wlen += whiteout_ui->data_len; + } else + wlen = 0; + aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); - len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + + ALIGN(wlen, 8) + ALIGN(plen, 8); if (move) len += plen; @@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, p += ALIGN(ilen, 8); } + if (whiteout) { + pack_inode(c, p, whiteout, 0); + err = ubifs_node_calc_hash(c, p, hash_whiteout_inode); + if (err) + goto out_release; + + p += ALIGN(wlen, 8); + } + if (!move) { pack_inode(c, p, old_dir, 1); err = ubifs_node_calc_hash(c, p, hash_old_dir); @@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (new_inode) ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, new_inode->i_ino); + if (whiteout) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + whiteout->i_ino); } release_head(c, BASEHD); @@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm); if (err) goto out_ro; - - ubifs_delete_orphan(c, whiteout->i_ino); } else { err = ubifs_add_dirt(c, lnum, dlen2); if (err) @@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, offs += ALIGN(ilen, 8); } + if (whiteout) { + ino_key_init(c, &key, whiteout->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, wlen, + hash_whiteout_inode); + if (err) + goto out_ro; + offs += ALIGN(wlen, 8); + } + ino_key_init(c, &key, old_dir->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir); if (err) @@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, new_ui->synced_i_size = new_ui->ui_size; spin_unlock(&new_ui->ui_lock); } + /* + * No need to mark whiteout inode clean. + * Whiteout don't have non-zero size, no need to update + * synced_i_size for whiteout_ui. 
+ */ mark_inode_clean(c, ubifs_inode(old_dir)); if (move) mark_inode_clean(c, ubifs_inode(new_dir)); -- Gitee From 50dc54cd74c25120c2011ae0974d7cf388f8df67 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:18 +0800 Subject: [PATCH 091/134] ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- 'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile(), which may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty'; finally, the dirty space is released twice. open(O_TMPFILE) wb_workfn do_tmpfile ubifs_budget_space(ino_req = { .dirtied_ino = 1}) d_tmpfile // mark inode(tmpfile) dirty ubifs_jnl_update // without holding tmpfile's ui_mutex mark_inode_clean(ui) if (ui->dirty) ubifs_release_dirty_inode_budget(ui) // release first time ubifs_write_inode mutex_lock(&ui->ui_mutex) ubifs_release_dirty_inode_budget(ui) // release second time mutex_unlock(&ui->ui_mutex) ui->dirty = 0 Running generic/476 can easily reproduce the following message (see reproducer in [Link]): UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554 UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to read-only mode, error -22 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: ubifs_ro_mode+0x54/0x60 [ubifs] ubifs_assert_failed+0x4b/0x80 [ubifs] ubifs_release_budget+0x468/0x5a0 [ubifs] ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs] ubifs_write_inode+0x121/0x1f0 [ubifs] ... wb_workfn+0x283/0x7b0 Fix it by holding the tmpfile ubifs inode lock during ubifs_jnl_update(). A similar problem exists in whiteout renaming, but the previous fix ("ubifs: Rename whiteout atomically") has solved that one. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765 Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 60 +++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 5d64d829eda1..270cc23cda47 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -396,6 +396,32 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) return ERR_PTR(err); } +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
+ * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -403,7 +429,7 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, struct ubifs_info *c = dir->i_sb->s_fs_info; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; - struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); + struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; @@ -451,18 +477,18 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, instantiated = 1; mutex_unlock(&ui->ui_mutex); - mutex_lock(&dir_ui->ui_mutex); + lock_2_inodes(dir, inode); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); return 0; out_cancel: - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); out_inode: make_bad_inode(inode); if (!instantiated) @@ -689,32 +715,6 @@ static int ubifs_dir_release(struct inode *dir, struct file *file) return 0; } -/** - * lock_2_inodes - a wrapper for locking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - * - * We do not implement any tricks to guarantee strict lock ordering, because - * VFS has already done it for us on the @i_mutex. So this is just a simple - * wrapper function. - */ -static void lock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); -} - -/** - * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - */ -static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_unlock(&ubifs_inode(inode2)->ui_mutex); - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); -} - static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { -- Gitee From c512b1dd7c983d105b8ef3ad3bcab1ecba314bf1 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:19 +0800 Subject: [PATCH 092/134] ubifs: Rectify space amount budget for mkdir/tmpfile operations maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- UBIFS should make sure the flash has enough space to store dirty (data that is newer than disk) data (in memory); the space budget is designed exactly to do that. If the space budget calculates less data than we need, 'make_reservation()' will do more work (returning -ENOSPC if no free space is left; sometimes we can see "cannot reserve xxx bytes in jhead xxx, error -28" in ubifs error messages) with ubifs inodes locked, which may affect other syscalls. A simple way to decide how much space we need when making a budget: see how much space is needed by 'make_reservation()' in the corresponding ubifs_jnl_xxx() function according to the operation. It's better to report ENOSPC in ubifs_budget_space(), as early as we can.
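For mkdir/tmpfile, reading the rule off ubifs_jnl_update() — it writes the new inode, the new direntry and the dirtied parent directory inode — yields the request below (a sketch of the initializer this patch introduces; the fields come from struct ubifs_budget_req):

	struct ubifs_budget_req req = {
		.new_ino = 1,		/* the new inode itself */
		.new_dent = 1,		/* the direntry pointing at it */
		.dirtied_ino = 1,	/* the changed parent directory inode */
	};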
Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Fixes: 1e51764a3c2ac05 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 270cc23cda47..3db1c75fda5b 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -427,15 +427,18 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; /* - * Budget request settings: new dirty inode, new direntry, - * budget for dirtied inode will be released via writeback. + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + * Allocate budget separately for new dirtied inode, the budget will + * be released via writeback. */ dbg_gen("dent '%pd', mode %#hx in dir ino %lu", @@ -977,7 +980,8 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct fscrypt_name nm; /* -- Gitee From d71e5f39a94f557e998f9870294e6c0070884633 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:20 +0800 Subject: [PATCH 093/134] ubifs: setflags: Make dirtied_ino_d 8 bytes aligned maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Make 'ui->data_len' aligned with 8 bytes before it is assigned to dirtied_ino_d. Since 8871d84c8f8b0c6b ("ubifs: convert to fileattr") was applied, 'setflags()' only affects regular files and directories; only the xattr inode, symlink inode and special inode (pipe/char_dev/block_dev) have a non-zero 'ui->data_len' field, so the assertion '!(req->dirtied_ino_d & 7)' cannot fail in ubifs_budget_space(). To avoid assertion failures in future evolution (e.g. if setflags can operate on special inodes), it's better to make dirtied_ino_d 8 bytes aligned; after all, the aligned size is still zero for regular files.
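The effect of the alignment can be checked with a small user-space sketch (the kernel's ALIGN() reduces to the usual power-of-two round-up assumed here):

	#include <assert.h>
	#include <stdio.h>

	#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))	/* power-of-two align */

	int main(void)
	{
		unsigned int lens[] = { 0, 1, 7, 8, 9, 25 };

		for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
			unsigned int d = ALIGN(lens[i], 8U);

			assert(!(d & 7));	/* mirrors '!(req->dirtied_ino_d & 7)' */
			printf("data_len=%2u -> dirtied_ino_d=%2u\n", lens[i], d);
		}
		return 0;
	}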
Fixes: 1e51764a3c2ac05a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 4363d85a3fd4..fb0fc0e6be99 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -107,7 +107,7 @@ static int setflags(struct inode *inode, int flags) struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) -- Gitee From 99cc5a764d3fad6230d7bc3c909adf1afa600e69 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:21 +0800 Subject: [PATCH 094/134] ubifs: Fix read out-of-bounds in ubifs_wbuf_write_nolock() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Function ubifs_wbuf_write_nolock() may access buf out of bounds in the following process: ubifs_wbuf_write_nolock(): aligned_len = ALIGN(len, 8); // Assume len = 4089, aligned_len = 4096 if (aligned_len <= wbuf->avail) ... // Not satisfied if (wbuf->used) { ubifs_leb_write() // Fill some data in avail wbuf len -= wbuf->avail; // len is still not 8-bytes aligned aligned_len -= wbuf->avail; } n = aligned_len >> c->max_write_shift; if (n) { n <<= c->max_write_shift; err = ubifs_leb_write(c, wbuf->lnum, buf + written, wbuf->offs, n); // n > len, read out of bounds less than 8 (n-len) bytes } , which can be caught by KASAN: ========================================================= BUG: KASAN: slab-out-of-bounds in ecc_sw_hamming_calculate+0x1dc/0x7d0 Read of size 4 at addr ffff888105594ff8 by task kworker/u8:4/128 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: kasan_report.cold+0x81/0x165 nand_write_page_swecc+0xa9/0x160 ubifs_leb_write+0xf2/0x1b0 [ubifs] ubifs_wbuf_write_nolock+0x421/0x12c0 [ubifs] write_head+0xdc/0x1c0 [ubifs] ubifs_jnl_write_inode+0x627/0x960 [ubifs] wb_workfn+0x8af/0xb80 Function ubifs_wbuf_write_nolock() accepts that parameter 'len' is not 8 bytes aligned; the 'len' represents the true length of buf (which is allocated in 'ubifs_jnl_xxx', eg. ubifs_jnl_write_inode), so ubifs_wbuf_write_nolock() must handle the length read from 'buf' carefully to write leb safely. Fetch a reproducer in [Link]. Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214785 Reported-by: Chengsong Ke Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/io.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 937bd3fdad40..997450410408 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -833,16 +833,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->max_write_shift; + int m = n - 1; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); + + if (m) { + /* '(n-1) << c->max_write_shift < len' is always true.
*/ + m <<= c->max_write_shift; + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, m); + if (err) + goto out; + wbuf->offs += m; + aligned_len -= m; + len -= m; + written += m; + } + + /* + * The non-written len of buf may be less than 'n' because + * parameter 'len' is not 8 bytes aligned, so here we read + * min(len, n) bytes from buf. + */ + n = 1 << c->max_write_shift; + memcpy(wbuf->buf, buf + written, min(len, n)); + if (n > len) { + ubifs_assert(c, n - len < 8); + ubifs_pad(c, wbuf->buf + len, n - len); + } + + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n); if (err) goto out; wbuf->offs += n; aligned_len -= n; - len -= n; + len -= min(len, n); written += n; } -- Gitee From 4cc4bc1aa03a521e5dc08480586942de0b52c655 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:22 +0800 Subject: [PATCH 095/134] ubifs: Fix to add refcount once page is set private maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- MM defined the rule [1] very clearly that once page was set with PG_private flag, we should increment the refcount in that page, also main flows like pageout(), migrate_page() will assume there is one additional page reference count if page_has_private() returns true. Otherwise, we may get a BUG in page migration: page:0000000080d05b9d refcount:-1 mapcount:0 mapping:000000005f4d82a8 index:0xe2 pfn:0x14c12 aops:ubifs_file_address_operations [ubifs] ino:8f1 dentry name:"f30e" flags: 0x1fffff80002405(locked|uptodate|owner_priv_1|private|node=0| zone=1|lastcpupid=0x1fffff) page dumped because: VM_BUG_ON_PAGE(page_count(page) != 0) ------------[ cut here ]------------ kernel BUG at include/linux/page_ref.h:184! invalid opcode: 0000 [#1] SMP CPU: 3 PID: 38 Comm: kcompactd0 Not tainted 5.15.0-rc5 RIP: 0010:migrate_page_move_mapping+0xac3/0xe70 Call Trace: ubifs_migrate_page+0x22/0xc0 [ubifs] move_to_new_page+0xb4/0x600 migrate_pages+0x1523/0x1cc0 compact_zone+0x8c5/0x14b0 kcompactd+0x2bc/0x560 kthread+0x18c/0x1e0 ret_from_fork+0x1f/0x30 The BUG is caused by following process: PA(cpu 1) PB(cpu 2) ubifs_write_begin page = grab_cache_page_write_begin (refcnf = 3, for page creation process) ubifs_write_end SetPagePrivate(page) unlock_page(page) // refcnt=3 put_page(page) page_ref_dec_and_test lock(page) ... ubifs_migrate_page migrate_page_move_mapping expected_page_refs get 3 (1 + mapping[1] + private[1]) page_ref_freeze // refcnt = 0 atomic_dec_and_test(0 - 1 = -1) page_ref_unfreeze VM_BUG_ON_PAGE(-1 != 0, page) Actually zhangjun has tried to fix this problem [2] by recalculating page refcnt in ubifs_migrate_page(). It's better to follow MM rules [1], because just like Kirill suggested in [2], we need to check all users of page_has_private() helper. Like f2fs does in [3], fix it by adding/deleting refcount when setting/clearing private for a page. BTW, according to [4], we set 'page->private' as 1 because ubifs just simply SetPagePrivate(). And, [5] provided a common helper to set/clear page private, ubifs can use this helper following the example of iomap, afs, btrfs, etc. Jump [6] to find a reproducer. 
[1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com [2] https://www.spinics.net/lists/linux-mtd/msg04018.html [3] http://lkml.iu.edu/hypermail/linux/kernel/1903.0/03313.html [4] https://lore.kernel.org/linux-f2fs-devel/20210422154705.GO3596236@casper.infradead.org [5] https://lore.kernel.org/all/20200517214718.468-1-guoqing.jiang@cloud.ionos.com [6] https://bugzilla.kernel.org/show_bug.cgi?id=214961 Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index d44c8e14810c..df46f2d3ff8b 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } if (!PagePrivate(page)) { - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } @@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len) release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); kunmap(page); @@ -1303,7 +1303,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); } @@ -1470,8 +1470,8 @@ static int ubifs_migrate_page(struct address_space *mapping, return rc; if (PagePrivate(page)) { - ClearPagePrivate(page); - SetPagePrivate(newpage); + detach_page_private(page); + attach_page_private(newpage, (void *)1); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -1495,7 +1495,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) return 0; ubifs_assert(c, PagePrivate(page)); ubifs_assert(c, 0); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); return 1; } @@ -1566,7 +1566,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) else { if (!PageChecked(page)) ubifs_convert_page_budget(c); - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } -- Gitee From 71cb569c7a438305161e0ac6b6967d410bbb7aad Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:23 +0800 Subject: [PATCH 096/134] ubi: fastmap: Return error code if memory allocation fails in add_aeb() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Abort fastmap scanning and return error code if memory allocation fails in add_aeb(). Otherwise ubi will get wrong peb statistics information after scanning. 
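The shape of the fix is the usual propagate-don't-ignore pattern; a stand-alone sketch with toy types (not UBI's real structures) shows why swallowing the failure skews the statistics:

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	static int add_entry(int **slot, int value)
	{
		*slot = malloc(sizeof(**slot));	/* stands in for add_aeb() */
		if (!*slot)
			return -ENOMEM;
		**slot = value;
		return 0;
	}

	int main(void)
	{
		int *entries[3] = { NULL };
		int found = 0;

		for (int i = 0; i < 3; i++) {
			int err = add_entry(&entries[i], i);
			if (err) {
				/* was: silently continue, leaving 'found' wrong */
				fprintf(stderr, "scan aborted: %d\n", err);
				return 1;
			}
			found++;
		}
		printf("found %d entries\n", found);
		for (int i = 0; i < 3; i++)
			free(entries[i]);
		return 0;
	}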
Fixes: dbb7d2a88d2a7b ("UBI: Add fastmap core") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/mtd/ubi/fastmap.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index 022af59906aa..6b5f1ffd961b 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -468,7 +468,9 @@ static int scan_pool(struct ubi_device *ubi, struct ubi_attach_info *ai, if (err == UBI_IO_FF_BITFLIPS) scrub = 1; - add_aeb(ai, free, pnum, ec, scrub); + ret = add_aeb(ai, free, pnum, ec, scrub); + if (ret) + goto out; continue; } else if (err == 0 || err == UBI_IO_BITFLIPS) { dbg_bld("Found non empty PEB:%i in pool", pnum); @@ -638,8 +640,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &ai->free, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 0); + ret = add_aeb(ai, &ai->free, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 0); + if (ret) + goto fail; } /* read EC values from used list */ @@ -649,8 +653,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &used, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 0); + ret = add_aeb(ai, &used, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 0); + if (ret) + goto fail; } /* read EC values from scrub list */ @@ -660,8 +666,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &used, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 1); + ret = add_aeb(ai, &used, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 1); + if (ret) + goto fail; } /* read EC values from erase list */ @@ -671,8 +679,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &ai->erase, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 1); + ret = add_aeb(ai, &ai->erase, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 1); + if (ret) + goto fail; } ai->mean_ec = div_u64(ai->ec_sum, ai->ec_count); -- Gitee From 83919316f903d4c261169abe19dd5e1cce0e0b3a Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:24 +0800 Subject: [PATCH 097/134] ubi: fastmap: Add all fastmap pebs into 'ai->fastmap' when fm->used_blocks>=2 maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Fastmap pebs(pnum >= UBI_FM_MAX_START) won't be added into 'ai->fastmap' while attaching ubi device if 'fm->used_blocks' is greater than 2, which may cause warning from 'ubi_assert(ubi->good_peb_count == found_pebs)': UBI assert failed in ubi_wl_init at 1878 (pid 2409) Call Trace: ubi_wl_init.cold+0xae/0x2af [ubi] ubi_attach+0x1b0/0x780 [ubi] ubi_init+0x23a/0x3ad [ubi] load_module+0x22d2/0x2430 Reproduce: ID="0x20,0x33,0x00,0x00" # 16M 16KB PEB, 512 page modprobe nandsim id_bytes=$ID modprobe ubi mtd="0,0" fm_autoconvert # Fastmap takes 2 pebs rmmod ubi modprobe ubi mtd="0,0" fm_autoconvert # Attach by fastmap Add all fastmap pebs into list 'ai->fastmap' to make sure they can be counted into 'found_pebs'. 
Fixes: fdf10ed710c0aa ("ubi: Rework Fastmap attach base code") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/mtd/ubi/fastmap.c | 41 ++++++++++++++------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index 6b5f1ffd961b..88fdf8f5709f 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -828,24 +828,6 @@ static int find_fm_anchor(struct ubi_attach_info *ai) return ret; } -static struct ubi_ainf_peb *clone_aeb(struct ubi_attach_info *ai, - struct ubi_ainf_peb *old) -{ - struct ubi_ainf_peb *new; - - new = ubi_alloc_aeb(ai, old->pnum, old->ec); - if (!new) - return NULL; - - new->vol_id = old->vol_id; - new->sqnum = old->sqnum; - new->lnum = old->lnum; - new->scrub = old->scrub; - new->copy_flag = old->copy_flag; - - return new; -} - /** * ubi_scan_fastmap - scan the fastmap. * @ubi: UBI device object @@ -875,15 +857,11 @@ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, if (fm_anchor < 0) return UBI_NO_FASTMAP; - /* Copy all (possible) fastmap blocks into our new attach structure. */ + /* Add fastmap blocks(pnum < UBI_FM_MAX_START) into attach structure. */ list_for_each_entry(aeb, &scan_ai->fastmap, u.list) { - struct ubi_ainf_peb *new; - - new = clone_aeb(ai, aeb); - if (!new) - return -ENOMEM; - - list_add(&new->u.list, &ai->fastmap); + ret = add_aeb(ai, &ai->fastmap, aeb->pnum, aeb->ec, 0); + if (ret) + return ret; } down_write(&ubi->fm_protect); @@ -1029,6 +1007,17 @@ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, "err: %i)", i, pnum, ret); goto free_hdr; } + + /* + * Add left fastmap blocks (pnum >= UBI_FM_MAX_START) into + * attach structure. + */ + if (pnum >= UBI_FM_MAX_START) { + ret = add_aeb(ai, &ai->fastmap, pnum, + be64_to_cpu(ech->ec), 0); + if (ret) + goto free_hdr; + } } kfree(fmsb); -- Gitee From 1423dc67e94e9ba71a3120738c171baaa720d7fd Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 15 Nov 2021 19:52:26 +0800 Subject: [PATCH 098/134] ubi: fix race condition between ctrl_cdev_ioctl and ubi_cdev_ioctl maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Hulk Robot reported a KASAN report about use-after-free: ================================================================== BUG: KASAN: use-after-free in __list_del_entry_valid+0x13d/0x160 Read of size 8 at addr ffff888035e37d98 by task ubiattach/1385 [...] Call Trace: klist_dec_and_del+0xa7/0x4a0 klist_put+0xc7/0x1a0 device_del+0x4d4/0xed0 cdev_device_del+0x1a/0x80 ubi_attach_mtd_dev+0x2951/0x34b0 [ubi] ctrl_cdev_ioctl+0x286/0x2f0 [ubi] Allocated by task 1414: device_add+0x60a/0x18b0 cdev_device_add+0x103/0x170 ubi_create_volume+0x1118/0x1a10 [ubi] ubi_cdev_ioctl+0xb7f/0x1ba0 [ubi] Freed by task 1385: cdev_device_del+0x1a/0x80 ubi_remove_volume+0x438/0x6c0 [ubi] ubi_cdev_ioctl+0xbf4/0x1ba0 [ubi] [...] ================================================================== The lock held by ctrl_cdev_ioctl is ubi_devices_mutex, but the lock held by ubi_cdev_ioctl is ubi->device_mutex. Therefore, the two locks can be concurrent. ctrl_cdev_ioctl contains two operations: ubi_attach and ubi_detach. ubi_detach is bug-free because it uses reference counting to prevent concurrency. However, uif_init and uif_close in ubi_attach may race with ubi_cdev_ioctl. uif_init will race with ubi_cdev_ioctl as in the following stack. 
cpu1 cpu2 cpu3 _______________________|________________________|______________________ ctrl_cdev_ioctl ubi_attach_mtd_dev uif_init ubi_cdev_ioctl ubi_create_volume cdev_device_add ubi_add_volume // sysfs exist kill_volumes ubi_cdev_ioctl ubi_remove_volume cdev_device_del // first free ubi_free_volume cdev_del // double free cdev_device_del And uif_close will race with ubi_cdev_ioctl as in the following stack. cpu1 cpu2 cpu3 _______________________|________________________|______________________ ctrl_cdev_ioctl ubi_attach_mtd_dev uif_init ubi_cdev_ioctl ubi_create_volume cdev_device_add ubi_debugfs_init_dev //error goto out_uif; uif_close kill_volumes ubi_cdev_ioctl ubi_remove_volume cdev_device_del // first free ubi_free_volume // double free The cause of this problem is that commit 714fb87e8bc0 makes the device "available" before it becomes accessible via sysfs. Therefore, we roll back that modification. We will fix the race condition between ubi device creation and udev by removing ubi_get_device in vol_attribute_show and dev_attribute_show. This avoids accessing uninitialized ubi_devices[ubi_num]. ubi_get_device is used to prevent devices from being deleted during sysfs execution. However, kernfs now ensures that devices will not be deleted before all reference counts are released. The key process is shown in the following stack. device_del device_remove_attrs device_remove_groups sysfs_remove_groups sysfs_remove_group remove_files kernfs_remove_by_name kernfs_remove_by_name_ns __kernfs_remove kernfs_drain Fixes: 714fb87e8bc0 ("ubi: Fix race condition between ubi device creation and udev") Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Hou Tao Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/mtd/ubi/build.c | 9 +-------- drivers/mtd/ubi/vmt.c | 8 +------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index e85b04e9716b..4153e0d15c5f 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -350,9 +350,6 @@ static ssize_t dev_attribute_show(struct device *dev, * we still can use 'ubi->ubi_num'.
*/ ubi = container_of(dev, struct ubi_device, dev); - ubi = ubi_get_device(ubi->ubi_num); - if (!ubi) - return -ENODEV; if (attr == &dev_eraseblock_size) ret = sprintf(buf, "%d\n", ubi->leb_size); @@ -381,7 +378,6 @@ static ssize_t dev_attribute_show(struct device *dev, else ret = -EINVAL; - ubi_put_device(ubi); return ret; } @@ -980,9 +976,6 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, goto out_detach; } - /* Make device "available" before it becomes accessible via sysfs */ - ubi_devices[ubi_num] = ubi; - err = uif_init(ubi); if (err) goto out_detach; @@ -1027,6 +1020,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); + ubi_devices[ubi_num] = ubi; ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; @@ -1035,7 +1029,6 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, out_uif: uif_close(ubi); out_detach: - ubi_devices[ubi_num] = NULL; ubi_wl_close(ubi); ubi_free_all_volumes(ubi); vfree(ubi->vtbl); diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 139ee132bfbc..1bc7b3a05604 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -56,16 +56,11 @@ static ssize_t vol_attribute_show(struct device *dev, { int ret; struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev); - struct ubi_device *ubi; - - ubi = ubi_get_device(vol->ubi->ubi_num); - if (!ubi) - return -ENODEV; + struct ubi_device *ubi = vol->ubi; spin_lock(&ubi->volumes_lock); if (!ubi->volumes[vol->vol_id]) { spin_unlock(&ubi->volumes_lock); - ubi_put_device(ubi); return -ENODEV; } /* Take a reference to prevent volume removal */ @@ -103,7 +98,6 @@ static ssize_t vol_attribute_show(struct device *dev, vol->ref_count -= 1; ubi_assert(vol->ref_count >= 0); spin_unlock(&ubi->volumes_lock); - ubi_put_device(ubi); return ret; } -- Gitee From b41bd3d6257214d31b5cb2defed92679b58f29ec Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 15 Nov 2021 19:52:27 +0800 Subject: [PATCH 099/134] regulator: core: do not continue if selector match mainline inclusion from mainline-v5.11-rc1 commit ab97800e088acf34d0014845ed93605dd5c1ea2a category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- Do not continue if selector has already been located. 
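Besides skipping useless iterations, the early break also keeps an unreadable voltage at a higher index from turning an already-successful match into -EINVAL. A stand-alone sketch of the fixed loop shape (toy voltage table, not the regulator API):

	#include <errno.h>
	#include <stdio.h>

	/* returns 0 and fills *old_sel/*new_sel, or -EINVAL on a bad entry */
	static int find_selectors(const int *uV, int n, int old_uV, int new_uV,
				  int *old_sel, int *new_sel)
	{
		*old_sel = -1;
		*new_sel = -1;

		for (int i = 0; i < n; i++) {
			if (*old_sel >= 0 && *new_sel >= 0)
				break;		/* both located, stop scanning */
			if (uV[i] < 0)
				return -EINVAL;	/* unreadable voltage */
			if (uV[i] == old_uV)
				*old_sel = i;
			if (uV[i] == new_uV)
				*new_sel = i;
		}
		return 0;
	}

	int main(void)
	{
		const int table[] = { 1800000, 3300000, -1 };	/* last entry bad */
		int o, n;
		int err = find_selectors(table, 3, 1800000, 3300000, &o, &n);

		/* with the break: err=0; without it, the bad entry wins */
		printf("err=%d old_sel=%d new_sel=%d\n", err, o, n);
		return 0;
	}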
Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/1605290164-11556-1-git-send-email-claudiu.beznea@microchip.com Signed-off-by: Mark Brown Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/regulator/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 043b5f63b94a..eead157609e5 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -4058,6 +4058,10 @@ int regulator_set_voltage_time(struct regulator *regulator, for (i = 0; i < rdev->desc->n_voltages; i++) { /* We only look for exact voltage matches here */ + + if (old_sel >= 0 && new_sel >= 0) + break; + voltage = regulator_list_voltage(regulator, i); if (voltage < 0) return -EINVAL; -- Gitee From 0a61e109955311a3f6cd6da224cbfa993a5b842c Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 15 Nov 2021 19:52:28 +0800 Subject: [PATCH 100/134] regulator: core: Respect off_on_delay at startup mainline inclusion from mainline-v5.13-rc1 commit a5ccccb3ec0b052804d03df90c0d08689be54170 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- We currently do not respect off_on_delay the first time we turn on a regulator. This is problematic since the regulator could have been turned off by the bootloader, or it could it have been turned off during the probe of the regulator driver (such as when regulator-fixed requests the enable GPIO), either of which could potentially have happened less than off_on_delay microseconds ago before the first time a client requests for the regulator to be turned on. We can't know exactly when the regulator was turned off, but initialise off_on_delay to the current time when registering the regulator, so that we guarantee that we respect the off_on_delay in all cases. Signed-off-by: Vincent Whitchurch Link: https://lore.kernel.org/r/20210422083044.11479-1-vincent.whitchurch@axis.com Signed-off-by: Mark Brown Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/regulator/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index eead157609e5..a6d27334a71d 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1445,6 +1445,8 @@ static int set_machine_constraints(struct regulator_dev *rdev) if (rdev->constraints->always_on) rdev->use_count++; + } else if (rdev->desc->off_on_delay) { + rdev->last_off_jiffy = jiffies; } print_constraints(rdev); -- Gitee From e05e08ee3100ba69d005446779f2cef261f6b0d4 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 15 Nov 2021 19:52:29 +0800 Subject: [PATCH 101/134] iio: core: Allow drivers to specify a label without it coming from of mainline inclusion from mainline-v5.13-rc1 commit 6f71bf1991b6f04dc87a4f5b9d6823535f51a50d category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- Only set indio_dev->label from of/dt if there actually is a label specified in of. This allows drivers to set a label without this being overwritten with NULL when there is no label specified in of. This is esp. useful on devices where of is not used at all, such as your typical x86/ACPI device. 
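With this change, a driver-provided default label survives registration whenever the firmware node carries no "label" property; a hypothetical driver fragment (device and label names are made up, not from this patch):

	indio_dev->name = "mysensor";
	indio_dev->label = "proximity-front";	/* driver default */
	/* a "label" property, if present, still overrides this */
	ret = devm_iio_device_register(dev, indio_dev);
	if (ret)
		return ret;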
Signed-off-by: Hans de Goede Reviewed-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20210207160901.110643-2-hdegoede@redhat.com Signed-off-by: Jonathan Cameron Signed-off-by: tanghui Reviewed-by: Zhang Qiao Reviewed-by: Chen Hui Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/iio/industrialio-core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 261d3b17edc9..ea98aad9fb81 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1720,6 +1720,7 @@ static const struct iio_buffer_setup_ops noop_ring_setup_ops; int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod) { + const char *label; int ret; if (!indio_dev->info) @@ -1730,8 +1731,9 @@ int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod) if (!indio_dev->dev.of_node && indio_dev->dev.parent) indio_dev->dev.of_node = indio_dev->dev.parent->of_node; - indio_dev->label = of_get_property(indio_dev->dev.of_node, "label", - NULL); + label = of_get_property(indio_dev->dev.of_node, "label", NULL); + if (label) + indio_dev->label = label; ret = iio_check_unique_scan_index(indio_dev); if (ret < 0) -- Gitee From d5f3427bbd432b2a985c0e03bc103a7edd282aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Mon, 15 Nov 2021 19:52:30 +0800 Subject: [PATCH 102/134] iio: buffer: Return error if no callback is given MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.11-rc1 commit 6d74a3ee1ee1c7b62de656c26d370448ed5885c3 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- Return error in case no callback is provided to `iio_channel_get_all_cb()`. There's no point in setting up a buffer-cb if no callback is provided. 
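Callers now get a fast, explicit failure instead of a half-initialized buffer; a hypothetical caller fragment (variable names are made up):

	struct iio_cb_buffer *cb_buff;

	cb_buff = iio_channel_get_all_cb(dev, NULL, priv);	/* no callback */
	if (IS_ERR(cb_buff))
		return PTR_ERR(cb_buff);	/* now -EINVAL, caught early */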
Signed-off-by: Nuno Sá Reviewed-by: Olivier Moysan Link: https://lore.kernel.org/r/20201121161457.957-3-nuno.sa@analog.com Signed-off-by: Jonathan Cameron Signed-off-by: tanghui Reviewed-by: Zhang Qiao Reviewed-by: Chen Hui Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/iio/buffer/industrialio-buffer-cb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iio/buffer/industrialio-buffer-cb.c b/drivers/iio/buffer/industrialio-buffer-cb.c index 47c96f7f4976..4c12b7a94af5 100644 --- a/drivers/iio/buffer/industrialio-buffer-cb.c +++ b/drivers/iio/buffer/industrialio-buffer-cb.c @@ -54,6 +54,11 @@ struct iio_cb_buffer *iio_channel_get_all_cb(struct device *dev, struct iio_cb_buffer *cb_buff; struct iio_channel *chan; + if (!cb) { + dev_err(dev, "Invalid arguments: A callback must be provided!\n"); + return ERR_PTR(-EINVAL); + } + cb_buff = kzalloc(sizeof(*cb_buff), GFP_KERNEL); if (cb_buff == NULL) return ERR_PTR(-ENOMEM); -- Gitee From b4ee88f06395e3cb1d6c14593a162fcfdb9c7929 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 15 Nov 2021 19:53:30 +0800 Subject: [PATCH 103/134] blk-mq: don't free tags if the tag_set is used by other device in queue initialztion mainline inclusion from mainline-v5.16 commit a846a8e6c9a5949582c5a6a8bbc83a7d27fd891e category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a846a8e6c9a5949582c5a6a8bbc83a7d27fd891e Signed-off-by: Yu Changchun ----------------------------------------------- We got UAF report on v5.10 as follows: [ 1446.674930] ================================================================== [ 1446.675970] BUG: KASAN: use-after-free in blk_mq_get_driver_tag+0x9a4/0xa90 [ 1446.676902] Read of size 8 at addr ffff8880185afd10 by task kworker/1:2/12348 [ 1446.677851] [ 1446.678073] CPU: 1 PID: 12348 Comm: kworker/1:2 Not tainted 5.10.0-10177-gc9c81b1e346a #2 [ 1446.679168] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1446.680692] Workqueue: kthrotld blk_throtl_dispatch_work_fn [ 1446.681448] Call Trace: [ 1446.681800] dump_stack+0x9b/0xce [ 1446.682916] print_address_description.constprop.6+0x3e/0x60 [ 1446.685999] kasan_report.cold.9+0x22/0x3a [ 1446.687186] blk_mq_get_driver_tag+0x9a4/0xa90 [ 1446.687785] blk_mq_dispatch_rq_list+0x21a/0x1d40 [ 1446.692576] __blk_mq_do_dispatch_sched+0x394/0x830 [ 1446.695758] __blk_mq_sched_dispatch_requests+0x398/0x4f0 [ 1446.698279] blk_mq_sched_dispatch_requests+0xdf/0x140 [ 1446.698967] __blk_mq_run_hw_queue+0xc0/0x270 [ 1446.699561] __blk_mq_delay_run_hw_queue+0x4cc/0x550 [ 1446.701407] blk_mq_run_hw_queue+0x13b/0x2b0 [ 1446.702593] blk_mq_sched_insert_requests+0x1de/0x390 [ 1446.703309] blk_mq_flush_plug_list+0x4b4/0x760 [ 1446.705408] blk_flush_plug_list+0x2c5/0x480 [ 1446.708471] blk_finish_plug+0x55/0xa0 [ 1446.708980] blk_throtl_dispatch_work_fn+0x23b/0x2e0 [ 1446.711236] process_one_work+0x6d4/0xfe0 [ 1446.711778] worker_thread+0x91/0xc80 [ 1446.713400] kthread+0x32d/0x3f0 [ 1446.714362] ret_from_fork+0x1f/0x30 [ 1446.714846] [ 1446.715062] Allocated by task 1: [ 1446.715509] kasan_save_stack+0x19/0x40 [ 1446.716026] __kasan_kmalloc.constprop.1+0xc1/0xd0 [ 1446.716673] blk_mq_init_tags+0x6d/0x330 [ 1446.717207] blk_mq_alloc_rq_map+0x50/0x1c0 [ 1446.717769] __blk_mq_alloc_map_and_request+0xe5/0x320 [ 1446.718459] blk_mq_alloc_tag_set+0x679/0xdc0 [ 1446.719050] scsi_add_host_with_dma.cold.3+0xa0/0x5db [ 
1446.719736] virtscsi_probe+0x7bf/0xbd0 [ 1446.720265] virtio_dev_probe+0x402/0x6c0 [ 1446.720808] really_probe+0x276/0xde0 [ 1446.721320] driver_probe_device+0x267/0x3d0 [ 1446.721892] device_driver_attach+0xfe/0x140 [ 1446.722491] __driver_attach+0x13a/0x2c0 [ 1446.723037] bus_for_each_dev+0x146/0x1c0 [ 1446.723603] bus_add_driver+0x3fc/0x680 [ 1446.724145] driver_register+0x1c0/0x400 [ 1446.724693] init+0xa2/0xe8 [ 1446.725091] do_one_initcall+0x9e/0x310 [ 1446.725626] kernel_init_freeable+0xc56/0xcb9 [ 1446.726231] kernel_init+0x11/0x198 [ 1446.726714] ret_from_fork+0x1f/0x30 [ 1446.727212] [ 1446.727433] Freed by task 26992: [ 1446.727882] kasan_save_stack+0x19/0x40 [ 1446.728420] kasan_set_track+0x1c/0x30 [ 1446.728943] kasan_set_free_info+0x1b/0x30 [ 1446.729517] __kasan_slab_free+0x111/0x160 [ 1446.730084] kfree+0xb8/0x520 [ 1446.730507] blk_mq_free_map_and_requests+0x10b/0x1b0 [ 1446.731206] blk_mq_realloc_hw_ctxs+0x8cb/0x15b0 [ 1446.731844] blk_mq_init_allocated_queue+0x374/0x1380 [ 1446.732540] blk_mq_init_queue_data+0x7f/0xd0 [ 1446.733155] scsi_mq_alloc_queue+0x45/0x170 [ 1446.733730] scsi_alloc_sdev+0x73c/0xb20 [ 1446.734281] scsi_probe_and_add_lun+0x9a6/0x2d90 [ 1446.734916] __scsi_scan_target+0x208/0xc50 [ 1446.735500] scsi_scan_channel.part.3+0x113/0x170 [ 1446.736149] scsi_scan_host_selected+0x25a/0x360 [ 1446.736783] store_scan+0x290/0x2d0 [ 1446.737275] dev_attr_store+0x55/0x80 [ 1446.737782] sysfs_kf_write+0x132/0x190 [ 1446.738313] kernfs_fop_write_iter+0x319/0x4b0 [ 1446.738921] new_sync_write+0x40e/0x5c0 [ 1446.739429] vfs_write+0x519/0x720 [ 1446.739877] ksys_write+0xf8/0x1f0 [ 1446.740332] do_syscall_64+0x2d/0x40 [ 1446.740802] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1446.741462] [ 1446.741670] The buggy address belongs to the object at ffff8880185afd00 [ 1446.741670] which belongs to the cache kmalloc-256 of size 256 [ 1446.743276] The buggy address is located 16 bytes inside of [ 1446.743276] 256-byte region [ffff8880185afd00, ffff8880185afe00) [ 1446.744765] The buggy address belongs to the page: [ 1446.745416] page:ffffea0000616b00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x185ac [ 1446.746694] head:ffffea0000616b00 order:2 compound_mapcount:0 compound_pincount:0 [ 1446.747719] flags: 0x1fffff80010200(slab|head) [ 1446.748337] raw: 001fffff80010200 ffffea00006a3208 ffffea000061bf08 ffff88801004f240 [ 1446.749404] raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 [ 1446.750455] page dumped because: kasan: bad access detected [ 1446.751227] [ 1446.751445] Memory state around the buggy address: [ 1446.752102] ffff8880185afc00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.753090] ffff8880185afc80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.754079] >ffff8880185afd00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1446.755065] ^ [ 1446.755589] ffff8880185afd80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1446.756574] ffff8880185afe00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.757566] ================================================================== Flag 'BLK_MQ_F_TAG_QUEUE_SHARED' will be set if the second device on the same host initializes it's queue successfully. 
However, if the second device fails to allocate memory in blk_mq_alloc_and_init_hctx(), called from blk_mq_realloc_hw_ctxs() from blk_mq_init_allocated_queue(), then __blk_mq_free_map_and_rqs() will be called on the error path, and if 'BLK_MQ_F_TAG_HCTX_SHARED' is not set, 'tag_set->tags' will be freed while it's still used by the first device.

To fix this issue, move the release of newly allocated hardware contexts from blk_mq_realloc_hw_ctxs() to __blk_mq_update_nr_hw_queues(), since there is no need to release hardware contexts in blk_mq_init_allocated_queue().

Fixes: 868f2f0b7206 ("blk-mq: dynamic h/w context count")
Signed-off-by: Ye Bin
Signed-off-by: Yu Kuai
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20211108074019.1058843-1-yebin10@huawei.com
Signed-off-by: Jens Axboe

conflicts: block/blk-mq.c

Signed-off-by: Ye Bin
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/blk-mq.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b0e7ec85a41c..70301cc6b18d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3239,8 +3239,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx = hctxs[j];
 
 		if (hctx) {
-			if (hctx->tags)
-				blk_mq_free_map_and_requests(set, j);
 			blk_mq_exit_hctx(q, set, hctx, j);
 			hctxs[j] = NULL;
 		}
@@ -3726,8 +3724,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_realloc_hw_ctxs(set, q);
 		if (q->nr_hw_queues != set->nr_hw_queues) {
+			int i = prev_nr_hw_queues;
+
 			pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
 					nr_hw_queues, prev_nr_hw_queues);
+			for (; i < set->nr_hw_queues; i++)
+				blk_mq_free_map_and_requests(set, i);
+
 			set->nr_hw_queues = prev_nr_hw_queues;
 			blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 			goto fallback;
-- 
Gitee

From 9b4f9af2848894f11d76a5f7d2b7b9378a247c33 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Mon, 15 Nov 2021 19:53:31 +0800
Subject: [PATCH 104/134] ARM: 9057/1: cache-v7: add missing ISB after cache level selection

mainline inclusion
from mainline-v5.15-rc2
commit c0e50736e826b51ddc437e6cf0dc68f07e4ad16b
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0e50736e826b51ddc437e6cf0dc68f07e4ad16b

Signed-off-by: Yu Changchun

-----------------------------

A write to CSSELR needs to complete before its results can be observed via CCSIDR. So add an ISB to ensure that this is the case.

Acked-by: Nicolas Pitre
Signed-off-by: Ard Biesheuvel
Signed-off-by: Russell King
Signed-off-by: He Ying
Reviewed-by: Liao Chang
Reviewed-by: Li Wei
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 arch/arm/mm/cache-v7.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index dc8f152f3556..307f381eee71 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -38,9 +38,10 @@ icache_size:
 * procedures.
*/ ENTRY(v7_invalidate_l1) - mov r0, #0 - mcr p15, 2, r0, c0, c0, 0 - mrc p15, 1, r0, c0, c0, 0 + mov r0, #0 + mcr p15, 2, r0, c0, c0, 0 @ select L1 data cache in CSSELR + isb + mrc p15, 1, r0, c0, c0, 0 @ read cache geometry from CCSIDR movw r1, #0x7fff and r2, r1, r0, lsr #13 -- Gitee From c7cf208e5ef89118d70f3f2ea0cdbb84eef8ee99 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 15 Nov 2021 19:53:32 +0800 Subject: [PATCH 105/134] nvme: add APIs for stopping/starting admin queue mainline inclusion from mainline-v5.16 commit a277654bafb51fb8b4cf23550f15926bb02536f4 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a277654bafb51fb8b4cf23550f15926bb02536f4 Signed-off-by: Yu Changchun --------------------------- Add two APIs for stopping and starting admin queue. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211014081710.1871747-2-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/nvme/host/core.c | 12 ++++++++++++ drivers/nvme/host/nvme.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 99b5152482fe..509a44ff0ce5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4634,6 +4634,18 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) +{ + blk_mq_quiesce_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); + +void nvme_start_admin_queue(struct nvme_ctrl *ctrl) +{ + blk_mq_unquiesce_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_start_admin_queue); + void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5dd1dd8021ba..0cad04335e34 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -647,6 +647,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); +void nvme_start_admin_queue(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); -- Gitee From 3f26ab0b99bc2d569db4e4fbc33a7bedafb768f4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 15 Nov 2021 19:53:33 +0800 Subject: [PATCH 106/134] nvme: apply nvme API to quiesce/unquiesce admin queue mainline inclusion from mainline-v5.16 commit 6ca1d9027e0d9ce5604a3e28de89456a76138034 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6ca1d9027e0d9ce5604a3e28de89456a76138034 Signed-off-by: Yu Changchun --------------------------- Apply the added two APIs to quiesce/unquiesce admin queue. 
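The conversion done by this patch is mechanical; a representative before/after sketch of one call site follows (an editor's illustration, the real call sites are in the diffs below):

	/* before: each driver open-codes the block layer calls */
	blk_mq_quiesce_queue(ctrl->admin_q);
	/* ... */
	blk_mq_unquiesce_queue(ctrl->admin_q);

	/* after: one choke point, which a later patch in this series
	 * extends with the NVME_CTRL_ADMIN_Q_STOPPED flag so that the
	 * two calls always pair up */
	nvme_stop_admin_queue(ctrl);
	/* ... */
	nvme_start_admin_queue(ctrl);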
Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211014081710.1871747-3-ming.lei@redhat.com Signed-off-by: Jens Axboe Conflict: commit f21c4769d0de ("nvme: rename nvme_init_identify()") is not backported - drivers/nvme/host/rdma.c - drivers/nvme/host/tcp.c - drivers/nvme/target/loop.c - drivers/nvme/host/fc.c Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 8 ++++---- drivers/nvme/host/pci.c | 8 ++++---- drivers/nvme/host/rdma.c | 14 +++++++------- drivers/nvme/host/tcp.c | 16 ++++++++-------- drivers/nvme/target/loop.c | 4 ++-- 6 files changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 509a44ff0ce5..dd425ae68c07 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4555,7 +4555,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) /* Forcibly unquiesce queues to avoid blocking dispatch */ if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); list_for_each_entry(ns, &ctrl->namespaces, list) nvme_set_queue_dying(ns); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 906cab35afe7..b534a85e2bf1 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2381,7 +2381,7 @@ nvme_fc_ctrl_free(struct kref *ref) list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); blk_cleanup_queue(ctrl->ctrl.admin_q); blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); @@ -2509,7 +2509,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) /* * clean up the admin queue. Same thing as above. */ - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -3098,7 +3098,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << (ilog2(SZ_4K) - 9); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); ret = nvme_init_identify(&ctrl->ctrl); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) @@ -3245,7 +3245,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_fc_free_queue(&ctrl->queues[0]); /* re-enable the admin_q so anything new can fast fail */ - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); /* resume the io queues so that things will fast fail */ nvme_start_queues(&ctrl->ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1b85349f57af..224907d8d5dc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1408,7 +1408,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); + nvme_stop_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); return 0; @@ -1640,7 +1640,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. 
*/ - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1674,7 +1674,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) return -ENODEV; } } else - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); return 0; } @@ -2516,7 +2516,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) if (shutdown) { nvme_start_queues(&dev->ctrl); if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); } mutex_unlock(&dev->shutdown_lock); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 51f4647ea214..5e2c86adeb27 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -918,7 +918,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, else ctrl->ctrl.max_integrity_segments = 0; - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); error = nvme_init_identify(&ctrl->ctrl); if (error) @@ -927,7 +927,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return 0; out_quiesce_queue: - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); @@ -1025,7 +1025,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); if (ctrl->ctrl.admin_tagset) { @@ -1034,7 +1034,7 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); } if (remove) - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl, remove); } @@ -1161,7 +1161,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) nvme_rdma_destroy_io_queues(ctrl, new); } destroy_admin: - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); @@ -1201,7 +1201,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2237,7 +2237,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&ctrl->reconnect_work); nvme_rdma_teardown_io_queues(ctrl, shutdown); - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e99d43989418..c014c5adbac5 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1889,7 +1889,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_stop_queue; - blk_mq_unquiesce_queue(ctrl->admin_q); + 
nvme_start_admin_queue(ctrl); error = nvme_init_identify(ctrl); if (error) @@ -1898,7 +1898,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) return 0; out_quiesce_queue: - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); @@ -1920,7 +1920,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); if (ctrl->admin_tagset) { @@ -1929,7 +1929,7 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); } if (remove) - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1938,7 +1938,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_sync_io_queues(ctrl); @@ -2030,7 +2030,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); @@ -2073,7 +2073,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) /* unquiesce to fail fast pending requests */ nvme_start_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2091,7 +2091,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); nvme_tcp_teardown_io_queues(ctrl, shutdown); - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); if (shutdown) nvme_shutdown_ctrl(ctrl); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index e17710e405a3..5b8a38c9475e 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -396,7 +396,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); error = nvme_init_identify(&ctrl->ctrl); if (error) @@ -426,7 +426,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_loop_destroy_io_queues(ctrl); } - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); -- Gitee From 99f0a498b96a7262d259450e0b54f5318453950b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 15 Nov 2021 19:53:34 +0800 Subject: [PATCH 107/134] nvme: prepare for pairing quiescing and unquiescing mainline inclusion from mainline-v5.16 commit ebc9b95260151d966728cf0063b3b4e465f934d9 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ebc9b95260151d966728cf0063b3b4e465f934d9 Signed-off-by: Yu Changchun --------------------------- Add two helpers so 
that we can prepare for pairing quiescing and unquiescing which will be done in next patch. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211014081710.1871747-4-ming.lei@redhat.com Signed-off-by: Jens Axboe conflict: commit d17e66aadbe5 ("nvme: use set_capacity_and_notify in nvme_set_queue_dying") is not backported. - drivers/nvme/host/core.c Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/nvme/host/core.c | 54 ++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dd425ae68c07..cdea98169131 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,26 +102,6 @@ static void nvme_update_bdev_size(struct gendisk *disk) } } -/* - * Prepare a queue for teardown. - * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. - */ -static void nvme_set_queue_dying(struct nvme_ns *ns) -{ - if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - return; - - blk_set_queue_dying(ns->queue); - blk_mq_unquiesce_queue(ns->queue); - - set_capacity(ns->disk, 0); - nvme_update_bdev_size(ns->disk); -} - static void nvme_queue_scan(struct nvme_ctrl *ctrl) { /* @@ -4540,6 +4520,36 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, } EXPORT_SYMBOL_GPL(nvme_init_ctrl); +static void nvme_start_ns_queue(struct nvme_ns *ns) +{ + blk_mq_unquiesce_queue(ns->queue); +} + +static void nvme_stop_ns_queue(struct nvme_ns *ns) +{ + blk_mq_quiesce_queue(ns->queue); +} + +/* + * Prepare a queue for teardown. + * + * This must forcibly unquiesce queues to avoid blocking dispatch, and only set + * the capacity to 0 after that to avoid blocking dispatchers that may be + * holding bd_butex. This will end buffered writers dirtying pages that can't + * be synced. 
+ */
+static void nvme_set_queue_dying(struct nvme_ns *ns)
+{
+	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+		return;
+
+	blk_set_queue_dying(ns->queue);
+	nvme_start_ns_queue(ns);
+
+	set_capacity(ns->disk, 0);
+	nvme_update_bdev_size(ns->disk);
+}
+
 /**
  * nvme_kill_queues(): Ends all namespace queues
  * @ctrl: the dead controller that needs to end
@@ -4618,7 +4628,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_quiesce_queue(ns->queue);
+		nvme_stop_ns_queue(ns);
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
@@ -4629,7 +4639,7 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_unquiesce_queue(ns->queue);
+		nvme_start_ns_queue(ns);
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
-- 
Gitee

From 50ee909a7dbb10778919b27a91bb05e8d7ad0f8d Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:35 +0800
Subject: [PATCH 108/134] nvme: pairing quiesce/unquiesce

mainline inclusion
from mainline-v5.16
commit 9e6a6b1212100148c109675e003369e3e219dbd9
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9e6a6b1212100148c109675e003369e3e219dbd9

Signed-off-by: Yu Changchun

---------------------------

The current blk_mq_quiesce_queue() and blk_mq_unquiesce_queue() always stop and start the queue unconditionally. And there can be concurrent quiesce/unquiesce coming from different unrelated code paths, so unquiesce may come unexpectedly and start the queue too early.

Prepare for supporting concurrent quiesce/unquiesce from multiple contexts, so that we can address the above issue.

NVMe has a very complicated quiesce/unquiesce usage pattern, so add one atomic bit to make sure that blk-mq quiesce/unquiesce is always called in pairs.

Signed-off-by: Ming Lei
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20211014081710.1871747-5-ming.lei@redhat.com
Signed-off-by: Jens Axboe

Conflict: commit 8c4dfea97f15 ("nvme-fabrics: reject I/O to offline device") is not backported, so struct nvme_ctrl doesn't have the member flags yet. That commit is a feature and contains a lot of code changes, thus introduce the new member in this patch.
- drivers/nvme/host/nvme.h

Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/nvme/host/core.c | 12 ++++++++----
 drivers/nvme/host/nvme.h | 3 +++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index cdea98169131..9ccf44592fe4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4522,12 +4522,14 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
 static void nvme_start_ns_queue(struct nvme_ns *ns)
 {
-	blk_mq_unquiesce_queue(ns->queue);
+	if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
+		blk_mq_unquiesce_queue(ns->queue);
 }
 
 static void nvme_stop_ns_queue(struct nvme_ns *ns)
 {
-	blk_mq_quiesce_queue(ns->queue);
+	if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
+		blk_mq_quiesce_queue(ns->queue);
 }
 
 /*
@@ -4646,13 +4648,15 @@ EXPORT_SYMBOL_GPL(nvme_start_queues);
 
 void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
 {
-	blk_mq_quiesce_queue(ctrl->admin_q);
+	if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+		blk_mq_quiesce_queue(ctrl->admin_q);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
 
 void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
 {
-	blk_mq_unquiesce_queue(ctrl->admin_q);
+	if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+		blk_mq_unquiesce_queue(ctrl->admin_q);
 }
 EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0cad04335e34..ad46208eac76 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -338,6 +338,8 @@ struct nvme_ctrl {
 	u16 icdoff;
 	u16 maxcmd;
 	int nr_reconnects;
+	unsigned long flags;
+#define NVME_CTRL_ADMIN_Q_STOPPED	0
 	struct nvmf_ctrl_options *opts;
 
 	struct page *discard_page;
@@ -449,6 +451,7 @@ struct nvme_ns {
 #define NVME_NS_REMOVING	0
 #define NVME_NS_DEAD		1
 #define NVME_NS_ANA_PENDING	2
+#define NVME_NS_STOPPED		3
 
 	struct nvme_fault_inject fault_inject;
 
-- 
Gitee

From 754a13e02ef4101a935e337c9166fc308f184d04 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:36 +0800
Subject: [PATCH 109/134] nvme: loop: clear NVME_CTRL_ADMIN_Q_STOPPED after admin queue is reallocated

mainline inclusion
from mainline-v5.16
commit 1d35d519d8bf224ccdb43f9a235b8bda2d6d453c
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1d35d519d8bf224ccdb43f9a235b8bda2d6d453c

Signed-off-by: Yu Changchun

---------------------------

The nvme-loop admin queue may be freed and reallocated, and we have to reset the NVME_CTRL_ADMIN_Q_STOPPED flag so that it matches the quiesce state of the new admin queue. nvme-loop is the only driver that reallocates its request queue; no such usage is seen in the other nvme drivers.
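The reset matters because the STOPPED flag lives in the controller while the quiesce state lives in the queue, and reallocation resets only the latter. A sketch of the hazard (an illustrative interleaving under the flag semantics above, not code from the patch; error handling elided):

	nvme_stop_admin_queue(ctrl);	/* sets NVME_CTRL_ADMIN_Q_STOPPED and
					 * quiesces the old admin_q */
	blk_cleanup_queue(ctrl->admin_q);	/* the quiesced queue is freed */
	ctrl->admin_q = blk_mq_init_queue(set);	/* fresh queue, not quiesced */

	/* without the clear_bit() added by this patch, the stale flag would
	 * make the next nvme_start_admin_queue() unquiesce a queue that was
	 * never quiesced, unbalancing the pairing */
	clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags);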
Signed-off-by: Ming Lei
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20211014081710.1871747-6-ming.lei@redhat.com
Signed-off-by: Jens Axboe
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/nvme/target/loop.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 5b8a38c9475e..a05dc29a43f9 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -382,6 +382,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
 		error = PTR_ERR(ctrl->ctrl.admin_q);
 		goto out_cleanup_fabrics_q;
 	}
+	/* reset stopped state for the fresh admin queue */
+	clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags);
 
 	error = nvmf_connect_admin_queue(&ctrl->ctrl);
 	if (error)
-- 
Gitee

From f126f406f47346e8ddb5e22572750667e7feefc5 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:37 +0800
Subject: [PATCH 110/134] blk-mq: support concurrent queue quiesce/unquiesce

mainline inclusion
from mainline-v5.16
commit e70feb8b3e6886c525c88943b5f1508d02f5a683
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e70feb8b3e6886c525c88943b5f1508d02f5a683

Signed-off-by: Yu Changchun

---------------------------

blk_mq_quiesce_queue() is used fairly widely now, but so far we don't support concurrent/nested quiesce. The biggest issue is that unquiesce can happen unexpectedly when quiesce/unquiesce are run concurrently from more than one context.

This patch introduces q->quiesce_depth to handle concurrent quiesce, and the queue is only unquiesced when the last/outer-most of all contexts unquiesces it.

Several kernel panic issues have been reported[1][2][3] when running stress quiesce tests, and this patch has been verified against those reports.
[1] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m1fc52431fad7f33b1ffc3f12c4450e4238540787
[2] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m10ad90afeb9c8cc318334190a7c24c8b5c5e0722
[3] https://listman.redhat.com/archives/dm-devel/2021-September/msg00189.html

Signed-off-by: Ming Lei
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20211014081710.1871747-7-ming.lei@redhat.com
Signed-off-by: Jens Axboe
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/blk-mq.c | 22 +++++++++++++++++++---
 include/linux/blkdev.h | 2 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 70301cc6b18d..1af4f9bbc0fe 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -206,7 +206,12 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 */
 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 {
-	blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->queue_lock, flags);
+	if (!q->quiesce_depth++)
+		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+	spin_unlock_irqrestore(&q->queue_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
@@ -247,10 +252,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 */
 void blk_mq_unquiesce_queue(struct request_queue *q)
 {
-	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+	unsigned long flags;
+	bool run_queue = false;
+
+	spin_lock_irqsave(&q->queue_lock, flags);
+	if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
+		;
+	} else if (!--q->quiesce_depth) {
+		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+		run_queue = true;
+	}
+	spin_unlock_irqrestore(&q->queue_lock, flags);
 
 	/* dispatch requests which are inserted during quiescing */
-	blk_mq_run_hw_queues(q, true);
+	if (run_queue)
+		blk_mq_run_hw_queues(q, true);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7517381b5e15..d6ae309a7dc3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -571,6 +571,8 @@ struct request_queue {
 	 */
 	struct mutex mq_freeze_lock;
 
+	int quiesce_depth;
+
 	struct blk_mq_tag_set *tag_set;
 	struct list_head tag_set_list;
 	struct bio_set bio_split;
-- 
Gitee

From 85442ae4e5f794069953506f1b867908beeca72e Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:38 +0800
Subject: [PATCH 111/134] dm: don't stop request queue after the dm device is suspended

mainline inclusion
from mainline-v5.16
commit a1c2f7e7f25c9d35d3bf046f99682c5373b20fa2
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a1c2f7e7f25c9d35d3bf046f99682c5373b20fa2

Signed-off-by: Yu Changchun

---------------------------

For fixing the queue quiesce race between driver and block layer (elevator switch, nr_requests update, ...), we need to support concurrent quiesce and unquiesce, which requires the two calls to be balanced.

__bind() is only called from dm_swap_table(), at which point the dm device has already been suspended, so there is no need to stop the queue again. This way, request queue quiesce and unquiesce stay balanced.
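The balance requirement comes from the quiesce_depth counter added in the previous patch; two contexts interleaving on one queue show its effect (an illustrative sequence, not code from either patch):

	blk_mq_quiesce_queue(q);	/* context A: depth 0 -> 1, QUIESCED set */
	blk_mq_quiesce_queue(q);	/* context B: depth 1 -> 2 */
	blk_mq_unquiesce_queue(q);	/* context A: depth 2 -> 1, queue stays
					 * quiesced for B */
	blk_mq_unquiesce_queue(q);	/* context B: depth 1 -> 0, QUIESCED
					 * cleared, hw queues re-run */

Before the counter, context A's unquiesce would have cleared QUEUE_FLAG_QUIESCED while context B still relied on the queue being quiesced, which is exactly the unexpected early start described above; an unbalanced extra unquiesce now trips the WARN_ON_ONCE() instead of corrupting state.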
Reported-by: Yi Zhang
Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce")
Signed-off-by: Ming Lei
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/md/dm.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 19a70f434029..3403722b1688 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2038,16 +2038,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
 	dm_table_event_callback(t, event_callback, md);
 
-	/*
-	 * The queue hasn't been stopped yet, if the old table type wasn't
-	 * for request-based during suspension. So stop it to prevent
-	 * I/O mapping before resume.
-	 * This must be done before setting the queue restrictions,
-	 * because request-based dm may be run just after the setting.
-	 */
-	if (request_based)
-		dm_stop_queue(q);
-
 	if (request_based) {
 		/*
 		 * Leverage the fact that request-based DM targets are
-- 
Gitee

From b41c4d926dafdf0943ec6944345ef4216dddef3d Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:39 +0800
Subject: [PATCH 112/134] scsi: avoid quiescing sdev->request_queue twice

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

For fixing the queue quiesce race between driver and block layer (elevator switch, nr_requests update, ...), we need to support concurrent quiesce and unquiesce, which requires the two to be balanced.

blk_mq_quiesce_queue() calls blk_mq_quiesce_queue_nowait() to update the quiesce depth and mark the flag, so scsi_internal_device_block() actually ends up calling blk_mq_quiesce_queue_nowait() twice. Fix the double quiesce and keep quiesce and unquiesce balanced.

Reported-by: Yi Zhang
Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce")
Signed-off-by: Ming Lei
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/scsi/scsi_lib.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index b26c922bd65d..2561d48dfade 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2618,6 +2618,14 @@ scsi_target_resume(struct scsi_target *starget)
 }
 EXPORT_SYMBOL(scsi_target_resume);
 
+static int __scsi_internal_device_block_nowait(struct scsi_device *sdev)
+{
+	if (scsi_device_set_state(sdev, SDEV_BLOCK))
+		return scsi_device_set_state(sdev, SDEV_CREATED_BLOCK);
+
+	return 0;
+}
+
 /**
  * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state
  * @sdev: device to block
@@ -2634,24 +2642,16 @@ EXPORT_SYMBOL(scsi_target_resume);
  */
 int scsi_internal_device_block_nowait(struct scsi_device *sdev)
 {
-	struct request_queue *q = sdev->request_queue;
-	int err = 0;
-
-	err = scsi_device_set_state(sdev, SDEV_BLOCK);
-	if (err) {
-		err = scsi_device_set_state(sdev, SDEV_CREATED_BLOCK);
-
-		if (err)
-			return err;
-	}
+	int ret = __scsi_internal_device_block_nowait(sdev);
 
 	/*
	 * The device has transitioned to SDEV_BLOCK. Stop the
	 * block layer from calling the midlayer with this device's
	 * request queue.
	 */
-	blk_mq_quiesce_queue_nowait(q);
-	return 0;
+	if (!ret)
+		blk_mq_quiesce_queue_nowait(sdev->request_queue);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);
 
@@ -2672,13 +2672,12 @@ EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);
 */
 static int scsi_internal_device_block(struct scsi_device *sdev)
 {
-	struct request_queue *q = sdev->request_queue;
 	int err;
 
 	mutex_lock(&sdev->state_mutex);
-	err = scsi_internal_device_block_nowait(sdev);
+	err = __scsi_internal_device_block_nowait(sdev);
 	if (err == 0)
-		blk_mq_quiesce_queue(q);
+		blk_mq_quiesce_queue(sdev->request_queue);
 	mutex_unlock(&sdev->state_mutex);
 
 	return err;
-- 
Gitee

From 573b1d2ad4c3e381a7cb60e4b9def44201097bb6 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:40 +0800
Subject: [PATCH 113/134] scsi: make sure that request queue quiesce and unquiesce are balanced

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

For fixing the queue quiesce race between driver and block layer (elevator switch, nr_requests update, ...), we need to support concurrent quiesce and unquiesce, which requires the two calls to be balanced.

It isn't easy to audit that in all scsi drivers, especially since the two may be called from different contexts, so do it in the scsi core with one per-device bit flag and a global spinlock; this is basically zero cost since request queue quiesce is seldom triggered.

Reported-by: Yi Zhang
Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce")
Signed-off-by: Ming Lei
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/scsi/scsi_lib.c | 45 ++++++++++++++++++++++++++++--------
 include/scsi/scsi_device.h | 1 +
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2561d48dfade..9582aabf2ff1 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2626,6 +2626,40 @@ static int __scsi_internal_device_block_nowait(struct scsi_device *sdev)
 	return 0;
 }
 
+static DEFINE_SPINLOCK(sdev_queue_stop_lock);
+
+void scsi_start_queue(struct scsi_device *sdev)
+{
+	bool need_start;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sdev_queue_stop_lock, flags);
+	need_start = sdev->queue_stopped;
+	sdev->queue_stopped = 0;
+	spin_unlock_irqrestore(&sdev_queue_stop_lock, flags);
+
+	if (need_start)
+		blk_mq_unquiesce_queue(sdev->request_queue);
+}
+
+static void scsi_stop_queue(struct scsi_device *sdev, bool nowait)
+{
+	bool need_stop;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sdev_queue_stop_lock, flags);
+	need_stop = !sdev->queue_stopped;
+	sdev->queue_stopped = 1;
+	spin_unlock_irqrestore(&sdev_queue_stop_lock, flags);
+
+	if (need_stop) {
+		if (nowait)
+			blk_mq_quiesce_queue_nowait(sdev->request_queue);
+		else
+			blk_mq_quiesce_queue(sdev->request_queue);
+	}
+}
+
 /**
  * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state
  * @sdev: device to block
@@ -2650,7 +2684,7 @@ int scsi_internal_device_block_nowait(struct scsi_device *sdev)
	 * request queue.
*/ if (!ret) - blk_mq_quiesce_queue_nowait(sdev->request_queue); + scsi_stop_queue(sdev, true); return ret; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); @@ -2677,19 +2711,12 @@ static int scsi_internal_device_block(struct scsi_device *sdev) mutex_lock(&sdev->state_mutex); err = __scsi_internal_device_block_nowait(sdev); if (err == 0) - blk_mq_quiesce_queue(sdev->request_queue); + scsi_stop_queue(sdev, false); mutex_unlock(&sdev->state_mutex); return err; } -void scsi_start_queue(struct scsi_device *sdev) -{ - struct request_queue *q = sdev->request_queue; - - blk_mq_unquiesce_queue(q); -} - /** * scsi_internal_device_unblock_nowait - resume a device after a block request * @sdev: device to resume diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 1a5c9a3df6d6..2b8c3df03f13 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -204,6 +204,7 @@ struct scsi_device { unsigned unmap_limit_for_ws:1; /* Use the UNMAP limit for WRITE SAME */ unsigned rpm_autosuspend:1; /* Enable runtime autosuspend at device * creation time */ + unsigned queue_stopped:1; /* request queue is quiesced */ bool offline_already; /* Device offline message logged */ -- Gitee From a7c2894c8f4e306e184d908b6ad7dbd45a706fbc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 15 Nov 2021 19:53:41 +0800 Subject: [PATCH 114/134] bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode mainline inclusion from mainline-v5.15-rc2 commit 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Signed-off-by: Yu Changchun ----------------------------- Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used. Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") embedded per-socket cgroup information into sock->sk_cgrp_data and in order to save 8 bytes in struct sock made both mutually exclusive, that is, when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2 falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp). The assumption made was "there is no reason to mix the two and this is in line with how legacy and v2 compatibility is handled" as stated in bd1060a1d671. However, with Kubernetes more widely supporting cgroups v2 as well nowadays, this assumption no longer holds, and the possibility of the v1/v2 mixed mode with the v2 root fallback being hit becomes a real security issue. Many of the cgroup v2 BPF programs are also used for policy enforcement, just to pick _one_ example, that is, to programmatically deny socket related system calls like connect(2) or bind(2). A v2 root fallback would implicitly cause a policy bypass for the affected Pods. In production environments, we have recently seen this case due to various circumstances: i) a different 3rd party agent and/or ii) a container runtime such as [0] in the user's environment configuring legacy cgroup v1 net_cls tags, which triggered implicitly mentioned root fallback. Another case is Kubernetes projects like kind [1] which create Kubernetes nodes in a container and also add cgroup namespaces to the mix, meaning programs which are attached to the cgroup v2 root of the cgroup namespace get attached to a non-root cgroup v2 path from init namespace point of view. And the latter's root is out of reach for agents on a kind Kubernetes node to configure. 
Meaning, any entity on the node setting cgroup v1 net_cls tag will trigger the bypass despite cgroup v2 BPF programs attached to the namespace root. Generally, this mutual exclusiveness does not hold anymore in today's user environments and makes cgroup v2 usage from BPF side fragile and unreliable. This fix adds proper struct cgroup pointer for the cgroup v2 case to struct sock_cgroup_data in order to address these issues; this implicitly also fixes the tradeoffs being made back then with regards to races and refcount leaks as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF programs always operate as expected. [0] https://github.com/nestybox/sysbox/ [1] https://kind.sigs.k8s.io/ Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Stanislav Fomichev Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net Conflicts: include/linux/cgroup-defs.h net/core/netclassid_cgroup.c net/core/netprio_cgroup.c Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/cgroup-defs.h | 107 +++++++++-------------------------- include/linux/cgroup.h | 22 +------ kernel/cgroup/cgroup.c | 50 ++++------------ net/core/netclassid_cgroup.c | 7 +-- net/core/netprio_cgroup.c | 11 +--- 5 files changed, 42 insertions(+), 155 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ffec16930b00..97ce0a12367c 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -766,107 +766,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * - * On legacy hierarchies, net_prio and net_cls controllers directly set - * attributes on each sock which can then be tested by the network layer. - * On the default hierarchy, each sock is associated with the cgroup it was - * created in and the networking layer can match the cgroup directly. - * - * To avoid carrying all three cgroup related fields separately in sock, - * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. - * On boot, sock_cgroup_data records the cgroup that the sock was created - * in so that cgroup2 matches can be made; however, once either net_prio or - * net_cls starts being used, the area is overriden to carry prioidx and/or - * classid. The two modes are distinguished by whether the lowest bit is - * set. Clear bit indicates cgroup pointer while set bit prioidx and - * classid. - * - * While userland may start using net_prio or net_cls at any time, once - * either is used, cgroup2 matching no longer works. There is no reason to - * mix the two and this is in line with how legacy and v2 compatibility is - * handled. On mode switch, cgroup references which are already being - * pointed to by socks may be leaked. While this can be remedied by adding - * synchronization around sock_cgroup_data, given that the number of leaked - * cgroups is bound and highly unlikely to be high, this seems to be the - * better trade-off. + * On legacy hierarchies, net_prio and net_cls controllers directly + * set attributes on each sock which can then be tested by the network + * layer. 
On the default hierarchy, each sock is associated with the + * cgroup it was created in and the networking layer can match the + * cgroup directly. */ struct sock_cgroup_data { - union { -#ifdef __LITTLE_ENDIAN - struct { - u8 is_data : 1; - u8 no_refcnt : 1; - u8 unused : 6; - u8 padding; - u16 prioidx; - u32 classid; - } __packed; -#else - struct { - u32 classid; - u16 prioidx; - u8 padding; - u8 unused : 6; - u8 no_refcnt : 1; - u8 is_data : 1; - } __packed; + struct cgroup *cgroup; /* v2 */ +#ifdef CONFIG_CGROUP_NET_CLASSID + u32 classid; /* v1 */ +#endif +#ifdef CONFIG_CGROUP_NET_PRIO + u16 prioidx; /* v1 */ #endif - u64 val; - }; }; -/* - * There's a theoretical window where the following accessors race with - * updaters and return part of the previous pointer as the prioidx or - * classid. Such races are short-lived and the result isn't critical. - */ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { - /* fallback to 1 which is always the ID of the root cgroup */ - return (skcd->is_data & 1) ? skcd->prioidx : 1; +#ifdef CONFIG_CGROUP_NET_PRIO + return READ_ONCE(skcd->prioidx); +#else + return 1; +#endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { - /* fallback to 0 which is the unconfigured default classid */ - return (skcd->is_data & 1) ? skcd->classid : 0; +#ifdef CONFIG_CGROUP_NET_CLASSID + return READ_ONCE(skcd->classid); +#else + return 0; +#endif } -/* - * If invoked concurrently, the updaters may clobber each other. The - * caller is responsible for synchronization. - */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_prioidx(&skcd_buf) == prioidx) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.prioidx = prioidx; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_PRIO + WRITE_ONCE(skcd->prioidx, prioidx); +#endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_classid(&skcd_buf) == classid) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.classid = classid; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_CLASSID + WRITE_ONCE(skcd->classid, classid); +#endif } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 618838c48313..9c88b7da3245 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -816,33 +816,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task, */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) -extern spinlock_t cgroup_sk_update_lock; -#endif - -void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - unsigned long v; - - /* - * @skcd->val is 64bit but the following is safe on 32bit too as we - * just need the lower ulong to be written and read atomically. 
- */ - v = READ_ONCE(skcd->val); - - if (v & 3) - return &cgrp_dfl_root.cgrp; - - return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; -#else - return (struct cgroup *)(unsigned long)skcd->val; -#endif + return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 017bfc698109..e735da6a4d4e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6439,74 +6439,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) return; rcu_read_lock(); - while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; + skcd->cgroup = cset->dfl_cgrp; cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); } - rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). 
+ */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 41b24cd31562..b6de5ee22391 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -72,11 +72,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file, &err); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + if (sock) sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); - spin_unlock(&cgroup_sk_update_lock); - } if (--ctx->batch == 0) { ctx->batch = UPDATE_CLASSID_BATCH; return n + 1; @@ -122,8 +119,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, struct css_task_iter it; struct task_struct *p; - cgroup_sk_alloc_disable(); - cs->classid = (u32)value; css_task_iter_start(css, 0, &it); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9bd4cab7d510..34c3e49814e2 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; - cgroup_sk_alloc_disable(); - rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -222,12 +220,11 @@ static int update_netprio(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + + if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); - spin_unlock(&cgroup_sk_update_lock); - } + return 0; } @@ -236,8 +233,6 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct task_struct *p; struct cgroup_subsys_state *css; - cgroup_sk_alloc_disable(); - cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; -- Gitee From 99027e355eb6358cccb3b1b23763ca78e5e99948 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 15 Nov 2021 19:53:42 +0800 Subject: [PATCH 115/134] bpf, cgroup: Assign cgroup in cgroup_sk_alloc when called from interrupt mainline inclusion from mainline-v5.15-rc4 commit 78cc316e9583067884eb8bd154301dc1e9ee945c issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=78cc316e9583067884eb8bd154301dc1e9ee945c Signed-off-by: Yu Changchun ------------------------------- If cgroup_sk_alloc() is called from interrupt context, then just assign the root cgroup to skcd->cgroup. Prior to commit 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode") we would just return, and later on in sock_cgroup_ptr(), we were NULL-testing the cgroup in fast-path, and iff indeed NULL returning the root cgroup (v ?: &cgrp_dfl_root.cgrp). Rather than re-adding the NULL-test to the fast-path we can just assign it once from cgroup_sk_alloc() given v1/v2 handling has been simplified. The migration from NULL test with returning &cgrp_dfl_root.cgrp to assigning &cgrp_dfl_root.cgrp directly does /not/ change behavior for callers of sock_cgroup_ptr(). syzkaller was able to trigger a splat in the legacy netrom code base, where the RX handler in nr_rx_frame() calls nr_make_new() which calls sk_alloc() and therefore cgroup_sk_alloc() with in_interrupt() condition. 
Thus the NULL skcd->cgroup, where it trips over on the cgroup_sk_free() side given it expects a non-NULL object.

There are a few other candidates aside from netrom which have a similar pattern where, in their accept-like implementation, they just call sk_alloc() and thus cgroup_sk_alloc() instead of sk_clone_lock() with the corresponding cgroup_sk_clone(), which then inherits the cgroup from the parent socket. None of them are related to core protocols where BPF cgroup programs are running from. However, in the future, they should follow suit and implement a similar inheritance mechanism.

Additionally, with a !CONFIG_CGROUP_NET_PRIO and !CONFIG_CGROUP_NET_CLASSID configuration, the same issue was exposed also prior to 8520e224f547 due to commit e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated cgroup") which added the early in_interrupt() return back then.

Fixes: 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode")
Fixes: e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated cgroup")
Reported-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com
Reported-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
Tested-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com
Tested-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com
Acked-by: Tejun Heo
Link: https://lore.kernel.org/bpf/20210927123921.21535-1-daniel@iogearbox.net
Signed-off-by: Lu Jialin
Reviewed-by: Xiu Jianfeng
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 kernel/cgroup/cgroup.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e735da6a4d4e..350297ad63f1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6441,22 +6441,29 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
 
 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 {
-	/* Don't associate the sock with unrelated interrupted task's cgroup. */
-	if (in_interrupt())
-		return;
+	struct cgroup *cgroup;
 
 	rcu_read_lock();
+	/* Don't associate the sock with unrelated interrupted task's cgroup. */
+	if (in_interrupt()) {
+		cgroup = &cgrp_dfl_root.cgrp;
+		cgroup_get(cgroup);
+		goto out;
+	}
+
 	while (true) {
 		struct css_set *cset;
 
 		cset = task_css_set(current);
 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
-			skcd->cgroup = cset->dfl_cgrp;
-			cgroup_bpf_get(cset->dfl_cgrp);
+			cgroup = cset->dfl_cgrp;
 			break;
 		}
 		cpu_relax();
 	}
+out:
+	skcd->cgroup = cgroup;
+	cgroup_bpf_get(cgroup);
 	rcu_read_unlock();
 }
-- 
Gitee

From e1d3894a000df9e757b6112954ad80c629c71112 Mon Sep 17 00:00:00 2001
From: Guo Xuenan
Date: Mon, 15 Nov 2021 19:53:43 +0800
Subject: [PATCH 116/134] ubi: fix slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

-------------------------------------------------

When using the ioctl interface to resize a ubi volume, ubi_resize_volume() resizes the eba table first, but does not change vol->reserved_pebs in the same atomic context, which may cause concurrent access to the eba table. For example, when a user shrinks ubi volume A by calling ubi_resize_volume() while another thread is writing (to volume B) and triggering wear-leveling, which may call ubi_write_fastmap(), KASAN may report: slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130.

The main work of this patch includes: 1.
Fix races in ubi_resize_volume and ubi_update_fastmap to avoid reading eba_tbl out of bounds. First, we make eba_tbl and reserved_pebs be updated under the protection of ubi->volumes_lock. Second, we roll the volume back in case of resize failure. Note that for a volume shrinking failure, since part of the volume has already been shrunk and unmapped, there is no need to recover {rsvd/avail}_pebs. 2. Fix memory leaks in the error path of ubi_resize_volume when destroying new_eba_tbl. BUG: KASAN: slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130 [ubi] Read of size 4 at addr ffff888013ff7170 by task kworker/u16:3/160 CPU: 6 PID: 160 Comm: kworker/u16:3 Not tainted Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: dump_stack+0x9c/0xcf print_address_description.constprop.0+0x1c/0x220 kasan_report.cold+0x1f/0x37 ubi_eba_get_ldesc+0xfb/0x130 [ubi] ubi_update_fastmap.cold+0x6be/0xc6b [ubi] ubi_wl_get_peb+0x2a2/0x580 [ubi] try_write_vid_and_data+0x9a/0x4d0 [ubi] ubi_eba_write_leb+0x780/0x1890 [ubi] ubi_leb_map+0x197/0x2c0 [ubi] ubifs_leb_map+0x139/0x240 [ubifs] ubifs_add_bud_to_log+0xb02/0xea0 [ubifs] make_reservation+0x860/0xb40 [ubifs] ubifs_jnl_write_data+0x461/0x9b0 [ubifs] do_writepage+0x375/0x550 [ubifs] ubifs_writepage+0x3a4/0x670 [ubifs] __writepage+0x61/0x160 write_cache_pages+0x433/0xb70 do_writepages+0x1ad/0x260 __writeback_single_inode+0xb3/0x810 writeback_sb_inodes+0x4d9/0xcc0 __writeback_inodes_wb+0xc1/0x260 wb_writeback+0x585/0x760 wb_workfn+0x751/0xdb0 process_one_work+0x6e5/0xf10 worker_thread+0x5e1/0x10c0 kthread+0x335/0x400 ret_from_fork+0x1f/0x30 Allocated by task 690: kasan_save_stack+0x1b/0x40 __kasan_kmalloc.constprop.0+0xc2/0xd0 ubi_eba_create_table+0x88/0x1a0 [ubi] ubi_resize_volume.cold+0x166/0xc75 [ubi] ubi_cdev_ioctl+0x57f/0x1aa0 [ubi] __x64_sys_ioctl+0x141/0x1a0 do_syscall_64+0x2d/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The following steps can be used to reproduce: Process 1: write and trigger ubi wear-leveling ubimkvol /dev/ubi0 -s 5000MiB -N v1 ubimkvol /dev/ubi0 -s 2000MiB -N v2 ubimkvol /dev/ubi0 -s 10MiB -N v3 mount -t ubifs /dev/ubi0_0 /mnt/ubifs while true; do filename=/mnt/ubifs/$((RANDOM)) dd if=/dev/random of=${filename} bs=1M count=$((RANDOM % 1000)) rm -rf ${filename} sync /mnt/ubifs/ done Process 2: do random resize struct ubi_rsvol_req req; req.vol_id = 1; req.bytes = (rand() % 50) * 512KB; ioctl(fd, UBI_IOCRSVOL, &req); Reported-by: Hulk Robot Signed-off-by: Guo Xuenan Reviewed-by: Zhang Xiaoxu Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/mtd/ubi/vmt.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 1bc7b3a05604..04d0e217ea1c 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -409,6 +409,7 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) struct ubi_device *ubi = vol->ubi; struct ubi_vtbl_record vtbl_rec; struct ubi_eba_table *new_eba_tbl = NULL; + struct ubi_eba_table *old_eba_tbl = NULL; int vol_id = vol->vol_id; if (ubi->ro_mode) @@ -454,10 +455,13 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) err = -ENOSPC; goto out_free; } + ubi->avail_pebs -= pebs; ubi->rsvd_pebs += pebs; ubi_eba_copy_table(vol, new_eba_tbl, vol->reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; +
vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -465,14 +469,16 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) for (i = 0; i < -pebs; i++) { err = ubi_eba_unmap_leb(ubi, vol, reserved_pebs + i); if (err) - goto out_acc; + goto out_free; } spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs += pebs; ubi->avail_pebs -= pebs; ubi_update_reserved(ubi); ubi_eba_copy_table(vol, new_eba_tbl, reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; + vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -494,27 +500,32 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) if (err) goto out_acc; - vol->reserved_pebs = reserved_pebs; if (vol->vol_type == UBI_DYNAMIC_VOLUME) { vol->used_ebs = reserved_pebs; vol->last_eb_bytes = vol->usable_leb_size; vol->used_bytes = (long long)vol->used_ebs * vol->usable_leb_size; } - + /* destroy old table */ + ubi_eba_destroy_table(old_eba_tbl); ubi_volume_notify(ubi, vol, UBI_VOLUME_RESIZED); self_check_volumes(ubi); return err; out_acc: + spin_lock(&ubi->volumes_lock); + vol->reserved_pebs = reserved_pebs - pebs; if (pebs > 0) { - spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs -= pebs; ubi->avail_pebs += pebs; - spin_unlock(&ubi->volumes_lock); + ubi_eba_copy_table(vol, old_eba_tbl, vol->reserved_pebs); + } else { + ubi_eba_copy_table(vol, old_eba_tbl, reserved_pebs); } + vol->eba_tbl = old_eba_tbl; + spin_unlock(&ubi->volumes_lock); out_free: - kfree(new_eba_tbl); + ubi_eba_destroy_table(new_eba_tbl); return err; } -- Gitee From a65a96a39ce2be562f340d52c8a5146592b4f607 Mon Sep 17 00:00:00 2001 From: Zheng Liang Date: Mon, 15 Nov 2021 19:53:47 +0800 Subject: [PATCH 117/134] squashfs: provides backing_dev_info in order to disable read-ahead maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ------------------------------------------------- The commit c1f6925e1091 ("mm: put readahead pages in cache earlier") causes the read performance of squashfs to deteriorate. Through testing, we find that performance recovers once squashfs read-ahead is disabled. So we follow the way of ubifs: provide a backing_dev_info and disable read-ahead.
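Why zeroing ra_pages is enough can be sketched from the VFS side (an illustrative aside; the exact mm details vary by kernel version, so treat this as an assumption-level sketch rather than the patch's own reasoning):

/*
 * Every file opened on the mount inherits its readahead window from
 * the superblock's bdi (mm/readahead.c, circa 5.10):
 *
 *     void file_ra_state_init(struct file_ra_state *ra,
 *                             struct address_space *mapping)
 *     {
 *             ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
 *             ...
 *     }
 *
 * With sb->s_bdi->ra_pages == 0 the on-demand readahead path sees a
 * zero-sized window and reads nothing speculatively, and io_pages == 0
 * clamps the explicit/forced readahead path as well, so only the pages
 * actually requested get read and decompressed.
 */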
Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/squashfs/super.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 88cc94be1076..f7128ae7b949 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -26,6 +26,7 @@ #include #include #include +#include <linux/backing-dev.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -64,6 +65,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } +static int squashfs_bdi_init(struct super_block *sb) +{ + int err; + unsigned int major = MAJOR(sb->s_dev); + unsigned int minor = MINOR(sb->s_dev); + + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + + err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); + if (err) + return err; + + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; + + return 0; +} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -78,6 +97,19 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); + /* + * squashfs provides 'backing_dev_info' in order to disable read-ahead. For + * squashfs, I/O is not deferred, it is done immediately in readpage, + * which means the user would have to wait not just for their own I/O + * but the read-ahead I/O as well i.e. completely pointless. squashfs_bdi_init + * will set sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0. + */ + err = squashfs_bdi_init(sb); + if (err) { + errorf(fc, "squashfs init bdi failed"); + return err; + } + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); -- Gitee From 5196327c6f70763d03841f02d0332a2571e7cc29 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 15 Nov 2021 19:53:49 +0800 Subject: [PATCH 118/134] net: phy: fix duplex out of sync problem while changing settings mainline inclusion from mainline commit: a4db9055fdb9cf607775c66d39796caf6439ec92 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a4db9055fdb9cf607775c66d39796caf6439ec92 Signed-off-by: Yu Changchun ------------------------------------------------- As reported by Zhang there's a small issue if in forced mode the duplex mode changes with the link staying up [0]. In this case the MAC isn't notified about the change. The proposed patch relies on the phylib state machine and ignores the fact that there are drivers that use phylib but not the phylib state machine. So let's not change the behavior for such drivers and fix it w/o re-adding state PHY_FORCING for the case that the phylib state machine is used.
[0] https://lore.kernel.org/netdev/a5c26ffd-4ee4-a5e6-4103-873208ce0dc5@huawei.com/T/ Fixes: 2bd229df5e2e ("net: phy: remove state PHY_FORCING") Reported-by: Zhang Changzhong Tested-by: Zhang Changzhong Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/7b8b9456-a93f-abbc-1dc5-a2c2542f932c@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Zhang Changzhong Reviewed-by: Wei Yongjun Reviewed-by: Yue Haibing Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/net/phy/phy.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 5ee7cde0c2e9..db7866b6f752 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -831,7 +831,12 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; /* Restart the PHY */ - _phy_start_aneg(phydev); + if (phy_is_started(phydev)) { + phydev->state = PHY_UP; + phy_trigger_machine(phydev); + } else { + _phy_start_aneg(phydev); + } mutex_unlock(&phydev->lock); return 0; -- Gitee From 8556aedf2dd0b93db9806f7a978be83fee162567 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 30 Nov 2021 16:32:03 +0800 Subject: [PATCH 119/134] ubifs: fix slab-out-of-bounds in ubifs_change_lp maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Hulk Robot reported a KASAN slab-out-of-bounds report: ================================================================== BUG: KASAN: slab-out-of-bounds in ubifs_change_lp+0x3a9/0x1390 [ubifs] Read of size 8 at addr ffff888101c961f8 by task fsstress/1068 [...] Call Trace: check_memory_region+0x1c1/0x1e0 ubifs_change_lp+0x3a9/0x1390 [ubifs] ubifs_change_one_lp+0x170/0x220 [ubifs] ubifs_garbage_collect+0x7f9/0xda0 [ubifs] ubifs_budget_space+0xfe4/0x1bd0 [ubifs] ubifs_write_begin+0x528/0x10c0 [ubifs] Allocated by task 1068: kasan_save_stack+0x23/0x50 __kasan_kmalloc+0x83/0xa0 kmemdup+0x25/0x50 ubifs_lpt_lookup_dirty+0x372/0xb00 [ubifs] ubifs_update_one_lp+0x46/0x260 [ubifs] ubifs_tnc_end_commit+0x98b/0x1720 [ubifs] do_commit+0x6cb/0x1950 [ubifs] ubifs_run_commit+0x15a/0x2b0 [ubifs] ubifs_budget_space+0x1061/0x1bd0 [ubifs] ubifs_write_begin+0x528/0x10c0 [ubifs] [...] ================================================================== In ubifs_garbage_collect(), if ubifs_find_dirty_leb() returns an error, lp is left uninitialized. But lp.lnum might be used in the out branch, where it holds a random value. If that value is -1 or another value that can pass the checks, a slab-out-of-bounds access may occur in ubifs_change_lp() later in the procedure. To solve this problem, we initialize lp.lnum to -1; ubifs_find_dirty_leb() then sets it to a valid LEB number (never -1) on success, and ubifs_return_leb() is executed only when lp.lnum != -1. Additionally, if we find a retained or indexing LEB and continue to the next loop iteration, but break before finding another LEB, the error path would call ubifs_return_leb() with the stale lnum and wrongly clear that LEB's "taken" flag. This bug is also fixed by this patch.
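The fixed control flow, distilled (a sketch of the hunks below, not the complete function):

/* sketch: per-iteration sentinel in ubifs_garbage_collect() */
for (i = 0; ; i++) {
        lp.lnum = -1;           /* no LEB picked in this iteration yet */
        ...
        ret = ubifs_find_dirty_leb(c, &lp, ...);
        if (ret)
                break;          /* lp untouched on error: lnum stays -1 */
        ...                     /* on success lp.lnum holds a real LEB */
}
...
/* error path: only return a LEB we actually hold */
if (lp.lnum != -1)
        ubifs_return_leb(c, lp.lnum);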
Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/gc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index dc3e26e9ed7b..05e1eeae8457 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -692,6 +692,9 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) for (i = 0; ; i++) { int space_before, space_after; + /* Maybe continue after find and break before find */ + lp.lnum = -1; + cond_resched(); /* Give the commit an opportunity to run */ @@ -843,7 +846,8 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) ubifs_wbuf_sync_nolock(wbuf); ubifs_ro_mode(c, ret); mutex_unlock(&wbuf->io_mutex); - ubifs_return_leb(c, lp.lnum); + if (lp.lnum != -1) + ubifs_return_leb(c, lp.lnum); return ret; } -- Gitee From 3984364bc3db5e2e434b5409394145db8bd3e7e0 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 30 Nov 2021 16:32:04 +0800 Subject: [PATCH 120/134] ubifs: fix double return leb in ubifs_garbage_collect maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- If ubifs_garbage_collect_leb() returns -EAGAIN and enters the "out" branch, ubifs_return_leb() will be executed twice on the same lnum. This can cause data loss in concurrent situations. Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/gc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 05e1eeae8457..238a60294d10 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -758,6 +758,8 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) err = ubifs_return_leb(c, lp.lnum); if (err) ret = err; + /* Maybe double return if go out */ + lp.lnum = -1; break; } goto out; -- Gitee From 8b9f8be4671a855818fdf826ec4178052aeb1bb7 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 30 Nov 2021 16:32:05 +0800 Subject: [PATCH 121/134] ubifs: read-only if LEB may always be taken in ubifs_garbage_collect maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- If ubifs_garbage_collect_leb() returns -EAGAIN and ubifs_return_leb() returns an error, a LEB would be left with its "taken" flag set forever. In this case, set the filesystem read-only to prevent a worse situation. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/gc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 238a60294d10..e43c8161423d 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -756,8 +756,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) * caller instead of the original '-EAGAIN'. */ err = ubifs_return_leb(c, lp.lnum); - if (err) + if (err) { ret = err; + /* LEB may always be "taken". So set + * the ubifs to read-only. Sync wbuf + * will return -EROFS, then go "out".
+ */ + ubifs_ro_mode(c, ret); + } /* Maybe double return if go out */ lp.lnum = -1; break; -- Gitee From c001574436a8a5f477992703380f2f741d520f35 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Tue, 30 Nov 2021 16:32:06 +0800 Subject: [PATCH 122/134] blkcg: Remove extra blkcg_bio_issue_init maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- KASAN reports a use-after-free report when doing block test: ================================================================== [10050.967049] BUG: KASAN: use-after-free in submit_bio_checks+0x1539/0x1550 [10050.977638] Call Trace: [10050.978190] dump_stack+0x9b/0xce [10050.979674] print_address_description.constprop.6+0x3e/0x60 [10050.983510] kasan_report.cold.9+0x22/0x3a [10050.986089] submit_bio_checks+0x1539/0x1550 [10050.989576] submit_bio_noacct+0x83/0xc80 [10050.993714] submit_bio+0xa7/0x330 [10050.994435] mpage_readahead+0x380/0x500 [10050.998009] read_pages+0x1c1/0xbf0 [10051.002057] page_cache_ra_unbounded+0x4c2/0x6f0 [10051.007413] do_page_cache_ra+0xda/0x110 [10051.008207] force_page_cache_ra+0x23d/0x3d0 [10051.009087] page_cache_sync_ra+0xca/0x300 [10051.009970] generic_file_buffered_read+0xbea/0x2130 [10051.012685] generic_file_read_iter+0x315/0x490 [10051.014472] blkdev_read_iter+0x113/0x1b0 [10051.015300] aio_read+0x2ad/0x450 [10051.023786] io_submit_one+0xc8e/0x1d60 [10051.029855] __se_sys_io_submit+0x125/0x350 [10051.033442] do_syscall_64+0x2d/0x40 [10051.034156] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [10051.048733] Allocated by task 18598: [10051.049482] kasan_save_stack+0x19/0x40 [10051.050263] __kasan_kmalloc.constprop.1+0xc1/0xd0 [10051.051230] kmem_cache_alloc+0x146/0x440 [10051.052060] mempool_alloc+0x125/0x2f0 [10051.052818] bio_alloc_bioset+0x353/0x590 [10051.053658] mpage_alloc+0x3b/0x240 [10051.054382] do_mpage_readpage+0xddf/0x1ef0 [10051.055250] mpage_readahead+0x264/0x500 [10051.056060] read_pages+0x1c1/0xbf0 [10051.056758] page_cache_ra_unbounded+0x4c2/0x6f0 [10051.057702] do_page_cache_ra+0xda/0x110 [10051.058511] force_page_cache_ra+0x23d/0x3d0 [10051.059373] page_cache_sync_ra+0xca/0x300 [10051.060198] generic_file_buffered_read+0xbea/0x2130 [10051.061195] generic_file_read_iter+0x315/0x490 [10051.062189] blkdev_read_iter+0x113/0x1b0 [10051.063015] aio_read+0x2ad/0x450 [10051.063686] io_submit_one+0xc8e/0x1d60 [10051.064467] __se_sys_io_submit+0x125/0x350 [10051.065318] do_syscall_64+0x2d/0x40 [10051.066082] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [10051.067455] Freed by task 13307: [10051.068136] kasan_save_stack+0x19/0x40 [10051.068931] kasan_set_track+0x1c/0x30 [10051.069726] kasan_set_free_info+0x1b/0x30 [10051.070621] __kasan_slab_free+0x111/0x160 [10051.071480] kmem_cache_free+0x94/0x460 [10051.072256] mempool_free+0xd6/0x320 [10051.072985] bio_free+0xe0/0x130 [10051.073630] bio_put+0xab/0xe0 [10051.074252] bio_endio+0x3a6/0x5d0 [10051.074984] blk_update_request+0x590/0x1370 [10051.075870] scsi_end_request+0x7d/0x400 [10051.076667] scsi_io_completion+0x1aa/0xe50 [10051.077503] scsi_softirq_done+0x11b/0x240 [10051.078344] blk_mq_complete_request+0xd4/0x120 [10051.079275] scsi_mq_done+0xf0/0x200 [10051.080036] virtscsi_vq_done+0xbc/0x150 [10051.080850] vring_interrupt+0x179/0x390 [10051.081650] __handle_irq_event_percpu+0xf7/0x490 [10051.082626] handle_irq_event_percpu+0x7b/0x160 [10051.083527] handle_irq_event+0xcc/0x170 [10051.084297] handle_edge_irq+0x215/0xb20 [10051.085122] asm_call_irq_on_stack+0xf/0x20 [10051.085986] 
common_interrupt+0xae/0x120 [10051.086830] asm_common_interrupt+0x1e/0x40 ================================================================== A bio is checked at the beginning of submit_bio_noacct(). If the bio needs to be throttled, the pending timer is started and submission stops immediately; the bio is then submitted from blk_throtl_dispatch_work_fn() when the timer expires. But in the current code, even when the bio is throttled, bio issue->value is still set via blkcg_bio_issue_init(). This is redundant and may cause the above use-after-free:

CPU0                                     CPU1
submit_bio
  submit_bio_noacct
    submit_bio_checks
      blk_throtl_bio()
        <= mod_timer(&sq->pending_timer)
                                         blk_throtl_dispatch_work_fn
                                           submit_bio_noacct()
                                             <= bio has the throttle tag, is
                                                submitted directly, and bio
                                                issue->value is set here
                                         bio_endio()
                                           bio_put()
                                             bio_free() <= free this bio
      blkcg_bio_issue_init(bio)
        <= bio has been freed and will lead to UAF
      return BLK_QC_T_NONE

Fix this by removing the extra blkcg_bio_issue_init() call. Fixes: e439bedf6b24 (blkcg: consolidate bio_issue_init() to be a part of core) Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 63e9c0d78537..a50205bcb755 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -897,10 +897,8 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); - if (blk_throtl_bio(bio)) { - blkcg_bio_issue_init(bio); + if (blk_throtl_bio(bio)) return false; - } blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); -- Gitee From 2303356bfaa7b1f555df668a775379f973bc88eb Mon Sep 17 00:00:00 2001 From: Li Hua Date: Tue, 30 Nov 2021 16:32:07 +0800 Subject: [PATCH 123/134] sched/rt: Try to restart rt period timer when rt runtime exceeded maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- When rt_runtime is modified from -1 to a valid control value, it may cause the task to be throttled all the time. Operations like the following will trigger the bug. E.g.: 1. echo -1 > /proc/sys/kernel/sched_rt_runtime_us 2. Run a FIFO task named A that executes while(1) 3. echo 950000 > /proc/sys/kernel/sched_rt_runtime_us When rt_runtime is -1, the rt period timer will not be activated when task A is enqueued, and the task will then be throttled after setting rt_runtime to 950,000. The task remains throttled forever because the rt period timer is never activated.
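The underlying mechanism, sketched (an editorial summary, simplified from kernel/sched/rt.c; not part of the patch text):

/*
 * sched_rt_runtime_exceeded() throttles the runqueue by setting
 * rt_rq->rt_throttled = 1.  The only path that replenishes runtime and
 * clears rt_throttled is do_sched_rt_period_timer(), driven by
 * rt_b->rt_period_timer.  While rt_runtime is -1 (RUNTIME_INF),
 * rt_bandwidth_enabled() is false and that timer is never armed at
 * enqueue time; switching to a finite runtime afterwards throttles the
 * rt_rq with no timer pending to unthrottle it.  Hence the fix below
 * re-arms the period timer at the moment the runtime limit trips.
 */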
Signed-off-by: Li Hua Reviewed-by: Chen Hui Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/sched/rt.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad9bec90fb7a..dae1e8eaa983 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -944,6 +944,18 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static inline void try_start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + raw_spin_lock(&rt_b->rt_runtime_lock); + if (!rt_b->rt_period_active) { + rt_b->rt_period_active = 1; + hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS_PINNED_HARD); + } + raw_spin_unlock(&rt_b->rt_runtime_lock); +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -1020,13 +1032,17 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + try_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } } @@ -2721,8 +2737,10 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { + raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock); def_rt_bandwidth.rt_runtime = global_rt_runtime(); def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); + raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock); } int sched_rt_handler(struct ctl_table *table, int write, void *buffer, -- Gitee From 0b15b21d90a7cc3dfdfb6239bdfbfa1f93624504 Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Tue, 30 Nov 2021 16:32:09 +0800 Subject: [PATCH 124/134] locks: Fix UBSAN undefined behaviour in flock64_to_posix_lock mainline inclusion from mainline-v5.11-rc1 commit 16238415eb9886328a89fe7a3cb0b88c7335fe16 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=16238415eb9886328a89fe7a3cb0b88c7335fe16 Signed-off-by: Yu Changchun ----------------------------------------------- When the sum of fl->fl_start and l->l_len overflows, UBSAN shows the following warning: UBSAN: Undefined behaviour in fs/locks.c:482:29 signed integer overflow: 2 + 9223372036854775806 cannot be represented in type 'long long int' Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xe4/0x14e lib/dump_stack.c:118 ubsan_epilogue+0xe/0x81 lib/ubsan.c:161 handle_overflow+0x193/0x1e2 lib/ubsan.c:192 flock64_to_posix_lock fs/locks.c:482 [inline] flock_to_posix_lock+0x595/0x690 fs/locks.c:515 fcntl_setlk+0xf3/0xa90 fs/locks.c:2262 do_fcntl+0x456/0xf60 fs/fcntl.c:387 __do_sys_fcntl fs/fcntl.c:483 [inline] __se_sys_fcntl fs/fcntl.c:468 [inline] __x64_sys_fcntl+0x12d/0x180 fs/fcntl.c:468 do_syscall_64+0xc8/0x5a0 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix it by parenthesizing 'l->l_len - 1'. 
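For clarity, the arithmetic with the values from the splat above (an illustrative aside, not part of the patch):

/*
 * fl_start == 2, l_len == 9223372036854775806 == LLONG_MAX - 1,
 * OFFSET_MAX == LLONG_MAX.
 *
 * Range check:  l_len - 1             == LLONG_MAX - 2
 *               OFFSET_MAX - fl_start == LLONG_MAX - 2
 *               -> not greater, so no -EOVERFLOW is returned.
 *
 * Old code: fl_start + l_len - 1 groups as (fl_start + l_len) - 1, and
 *           2 + (LLONG_MAX - 1) == LLONG_MAX + 1: signed overflow, UB.
 * New code: fl_start + (l_len - 1) == 2 + (LLONG_MAX - 2) == LLONG_MAX,
 *           which is representable, so the result is well defined.
 */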
Signed-off-by: Luo Meng Signed-off-by: Jeff Layton Signed-off-by: Luo Meng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/locks.c b/fs/locks.c index 32c948fe2944..6e118955b9b6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -542,7 +542,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, if (l->l_len > 0) { if (l->l_len - 1 > OFFSET_MAX - fl->fl_start) return -EOVERFLOW; - fl->fl_end = fl->fl_start + l->l_len - 1; + fl->fl_end = fl->fl_start + (l->l_len - 1); } else if (l->l_len < 0) { if (fl->fl_start + l->l_len < 0) -- Gitee From 6688d8b6e44745230d6dbfddea4cdc429be9c111 Mon Sep 17 00:00:00 2001 From: Huang Guobin Date: Tue, 30 Nov 2021 16:32:13 +0800 Subject: [PATCH 125/134] bonding: Fix a use-after-free problem when bond_sysfs_slave_add() failed mainline inclusion from mainline commit b93c6a911a3fe926b00add28f3b932007827c4ca category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b93c6a911a3fe926b00add28f3b932007827c4ca Signed-off-by: Yu Changchun ----------------------------- When I do fuzz test for bonding device interface, I got the following use-after-free Calltrace: ================================================================== BUG: KASAN: use-after-free in bond_enslave+0x1521/0x24f0 Read of size 8 at addr ffff88825bc11c00 by task ifenslave/7365 CPU: 5 PID: 7365 Comm: ifenslave Tainted: G E 5.15.0-rc1+ #13 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 Call Trace: dump_stack_lvl+0x6c/0x8b print_address_description.constprop.0+0x48/0x70 kasan_report.cold+0x82/0xdb __asan_load8+0x69/0x90 bond_enslave+0x1521/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f19159cf577 Code: b3 66 90 48 8b 05 11 89 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 78 RSP: 002b:00007ffeb3083c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffeb3084bca RCX: 00007f19159cf577 RDX: 00007ffeb3083ce0 RSI: 0000000000008990 RDI: 0000000000000003 RBP: 00007ffeb3084bc4 R08: 0000000000000040 R09: 0000000000000000 R10: 00007ffeb3084bc0 R11: 0000000000000246 R12: 00007ffeb3083ce0 R13: 0000000000000000 R14: 0000000000000000 R15: 00007ffeb3083cb0 Allocated by task 7365: kasan_save_stack+0x23/0x50 __kasan_kmalloc+0x83/0xa0 kmem_cache_alloc_trace+0x22e/0x470 bond_enslave+0x2e1/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 7365: kasan_save_stack+0x23/0x50 kasan_set_track+0x20/0x30 kasan_set_free_info+0x24/0x40 __kasan_slab_free+0xf2/0x130 kfree+0xd1/0x5c0 slave_kobj_release+0x61/0x90 kobject_put+0x102/0x180 bond_sysfs_slave_add+0x7a/0xa0 bond_enslave+0x11b6/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Last potentially related work creation: kasan_save_stack+0x23/0x50 kasan_record_aux_stack+0xb7/0xd0 
insert_work+0x43/0x190 __queue_work+0x2e3/0x970 delayed_work_timer_fn+0x3e/0x50 call_timer_fn+0x148/0x470 run_timer_softirq+0x8a8/0xc50 __do_softirq+0x107/0x55f Second to last potentially related work creation: kasan_save_stack+0x23/0x50 kasan_record_aux_stack+0xb7/0xd0 insert_work+0x43/0x190 __queue_work+0x2e3/0x970 __queue_delayed_work+0x130/0x180 queue_delayed_work_on+0xa7/0xb0 bond_enslave+0xe25/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88825bc11c00 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 0 bytes inside of 1024-byte region [ffff88825bc11c00, ffff88825bc12000) The buggy address belongs to the page: page:ffffea00096f0400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x25bc10 head:ffffea00096f0400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0x57ff00000010200(slab|head|node=1|zone=2|lastcpupid=0x7ff) raw: 057ff00000010200 ffffea0009a71c08 ffff888240001968 ffff88810004dbc0 raw: 0000000000000000 00000000000a000a 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88825bc11b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88825bc11b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88825bc11c00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88825bc11c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88825bc11d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Putting new_slave in bond_sysfs_slave_add() will cause use-after-free problems when new_slave is accessed in the subsequent error handling process. Since new_slave will be put in that subsequent error handling anyway, remove the unnecessary put to fix it. In addition, when sysfs_create_file() fails, if some files have been created successfully, we need to call sysfs_remove_file() to remove them. Since sysfs_create_files() & sysfs_remove_files() are available, use these two functions instead. Fixes: 7afcaec49696 (bonding: use kobject_put instead of _del after kobject_add) Signed-off-by: Huang Guobin Reviewed-by: Jakub Kicinski Signed-off-by: David S.
Miller Signed-off-by: Huang Guobin Reviewed-by: Wei Yongjun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/net/bonding/bond_sysfs_slave.c | 36 ++++++++------------------ 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/drivers/net/bonding/bond_sysfs_slave.c b/drivers/net/bonding/bond_sysfs_slave.c index fd07561da034..6a6cdd0bb258 100644 --- a/drivers/net/bonding/bond_sysfs_slave.c +++ b/drivers/net/bonding/bond_sysfs_slave.c @@ -108,15 +108,15 @@ static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) } static SLAVE_ATTR_RO(ad_partner_oper_port_state); -static const struct slave_attribute *slave_attrs[] = { - &slave_attr_state, - &slave_attr_mii_status, - &slave_attr_link_failure_count, - &slave_attr_perm_hwaddr, - &slave_attr_queue_id, - &slave_attr_ad_aggregator_id, - &slave_attr_ad_actor_oper_port_state, - &slave_attr_ad_partner_oper_port_state, +static const struct attribute *slave_attrs[] = { + &slave_attr_state.attr, + &slave_attr_mii_status.attr, + &slave_attr_link_failure_count.attr, + &slave_attr_perm_hwaddr.attr, + &slave_attr_queue_id.attr, + &slave_attr_ad_aggregator_id.attr, + &slave_attr_ad_actor_oper_port_state.attr, + &slave_attr_ad_partner_oper_port_state.attr, NULL }; @@ -137,24 +137,10 @@ const struct sysfs_ops slave_sysfs_ops = { int bond_sysfs_slave_add(struct slave *slave) { - const struct slave_attribute **a; - int err; - - for (a = slave_attrs; *a; ++a) { - err = sysfs_create_file(&slave->kobj, &((*a)->attr)); - if (err) { - kobject_put(&slave->kobj); - return err; - } - } - - return 0; + return sysfs_create_files(&slave->kobj, slave_attrs); } void bond_sysfs_slave_del(struct slave *slave) { - const struct slave_attribute **a; - - for (a = slave_attrs; *a; ++a) - sysfs_remove_file(&slave->kobj, &((*a)->attr)); + sysfs_remove_files(&slave->kobj, slave_attrs); } -- Gitee From 95e488ccf33b95419dc963e105feee06a2b44ea8 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 30 Nov 2021 16:32:15 +0800 Subject: [PATCH 126/134] block: avoid quiesce while elevator init maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- As blk_mq_quiesce_queue() in elevator_init_mq() waits for an RCU grace period to make sure no I/O happens while blk_mq_init_sched() runs, a system with lots of devices boots slowly. To address this issue, follow Ming Lei's suggestion: "We are called before adding disk, when there isn't any FS I/O, so freezing queue plus canceling dispatch work is enough to drain any dispatch activities originated from passthrough requests, then no need to quiesce queue which may add long boot latency, especially when lots of disks are involved."
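The before/after sequence, sketched (editorial comments; see the diff below):

/* before: one RCU grace-period wait per request queue */
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
err = blk_mq_init_sched(q, e);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);

/* after: just flush the requeue and run_work items, no RCU wait */
blk_mq_freeze_queue(q);
blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
blk_mq_unfreeze_queue(q);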
Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-mq.c | 13 +++++++++++++ block/blk-mq.h | 1 + block/elevator.c | 10 ++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1af4f9bbc0fe..a9aac1dd0f89 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3992,6 +3992,19 @@ unsigned int blk_mq_rq_cpu(struct request *rq) } EXPORT_SYMBOL(blk_mq_rq_cpu); +void blk_mq_cancel_work_sync(struct request_queue *q) +{ + if (queue_is_mq(q)) { + struct blk_mq_hw_ctx *hctx; + int i; + + cancel_delayed_work_sync(&q->requeue_work); + + queue_for_each_hw_ctx(q, hctx, i) + cancel_delayed_work_sync(&hctx->run_work); + } +} + static int __init blk_mq_init(void) { int i; diff --git a/block/blk-mq.h b/block/blk-mq.h index f792a0920ebb..6f87c0681443 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -129,6 +129,7 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); +void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, diff --git a/block/elevator.c b/block/elevator.c index 4ce6b22813a1..27eb70ec277a 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -684,12 +684,18 @@ void elevator_init_mq(struct request_queue *q) if (!e) return; + /* + * We are called before adding disk, when there isn't any FS I/O, + * so freezing queue plus canceling dispatch work is enough to + * drain any dispatch activities originated from passthrough + * requests, then no need to quiesce queue which may add long boot + * latency, especially when lots of disks are involved. + */ blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); + blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); - blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); if (err) { -- Gitee From 2928914245903fffb12db115de3e46186b0757ee Mon Sep 17 00:00:00 2001 From: Lakshmi Ramasubramanian Date: Tue, 30 Nov 2021 16:32:58 +0800 Subject: [PATCH 127/134] ima: Fix warning: no previous prototype for function 'ima_add_kexec_buffer' mainline inclusion from mainline-5.14 commit: c67913492fec317bc53ffdff496b6ba856d2868c category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c67913492fec317bc53ffdff496b6ba856d2868c Signed-off-by: Yu Changchun --------------------------- The function prototype for ima_add_kexec_buffer() is present in 'linux/ima.h'. But this header file is not included in ima_kexec.c where the function is implemented. This results in the following compiler warning when "-Wmissing-prototypes" flag is turned on: security/integrity/ima/ima_kexec.c:81:6: warning: no previous prototype for function 'ima_add_kexec_buffer' [-Wmissing-prototypes] Include the header file 'linux/ima.h' in ima_kexec.c to fix the compiler warning. 
Fixes: dce92f6b11c3 (arm64: Enable passing IMA log to next kernel on kexec) Reported-by: kernel test robot Signed-off-by: Lakshmi Ramasubramanian Acked-by: Rob Herring Signed-off-by: Mimi Zohar Signed-off-by: Guo Zihua Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- security/integrity/ima/ima_kexec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index e29bea3dd4cc..a5ded568cc16 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -10,6 +10,7 @@ #include #include #include +#include <linux/ima.h> #include "ima.h" #ifdef CONFIG_IMA_KEXEC -- Gitee From 40e990133f1a49d34a8e3b2716ac9c69a732ddce Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 30 Nov 2021 16:32:59 +0800 Subject: [PATCH 128/134] ext4: fix e2fsprogs checksum failure for mounted filesystem mainline inclusion from mainline-v5.15 commit b2bbb92f7042e8075fb036bf97043339576330c3 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b2bbb92f7042e8075fb036bf97043339576330c3 Signed-off-by: Yu Changchun ------------------------------------------------- Commit 81414b4dd48 ("ext4: remove redundant sb checksum recomputation") removed checksum recalculation after updating superblock free space / inode counters in ext4_fill_super() based on the fact that we will recalculate the checksum on superblock writeout. That is a correct assumption, but until the writeout happens (which can take a long time) the checksum is incorrect in the buffer cache, and programs such as tune2fs or resize2fs, if called shortly after a file system is mounted, can fail. So bring back the checksum recalculation and add a comment explaining why. Fixes: 81414b4dd48f ("ext4: remove redundant sb checksum recomputation") Cc: stable@kernel.org Reported-by: Boyang Xue Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210812124737.21981-1-jack@suse.cz Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16e228285967..68aa31dc30dd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5001,6 +5001,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } + /* + * Update the checksum after updating free space/inode + * counters. Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail.
+ */ + ext4_superblock_csum_set(sb); if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb), GFP_KERNEL); -- Gitee From cfe5a4b1a35d07048e3713784ed6cd2bb24d258f Mon Sep 17 00:00:00 2001 From: Kevin Locke Date: Tue, 30 Nov 2021 16:33:00 +0800 Subject: [PATCH 129/134] ovl: warn about orphan metacopy mainline inclusion from mainline-v5.11-rc1 commit 0a8d0b64dd6acfbc9e9b79022654bbe1ade4a29a category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0a8d0b64dd6acfbc9e9b79022654bbe1ade4a29a Signed-off-by: Yu Changchun ------------------------------------------------- When the lower file of a metacopy is inaccessible, -EIO is returned. For users not familiar with overlayfs internals, such as myself, the meaning of this error may not be apparent or easy to determine, since the (metacopy) file is present and open/stat succeed when accessed outside of the overlay. Add a rate-limited warning for orphan metacopy to give users a hint when investigating such errors. Link: https://lore.kernel.org/linux-unionfs/CAOQ4uxi23Zsmfb4rCed1n=On0NNA5KZD74jjjeyz+et32sk-gg@mail.gmail.com/ Signed-off-by: Kevin Locke Signed-off-by: Miklos Szeredi Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/overlayfs/namei.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 092812c2f118..b1c8a38573c9 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1004,6 +1004,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * Just make sure a corresponding data dentry has been found. */ if (d.metacopy || (uppermetacopy && !ctr)) { + pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", + dentry); err = -EIO; goto out_put; } else if (!d.is_dir && upperdentry && !ctr && origin_path) { -- Gitee From 8f3ccc8a7e79a5b98828991e4526631a4d9f31b2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Dec 2021 18:22:53 +0800 Subject: [PATCH 130/134] block: return errors from blk_execute_rq() mainline inclusion from mainline-v5.14-rc1 commit fb9b16e15cd70e21d8af7f03d700deb9509c2ce8 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------- The synchronous blk_execute_rq() had not provided a way for its callers to know if its request was successful or not. Return the blk_status_t result of the request. Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210610214437.641245-4-kbusch@kernel.org Signed-off-by: Jens Axboe conflict: 1. in blkdev.h and blk-exec, blk_execute_rq return value change; 2. 
input parameter in blk_execute_rq is not the same as mainline; Signed-off-by: zhangwensheng Reviewed-by: qiulaibin Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-exec.c | 7 +++++-- include/linux/blkdev.h | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/block/blk-exec.c b/block/blk-exec.c index 85324d53d072..b2676de4c6a5 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -21,7 +21,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) { struct completion *waiting = rq->end_io_data; - rq->end_io_data = NULL; + rq->end_io_data = (void *)(uintptr_t)error; /* * complete last, if this is a stack request the process (and thus @@ -75,8 +75,9 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. + * Return: The blk_status_t result provided to blk_mq_end_request(). */ -void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, +blk_status_t blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); @@ -91,5 +92,7 @@ void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); else wait_for_completion_io(&wait); + + return (blk_status_t)(uintptr_t)rq->end_io_data; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d6ae309a7dc3..31febd3e14de 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -951,10 +951,10 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern void blk_execute_rq(struct request_queue *, struct gendisk *, - struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +blk_status_t blk_execute_rq(struct request_queue *, struct gendisk *, + struct request *, int); /* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char *blk_op_str(unsigned int op); -- Gitee From 23a59bc599a3c7923bea4e2f82a4ff6e3ff8cf38 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 10 Dec 2021 18:22:51 +0800 Subject: [PATCH 131/134] bfq: Remove merged request already in bfq_requests_merged() mainline inclusion from mainline-v5.14-rc1 commit a921c655f2033dd1ce1379128efe881dda23ea37 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Currently, bfq does very little in bfq_requests_merged() and handles all the request cleanup in bfq_finish_requeue_request() called from blk_mq_free_request(). That is currently safe only because blk_mq_free_request() is called shortly after bfq_requests_merged() while bfqd->lock is still held. However to fix a lock inversion between bfqd->lock and ioc->lock, we need to call blk_mq_free_request() after dropping bfqd->lock. That would mean that already merged request could be seen by other processes inside bfq queues and possibly dispatched to the device which is wrong. So move cleanup of the request from bfq_finish_requeue_request() to bfq_requests_merged(). 
Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210623093634.27879-2-jack@suse.cz Signed-off-by: Jens Axboe conflict: in bfq_finish_requeue_request, code in hulk have the line atomic_dec(&rq->mq_hctx->elevator_queued); that is conflicted; Signed-off-by: zhangwensheng Reviewed-by: qiulaibin Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/bfq-iosched.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fd3c23d516b8..27e01b4cd528 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2326,7 +2326,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, *next_bfqq = bfq_init_rq(next); if (!bfqq) - return; + goto remove; /* * If next and rq belong to the same bfq_queue and next is older @@ -2349,6 +2349,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +remove: + /* Merged request may be in the IO scheduler. Remove it. */ + if (!RB_EMPTY_NODE(&next->rb_node)) { + bfq_remove_request(next->q, next); + if (next_bfqq) + bfqg_stats_update_io_remove(bfqq_group(next_bfqq), + next->cmd_flags); + } } /* Must be called with bfqq != NULL */ @@ -5901,6 +5909,7 @@ static void bfq_finish_requeue_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; + unsigned long flags; /* * rq either is not associated with any icq, or is an already @@ -5918,40 +5927,16 @@ static void bfq_finish_requeue_request(struct request *rq) rq->io_start_time_ns, rq->cmd_flags); + spin_lock_irqsave(&bfqd->lock, flags); if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - if (rq == bfqd->waited_rq) bfq_update_inject_limit(bfqd, bfqq); bfq_completed_request(bfqq, bfqd); - bfq_finish_requeue_request_body(bfqq); atomic_dec(&rq->mq_hctx->elevator_queued); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, - * in which case we need to remove it (this should - * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. - * This situation seems to occur only in process - * context, as a consequence of a merge. In the - * current version of the code, this implies that the - * lock is held. - */ - - if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } - bfq_finish_requeue_request_body(bfqq); } + bfq_finish_requeue_request_body(bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); /* * Reset private fields. 
In case of a requeue, this allows -- Gitee From 0cb8b002d2f4713c9bb0da10492f242238da0737 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 10 Dec 2021 18:22:52 +0800 Subject: [PATCH 132/134] blk: Fix lock inversion between ioc lock and bfqd lock mainline inclusion from mainline-v5.14-rc1 commit fd2ef39cc9a6b9c4c41864ac506906c52f94b06a category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ------------------------------ Lockdep complains about lock inversion between ioc->lock and bfqd->lock: bfqd -> ioc: put_io_context+0x33/0x90 -> ioc->lock grabbed blk_mq_free_request+0x51/0x140 blk_put_request+0xe/0x10 blk_attempt_req_merge+0x1d/0x30 elv_attempt_insert_merge+0x56/0xa0 blk_mq_sched_try_insert_merge+0x4b/0x60 bfq_insert_requests+0x9e/0x18c0 -> bfqd->lock grabbed blk_mq_sched_insert_requests+0xd6/0x2b0 blk_mq_flush_plug_list+0x154/0x280 blk_finish_plug+0x40/0x60 ext4_writepages+0x696/0x1320 do_writepages+0x1c/0x80 __filemap_fdatawrite_range+0xd7/0x120 sync_file_range+0xac/0xf0 ioc->bfqd: bfq_exit_icq+0xa3/0xe0 -> bfqd->lock grabbed put_io_context_active+0x78/0xb0 -> ioc->lock grabbed exit_io_context+0x48/0x50 do_exit+0x7e9/0xdd0 do_group_exit+0x54/0xc0 To avoid this inversion we change blk_mq_sched_try_insert_merge() to not free the merged request but rather leave that upto the caller similarly to blk_mq_sched_try_merge(). And in bfq_insert_requests() we make sure to free all the merged requests after dropping bfqd->lock. Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Reviewed-by: Ming Lei Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210623093634.27879-3-jack@suse.cz Signed-off-by: Jens Axboe Conflict: 1. mainline include/linux/elevator.h file change; 2. mainline block/mq-deadline-main.c file change, now change to block/mq-deadline.c; Signed-off-by: zhangwensheng Reviewed-by: qiulaibin Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/bfq-iosched.c | 6 ++++-- block/blk-merge.c | 19 ++++++++----------- block/blk-mq-sched.c | 5 +++-- block/blk-mq-sched.h | 3 ++- block/blk-mq.h | 11 +++++++++++ block/blk.h | 2 +- block/elevator.c | 11 ++++++++--- block/mq-deadline.c | 5 ++++- include/linux/elevator.h | 3 ++- 9 files changed, 43 insertions(+), 22 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 27e01b4cd528..aa1a808fa072 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2235,9 +2235,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irq(&bfqd->lock); if (free) blk_mq_free_request(free); - spin_unlock_irq(&bfqd->lock); return ret; } @@ -5508,14 +5508,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; + LIST_HEAD(free); #ifdef CONFIG_BFQ_GROUP_IOSCHED if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + blk_mq_free_requests(&free); return; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 26f4bcc10de9..6518e0ae2835 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -831,18 +831,15 @@ static struct request *attempt_front_merge(struct request_queue *q, return NULL; } -int blk_attempt_req_merge(struct 
request_queue *q, struct request *rq, - struct request *next) +/* + * Try to merge 'next' into 'rq'. Return true if the merge happened, false + * otherwise. The caller is responsible for freeing 'next' if the merge + * happened. + */ +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) { - struct request *free; - - free = attempt_merge(q, rq, next); - if (free) { - blk_put_request(free); - return 1; - } - - return 0; + return attempt_merge(q, rq, next); } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a3266541bd06..606bef13f1c2 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -381,9 +381,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, return ret; } -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { - return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 0476360f05f1..15f3d611db10 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -12,7 +12,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); diff --git a/block/blk-mq.h b/block/blk-mq.h index 6f87c0681443..7f3194657dff 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -283,6 +283,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, return NULL; } +/* Free all requests on the list */ +static inline void blk_mq_free_requests(struct list_head *list) +{ + while (!list_empty(list)) { + struct request *rq = list_entry_rq(list->next); + + list_del_init(&rq->queuelist); + blk_mq_free_request(rq); + } +} + /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. diff --git a/block/blk.h b/block/blk.h index ab99c522a31e..ca4de4985946 100644 --- a/block/blk.h +++ b/block/blk.h @@ -229,7 +229,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); diff --git a/block/elevator.c b/block/elevator.c index 27eb70ec277a..65dfc7559a36 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -353,9 +353,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. 
* - * Returns true if we merged, false otherwise + * Returns true if we merged, false otherwise. 'free' will contain all + * requests that need to be freed. */ -bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { struct request *__rq; bool ret; @@ -366,8 +368,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) /* * First try one-hit cache. */ - if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { + list_add(&rq->queuelist, free); return true; + } if (blk_queue_noxmerges(q)) return false; @@ -381,6 +385,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break; + list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index e4e90761eab3..43994cce1eb2 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -489,6 +489,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + LIST_HEAD(free); /* * This may be a requeue of a write request that has locked its * target zone. */ blk_req_zone_write_unlock(rq); - if (blk_mq_sched_try_insert_merge(q, rq)) + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + blk_mq_free_requests(&free); return; + } blk_mq_sched_request_inserted(rq); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index bc26b4e11f62..0bb7489e0cfb 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -117,7 +117,8 @@ extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); -extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, + struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -- Gitee From d3e8d841116fb67568ee253e400f0279730ce634 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Fri, 3 Dec 2021 18:15:21 +0800 Subject: [PATCH 133/134] ext4: stop IO for page without buffer_head maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- dio_bio_complete() will set a page dirty without considering whether the page still has valid buffer_heads attached. This triggers problems when ext4 tries to write the page back. For ext4, fix it by skipping writeback of pages without buffer_heads.
[1] https://lwn.net/Articles/774411/ : "DMA and get_user_pages()" [2] https://lwn.net/Articles/753027/ : "The Trouble with get_user_pages()" [3] https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc1d8e7cca2daa18 Signed-off-by: yangerkun Reviewed-by: zhangyi (F) Signed-off-by: Yang Yingliang Signed-off-by: Xie XiuQi Conflicts: fs/ext4/inode.c Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dd137a1c648f..ef102cca42dc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1937,6 +1937,20 @@ static int __ext4_journalled_writepage(struct page *page, return ret; } +static void cancel_page_dirty_status(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + cancel_dirty_page(page); + xa_lock_irqsave(&mapping->i_pages, flags); + __xa_clear_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); + __xa_clear_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_TOWRITE); + xa_unlock_irqrestore(&mapping->i_pages, flags); +} + /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't @@ -1995,6 +2009,12 @@ static int ext4_writepage(struct page *page, return -EIO; } + if (WARN_ON(!page_has_buffers(page))) { + cancel_page_dirty_status(page); + unlock_page(page); + return 0; + } + trace_ext4_writepage(page); size = i_size_read(inode); if (page->index == size >> PAGE_SHIFT && @@ -2603,6 +2623,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) continue; } + if (WARN_ON(!page_has_buffers(page))) { + cancel_page_dirty_status(page); + unlock_page(page); + continue; + } + wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); -- Gitee From c84465a6d4e831ee170978f2814e7bc42c2f472a Mon Sep 17 00:00:00 2001 From: yangerkun Date: Fri, 26 Nov 2021 16:27:16 +0800 Subject: [PATCH 134/134] hugetlbfs: avoid overflow in hugetlbfs_fallocate maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- luojiajun reported a problem[1] two years ago which seems to still exist in mainline. vfs_fallocate() can avoid an 'offset + len' overflow, but 'offset + len + hpage_size - 1' may overflow too and will lead to a wrong 'end'. luojiajun gave a solution which fixed the wrong 'end' but left the overflow itself in place. Fix it with DIV_ROUND_UP_ULL. [1] https://patchwork.kernel.org/project/linux-mm/patch/1554775226-67213-1-git-send-email-luojiajun3@huawei.com/ Signed-off-by: yangerkun Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5fc9ccab907c..c7dec890f9d9 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -655,7 +655,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as well as being converted to page offsets. */ start = offset >> hpage_shift; - end = (offset + len + hpage_size - 1) >> hpage_shift; + end = DIV_ROUND_UP_ULL(offset + len, hpage_size); inode_lock(inode); -- Gitee
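A worked example of the arithmetic (illustrative values, not part of the patch):

/*
 * Assume hpage_size == 2 MiB (hpage_shift == 21) and offset + len ==
 * LLONG_MAX, which vfs_fallocate() accepts since offset + len itself
 * does not overflow loff_t.
 *
 * Old: (offset + len + hpage_size - 1) >> hpage_shift adds 0x1fffff to
 *      LLONG_MAX in signed arithmetic: overflow, undefined behaviour,
 *      and in practice a negative sum and a bogus 'end'.
 *
 * New: DIV_ROUND_UP_ULL(offset + len, hpage_size) widens to unsigned
 *      long long before adding hpage_size - 1:
 *      0x7fffffffffffffff + 0x1fffff == 0x80000000001ffffe (no 64-bit
 *      wrap), and dividing by 0x200000 gives 2^42, the correct
 *      rounded-up huge-page index.
 */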