From 8af4501bfda56bb325a3222df0c1a50c347f64d6 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:03:59 +0800 Subject: [PATCH 001/134] ext4: move inode eio simulation behind io completion mainline inclusion from mainline-5.15-rc1 commit 0904c9ae3465c7acc066a564a76b75c0af83e6c7 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0904c9ae3465c7acc066a564a76b75c0af83e6c7 Signed-off-by: Yu Changchun --------------------------- No EIO simulation is required if the buffer is uptodate, so move the simulation behind read bio completion, just like the inode/block bitmap simulation does. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210826130412.3921207-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 05ab7b65ea52..9fa1ed48b6b2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4341,8 +4341,6 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; - if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) - goto simulate_eio; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -4429,8 +4427,8 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); blk_finish_plug(&plug); wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { - simulate_eio: if (ret_block) *ret_block = block; brelse(bh); -- Gitee From bde0f2d3354a044f4aa7bbe5506566679dcd9054 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:04:00 +0800 Subject: [PATCH 002/134] ext4: make the updating inode data procedure atomic mainline inclusion from mainline-5.15-rc1 commit baaae979b112642a41b71c71c599d875c067d257 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=baaae979b112642a41b71c71c599d875c067d257 Signed-off-by: Yu Changchun --------------------------- Currently, ext4_do_update_inode() returns an error before filling in the rest of the inode data if we fail to set the inode blocks in ext4_inode_blocks_set(). In theory this error should never happen, since sb->s_maxbytes should not have allowed it; we already initialize sb->s_maxbytes according to the huge_file feature in ext4_fill_super(). So even though it could only happen due to filesystem corruption, we'd better return only after we finish updating the inode, because otherwise we may leave an uninitialized buffer that could be read later in "errors=continue" mode. This patch makes the inode-updating procedure atomic: it calls EXT4_ERROR_INODE() after dropping i_raw_lock when something bad has happened, makes sure the on-disk inode is consistent, and also drops a BUG_ON and does some small cleanups.
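For illustration, here is a minimal user-space sketch of the locking pattern adopted above (an editorial example with hypothetical names, not the kernel code): the whole raw copy is filled under the lock even when a bad field is found, the error is only remembered, and it is reported after the lock is dropped, so a reader can never observe a half-initialized buffer.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct raw_record {
	char payload[32];
	unsigned int checksum;
};

static pthread_spinlock_t raw_lock;

/* Fill every field even if the source is bad, so the record is always
 * internally consistent; just remember that an error happened. */
static int fill_raw_record(struct raw_record *dst, const char *src)
{
	int err = 0;

	if (strlen(src) >= sizeof(dst->payload))
		err = -1;			/* remember, but keep going */
	snprintf(dst->payload, sizeof(dst->payload), "%s", src);
	dst->checksum = (unsigned int)strlen(dst->payload);
	return err;
}

static int update_record(struct raw_record *rec, const char *src)
{
	int err;

	pthread_spin_lock(&raw_lock);
	err = fill_raw_record(rec, src);	/* fully written either way */
	pthread_spin_unlock(&raw_lock);
	if (err)				/* report only after unlock */
		fprintf(stderr, "update_record: corrupted input\n");
	return err;
}

int main(void)
{
	struct raw_record rec;

	pthread_spin_init(&raw_lock, PTHREAD_PROCESS_PRIVATE);
	return update_record(&rec, "a string far too long for the payload field") ? 1 : 0;
}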
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210826130412.3921207-4-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9fa1ed48b6b2..c5e0bd992b17 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4937,8 +4937,14 @@ static int ext4_inode_blocks_set(handle_t *handle, ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } + + /* + * This should never happen since sb->s_maxbytes should not have + * allowed this, sb->s_maxbytes was set according to the huge_file + * feature in ext4_fill_super(). + */ if (!ext4_has_feature_huge_file(sb)) - return -EFBIG; + return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* @@ -5045,16 +5051,14 @@ static int ext4_do_update_inode(handle_t *handle, spin_lock(&ei->i_raw_lock); - /* For fields not tracked in the in-memory inode, - * initialise them to zero for new inodes. */ + /* + * For fields not tracked in the in-memory inode, initialise them + * to zero for new inodes. + */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); err = ext4_inode_blocks_set(handle, raw_inode, ei); - if (err) { - spin_unlock(&ei->i_raw_lock); - goto out_brelse; - } raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); @@ -5063,10 +5067,11 @@ static int ext4_do_update_inode(handle_t *handle, if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); -/* - * Fix up interoperability with old kernels. Otherwise, old inodes get - * re-used with the upper 16 bits of the uid/gid intact - */ + /* + * Fix up interoperability with old kernels. Otherwise, + * old inodes get re-used with the upper 16 bits of the + * uid/gid intact. 
+ */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); @@ -5135,8 +5140,9 @@ static int ext4_do_update_inode(handle_t *handle, } } - BUG_ON(!ext4_has_feature_project(inode->i_sb) && - i_projid != EXT4_DEF_PROJID); + if (i_projid != EXT4_DEF_PROJID && + !ext4_has_feature_project(inode->i_sb)) + err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) @@ -5144,6 +5150,11 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); + if (err) { + EXT4_ERROR_INODE(inode, "corrupted inode contents"); + goto out_brelse; + } + if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); @@ -5151,13 +5162,13 @@ static int ext4_do_update_inode(handle_t *handle, BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) - goto out_brelse; + goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) - goto out_brelse; + goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); @@ -5167,9 +5178,10 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); +out_error: + ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); - ext4_std_error(inode->i_sb, err); return err; } -- Gitee From a9adbf4104f9d01d90c6e0e2652c3e3f7c957caf Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:04:01 +0800 Subject: [PATCH 003/134] ext4: factor out ext4_fill_raw_inode() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Factor out ext4_fill_raw_inode() from ext4_do_update_inode(), which is used to fill the in-mem inode contents into the inode table buffer, in preparation for initializing the exclusive inode buffer without reading the block in __ext4_get_inode_loc(). Signed-off-by: Zhang Yi Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 85 +++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c5e0bd992b17..cbc811ec1ef9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4919,9 +4919,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, return ERR_PTR(ret); } -static int ext4_inode_blocks_set(handle_t *handle, - struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) +static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); @@ -5028,37 +5027,16 @@ static void ext4_update_other_inodes_time(struct super_block *sb, rcu_read_unlock(); } -/* - * Post the struct inode info into an on-disk inode location in the - * buffer-cache. This gobbles the caller's reference to the - * buffer_head in the inode location struct. - * - * The caller must have write access to iloc->bh.
- */ -static int ext4_do_update_inode(handle_t *handle, - struct inode *inode, - struct ext4_iloc *iloc) +static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { - struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); - struct buffer_head *bh = iloc->bh; - struct super_block *sb = inode->i_sb; - int err = 0, block; - int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; projid_t i_projid; + int block; + int err; - spin_lock(&ei->i_raw_lock); - - /* - * For fields not tracked in the in-memory inode, initialise them - * to zero for new inodes. - */ - if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) - memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - - err = ext4_inode_blocks_set(handle, raw_inode, ei); + err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); @@ -5100,16 +5078,8 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) { - ext4_isize_set(raw_inode, ei->i_disksize); - need_datasync = 1; - } - if (ei->i_disksize > 0x7fffffffULL) { - if (!ext4_has_feature_large_file(sb) || - EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) - set_large_file = 1; - } + ext4_isize_set(raw_inode, ei->i_disksize); + raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { @@ -5149,6 +5119,45 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); + return err; +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; + int err; + int need_datasync = 0, set_large_file = 0; + + spin_lock(&ei->i_raw_lock); + + /* + * For fields not tracked in the in-memory inode, initialise them + * to zero for new inodes. + */ + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + + if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) + need_datasync = 1; + if (ei->i_disksize > 0x7fffffffULL) { + if (!ext4_has_feature_large_file(sb) || + EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; + } + + err = ext4_fill_raw_inode(inode, raw_inode); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); -- Gitee From 321e36ab1c002398f545cff00bb9e572afa52366 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:04:02 +0800 Subject: [PATCH 004/134] ext4: move ext4_fill_raw_inode() related functions maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- In preparation for calling ext4_fill_raw_inode() in __ext4_get_inode_loc(), move three related functions before __ext4_get_inode_loc(), no logical change. 
Signed-off-by: Zhang Yi Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 293 ++++++++++++++++++++++++------------------------ 1 file changed, 147 insertions(+), 146 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbc811ec1ef9..1eefbc464964 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4303,6 +4303,153 @@ int ext4_truncate(struct inode *inode) return err; } +static inline u64 ext4_inode_peek_iversion(const struct inode *inode) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return inode_peek_iversion_raw(inode); + else + return inode_peek_iversion(inode); +} + +static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + struct inode *inode = &(ei->vfs_inode); + u64 i_blocks = READ_ONCE(inode->i_blocks); + struct super_block *sb = inode->i_sb; + + if (i_blocks <= ~0U) { + /* + * i_blocks can be represented in a 32 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = 0; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + return 0; + } + + /* + * This should never happen since sb->s_maxbytes should not have + * allowed this, sb->s_maxbytes was set according to the huge_file + * feature in ext4_fill_super(). + */ + if (!ext4_has_feature_huge_file(sb)) + return -EFSCORRUPTED; + + if (i_blocks <= 0xffffffffffffULL) { + /* + * i_blocks can be represented in a 48 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + } else { + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); + /* i_block is stored in file system block size */ + i_blocks = i_blocks >> (inode->i_blkbits - 9); + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + } + return 0; +} + +static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + uid_t i_uid; + gid_t i_gid; + projid_t i_projid; + int block; + int err; + + err = ext4_inode_blocks_set(raw_inode, ei); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); + i_projid = from_kprojid(&init_user_ns, ei->i_projid); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); + /* + * Fix up interoperability with old kernels. Otherwise, + * old inodes get re-used with the upper 16 bits of the + * uid/gid intact. 
+ */ + if (ei->i_dtime && list_empty(&ei->i_orphan)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(i_gid)); + } + } else { + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); + ext4_isize_set(raw_inode, ei->i_disksize); + + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else if (!ext4_has_inline_data(inode)) { + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + } + + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + u64 ivers = ext4_inode_peek_iversion(inode); + + raw_inode->i_disk_version = cpu_to_le32(ivers); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(ivers >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } + } + + if (i_projid != EXT4_DEF_PROJID && + !ext4_has_feature_project(inode->i_sb)) + err = err ?: -EFSCORRUPTED; + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + + ext4_inode_csum_set(inode, raw_inode, ei); + return err; +} + /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. 
If 'in_mem' is true, we have all @@ -4597,13 +4744,6 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) else inode_set_iversion_queried(inode, val); } -static inline u64 ext4_inode_peek_iversion(const struct inode *inode) -{ - if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return inode_peek_iversion_raw(inode); - else - return inode_peek_iversion(inode); -} struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, @@ -4919,50 +5059,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, return ERR_PTR(ret); } -static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) -{ - struct inode *inode = &(ei->vfs_inode); - u64 i_blocks = READ_ONCE(inode->i_blocks); - struct super_block *sb = inode->i_sb; - - if (i_blocks <= ~0U) { - /* - * i_blocks can be represented in a 32 bit variable - * as multiple of 512 bytes - */ - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = 0; - ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); - return 0; - } - - /* - * This should never happen since sb->s_maxbytes should not have - * allowed this, sb->s_maxbytes was set according to the huge_file - * feature in ext4_fill_super(). - */ - if (!ext4_has_feature_huge_file(sb)) - return -EFSCORRUPTED; - - if (i_blocks <= 0xffffffffffffULL) { - /* - * i_blocks can be represented in a 48 bit variable - * as multiple of 512 bytes - */ - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); - } else { - ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); - /* i_block is stored in file system block size */ - i_blocks = i_blocks >> (inode->i_blkbits - 9); - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - } - return 0; -} - static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, @@ -5027,101 +5123,6 @@ static void ext4_update_other_inodes_time(struct super_block *sb, rcu_read_unlock(); } -static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - uid_t i_uid; - gid_t i_gid; - projid_t i_projid; - int block; - int err; - - err = ext4_inode_blocks_set(raw_inode, ei); - - raw_inode->i_mode = cpu_to_le16(inode->i_mode); - i_uid = i_uid_read(inode); - i_gid = i_gid_read(inode); - i_projid = from_kprojid(&init_user_ns, ei->i_projid); - if (!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); - /* - * Fix up interoperability with old kernels. Otherwise, - * old inodes get re-used with the upper 16 bits of the - * uid/gid intact. 
- */ - if (ei->i_dtime && list_empty(&ei->i_orphan)) { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } else { - raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(i_uid)); - raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(i_gid)); - } - } else { - raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); - raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } - raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); - EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) - raw_inode->i_file_acl_high = - cpu_to_le16(ei->i_file_acl >> 32); - raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - ext4_isize_set(raw_inode, ei->i_disksize); - - raw_inode->i_generation = cpu_to_le32(inode->i_generation); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - raw_inode->i_block[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - raw_inode->i_block[1] = 0; - } else { - raw_inode->i_block[0] = 0; - raw_inode->i_block[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - raw_inode->i_block[2] = 0; - } - } else if (!ext4_has_inline_data(inode)) { - for (block = 0; block < EXT4_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - } - - if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { - u64 ivers = ext4_inode_peek_iversion(inode); - - raw_inode->i_disk_version = cpu_to_le32(ivers); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(ivers >> 32); - raw_inode->i_extra_isize = - cpu_to_le16(ei->i_extra_isize); - } - } - - if (i_projid != EXT4_DEF_PROJID && - !ext4_has_feature_project(inode->i_sb)) - err = err ?: -EFSCORRUPTED; - - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) - raw_inode->i_projid = cpu_to_le32(i_projid); - - ext4_inode_csum_set(inode, raw_inode, ei); - return err; -} - /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the -- Gitee From c5dcb0a6f485a315337721e85e8dc321adac2588 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 19 Oct 2021 18:04:03 +0800 Subject: [PATCH 005/134] ext4: prevent getting empty inode buffer maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- In ext4_get_inode_loc(), we may skip the read I/O and get a zeroed-out yet uptodate inode buffer when the inode monopolizes an inode block, for performance reasons. In most cases, ext4_mark_iloc_dirty() will fill the inode buffer afterwards and make it fine, but we could miss this call if something bad happens in between. Then __ext4_get_inode_loc_noinmem() may get an empty inode buffer and trigger an ext4 error. For example, if we remove a nonexistent xattr on inode A, ext4_xattr_set_handle() will return ENODATA before invoking ext4_mark_iloc_dirty(), leaving an uptodate but zeroed buffer. We will then get a checksum error message in ext4_iget() when getting the inode again:
EXT4-fs error (device sda): ext4_lookup:1784: inode #131074: comm cat: iget: checksum invalid Even worse, if we allocate another inode B in the same inode block, writing back inode B will corrupt inode A on disk. So this patch initializes the inode buffer by filling in the in-memory inode contents whenever we skip the read I/O, ensuring that the buffer is really uptodate. Signed-off-by: Zhang Yi Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1eefbc464964..16bc65609496 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4452,12 +4452,12 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode /* * ext4_get_inode_loc returns with an extra refcount against the inode's - * underlying buffer_head on success. If 'in_mem' is true, we have all - * data in memory that is needed to recreate the on-disk version of this - * inode. + * underlying buffer_head on success. If we pass 'inode' and it does not + * have in-inode xattr, we have all inode data in memory that is needed + * to recreate the on-disk version of this inode. */ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, - struct ext4_iloc *iloc, int in_mem, + struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; @@ -4502,7 +4502,7 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, * is the only valid inode in the block, we need not read the * block. */ - if (in_mem) { + if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; @@ -4530,8 +4530,13 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, } brelse(bitmap_bh); if (i == start + inodes_per_block) { + struct ext4_inode *raw_inode = + (struct ext4_inode *) (bh->b_data + iloc->offset); + /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); + if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) + ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; @@ -4593,7 +4598,7 @@ static int __ext4_get_inode_loc_noinmem(struct inode *inode, ext4_fsblk_t err_blk; int ret; - ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0, + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) @@ -4608,9 +4613,8 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) ext4_fsblk_t err_blk; int ret; - /* We have all inode data except xattrs in memory here.
*/ - ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, - !ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk); + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, + &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, @@ -4623,7 +4627,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { - return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL); + return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) -- Gitee From d6aac746628fb1cfc72c0b95e39c3a87c5f8841c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 19 Oct 2021 18:04:04 +0800 Subject: [PATCH 006/134] blk-mq: fix divide by zero crash in tg_may_dispatch() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- If blk-throttle is enabled and I/O is issued before blk_throtl_register_queue() is done, a divide-by-zero crash will be triggered in tg_may_dispatch() because 'throtl_slice' is uninitialized. Thus introduce a new flag QUEUE_FLAG_THROTL_INIT_DONE. It will be set after blk_throtl_register_queue() is done, and will be checked before applying any config. Signed-off-by: Yu Kuai Reviewed-by: Hou Tao Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-sysfs.c | 7 +++++++ block/blk-throttle.c | 37 ++++++++++++++++++++++++++++++++++++- include/linux/blkdev.h | 1 + 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b513f1683af0..66765740902b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -910,6 +910,9 @@ int blk_register_queue(struct gendisk *disk) blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); + spin_lock_irq(&q->queue_lock); + blk_queue_flag_set(QUEUE_FLAG_THROTL_INIT_DONE, q); + spin_unlock_irq(&q->queue_lock); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&q->kobj, KOBJ_ADD); @@ -942,6 +945,10 @@ void blk_unregister_queue(struct gendisk *disk) if (!blk_queue_registered(q)) return; + spin_lock_irq(&q->queue_lock); + blk_queue_flag_clear(QUEUE_FLAG_THROTL_INIT_DONE, q); + spin_unlock_irq(&q->queue_lock); + /* * Since sysfs_remove_dir() prevents adding new directory entries * before removal of existing entries starts, protect against diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c53a254171a2..4087dcc13f7d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -11,6 +11,7 @@ #include #include #include +#include <linux/delay.h> #include "blk.h" #include "blk-cgroup-rwstat.h" @@ -1456,6 +1457,31 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) } } +static inline int throtl_check_init_done(struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_THROTL_INIT_DONE, &q->queue_flags)) + return 0; + + return blk_queue_dying(q) ? -ENODEV : -EBUSY; +} + +/* + * If throtl_check_init_done() return -EBUSY, we should retry after a short + * msleep(), since that throttle init will be completed in blk_register_queue() + * soon.
+ */ +static inline int throtl_restart_syscall_when_busy(int errno) +{ + int ret = errno; + + if (ret == -EBUSY) { + msleep(10); + ret = restart_syscall(); + } + + return ret; +} + static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { @@ -1469,6 +1495,10 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, if (ret) return ret; + ret = throtl_check_init_done(ctx.disk->queue); + if (ret) + goto out_finish; + ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; @@ -1486,6 +1516,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, ret = 0; out_finish: blkg_conf_finish(&ctx); + ret = throtl_restart_syscall_when_busy(ret); return ret ?: nbytes; } @@ -1661,8 +1692,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, if (ret) return ret; - tg = blkg_to_tg(ctx.blkg); + ret = throtl_check_init_done(ctx.disk->queue); + if (ret) + goto out_finish; + tg = blkg_to_tg(ctx.blkg); v[0] = tg->bps_conf[READ][index]; v[1] = tg->bps_conf[WRITE][index]; v[2] = tg->iops_conf[READ][index]; @@ -1758,6 +1792,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, ret = 0; out_finish: blkg_conf_finish(&ctx); + ret = throtl_restart_syscall_when_busy(ret); return ret ?: nbytes; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8aae375864b6..7517381b5e15 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -593,6 +593,7 @@ struct request_queue { /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ +#define QUEUE_FLAG_THROTL_INIT_DONE 2 /* io throttle can be online */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -- Gitee From 7817c5defcc35d05ea67a4eb1dbd993f7dca7923 Mon Sep 17 00:00:00 2001 From: Yutian Yang Date: Tue, 19 Oct 2021 18:04:06 +0800 Subject: [PATCH 007/134] memcg: charge fs_context and legacy_fs_context mainline inclusion from mainline-v5.15-rc1 commit bb902cb47cf93b33cd92b3b7a4019330a03ef57f issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bb902cb47cf93b33cd92b3b7a4019330a03ef57f Signed-off-by: Yu Changchun -------------------------------- This patch adds accounting flags to the fs_context and legacy_fs_context allocation sites so that the kernel can correctly charge these objects. We have written a PoC to demonstrate the effect of the missing-charging bugs. The PoC takes around 1,200MB of unaccounted memory, while it is charged for only 362MB of memory usage. We evaluate the PoC on QEMU x86_64 v5.2.90 + Linux kernel v5.10.19 + Debian buster. All the limits, including ulimits and sysctl variables, are left at their defaults. Specifically, the hard NOFILE limit and nr_open in sysctl are both 1,048,576.
/*------------------------- POC code ----------------------------*/ #define _GNU_SOURCE /* the PoC's original include/define block was lost in extraction; a standard reconstruction follows, stack size assumed */ #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <signal.h> #include <sched.h> #include <sys/syscall.h> #include <linux/mount.h> #define STACK_SIZE (1024 * 1024) #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \ } while (0) static inline int fsopen(const char *fs_name, unsigned int flags) { return syscall(__NR_fsopen, fs_name, flags); } static char thread_stack[512][STACK_SIZE]; int thread_fn(void* arg) { for (int i = 0; i < 800000; ++i) { int fsfd = fsopen("nfs", FSOPEN_CLOEXEC); if (fsfd == -1) { errExit("fsopen"); } } while(1); return 0; } int main(int argc, char *argv[]) { int thread_pid; for (int i = 0; i < 1; ++i) { thread_pid = clone(thread_fn, thread_stack[i] + STACK_SIZE, \ SIGCHLD, NULL); } while(1); return 0; } /*-------------------------- end --------------------------------*/ Link: https://lkml.kernel.org/r/1626517201-24086-1-git-send-email-nglaive@gmail.com Signed-off-by: Yutian Yang Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/fs_context.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index 2834d1afa6e8..4858645ca620 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -231,7 +231,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct fs_context *fc; int ret = -ENOMEM; - fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); @@ -631,7 +631,7 @@ const struct fs_context_operations legacy_fs_context_ops = { */ static int legacy_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); if (!fc->fs_private) return -ENOMEM; fc->ops = &legacy_fs_context_ops; -- Gitee From 0e98823f9a01bca481b6900ff921da071931a159 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:07 +0800 Subject: [PATCH 008/134] memcg: enable accounting for mnt_cache entries mainline inclusion from mainline-v5.15-rc1 commit 79f6540ba88dfb383ecf057a3425e668105ca774 issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=79f6540ba88dfb383ecf057a3425e668105ca774 Signed-off-by: Yu Changchun -------------------------------- Patch series "memcg accounting from OpenVZ", v7. OpenVZ has used memory accounting for 20+ years, since the v2.2.x linux kernels. Initially we used our own accounting subsystem, then partially committed it to upstream, and a few years ago switched to cgroups v1. Now we're rebasing again, revising our old patches and trying to push them upstream. We try to protect the host system from any misuse of kernel memory allocation triggered by untrusted users inside the containers.
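For illustration, the whole series boils down to two idioms, shown together in the minimal module-style sketch below (an editorial example with hypothetical "demo" names, not part of the patch-set): pass SLAB_ACCOUNT when creating a slab cache, and use GFP_KERNEL_ACCOUNT instead of GFP_KERNEL for direct allocations; either way the memory is charged to the allocating task's memory cgroup.

#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cachep;

static int __init demo_init(void)
{
	void *obj;

	/* objects from this cache are billed to the caller's memcg */
	demo_cachep = kmem_cache_create("demo_cache", 64, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
	if (!demo_cachep)
		return -ENOMEM;

	/* one-off allocations use GFP_KERNEL_ACCOUNT instead */
	obj = kzalloc(128, GFP_KERNEL_ACCOUNT);
	if (!obj) {
		kmem_cache_destroy(demo_cachep);
		return -ENOMEM;
	}
	kfree(obj);
	return 0;
}

static void __exit demo_exit(void)
{
	kmem_cache_destroy(demo_cachep);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");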
Patch-set is addressed mostly to cgroups maintainers and the cgroups@ mailing list, though I would be very grateful for any comments from maintainers of affected subsystems or other people added in cc:

Compared to the upstream, we additionally account for the following kernel objects:
- network devices and their Tx/Rx queues
- ipv4/v6 addresses and routing-related objects
- inet_bind_bucket cache objects
- VLAN group arrays
- ipv6/sit: ip_tunnel_prl
- scm_fp_list objects used by SCM_RIGHTS messages of Unix sockets
- nsproxy and namespace objects themselves
- IPC objects: semaphores, message queues and shared memory segments
- mounts
- pollfd and select bits arrays
- signals and posix timers
- file locks
- fasync_struct used by the file lease code and driver's fasync queues
- tty objects
- per-mm LDT

We have incorrect/incomplete/obsolete accounting for a few other kernel objects: sk_filter, af_packets, netlink and xt_counters for iptables. They require rework and will probably be dropped altogether. We are also going to add accounting for nft; however, it is not ready yet. We have not tested performance on upstream; however, our performance team compared our current RHEL7-based production kernel against the corresponding original RHEL7 kernel and reports that it performs at least no worse. This patch (of 10): The kernel allocates ~400 bytes of 'struct mount' for any new mount. Creating a new mount namespace clones most of the parent mounts, and this can be repeated many times. Additionally, each mount allocates up to PATH_MAX=4096 bytes for mnt->mnt_devname. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/045db11f-4a45-7c9b-2664-5b32c2b44943@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Acked-by: Christian Brauner Cc: Tejun Heo Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Yutian Yang Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J.
Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Serge Hallyn Cc: Thomas Gleixner Cc: Zefan Li Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/namespace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 046b084136c5..7f1f89db511f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -183,7 +183,8 @@ static struct mount *alloc_vfsmnt(const char *name) goto out_free_cache; if (name) { - mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup_const(name, + GFP_KERNEL_ACCOUNT); if (!mnt->mnt_devname) goto out_free_id; } @@ -3840,7 +3841,7 @@ void __init mnt_init(void) int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), -- Gitee From 047b4e2ffcfcbf652ef2f5ccb31a998b4b436d08 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:08 +0800 Subject: [PATCH 009/134] memcg: enable accounting for fasync_cache mainline inclusion from mainline-v5.15-rc1 commit 839d68206de869b8cb4272c5ea10da2ef7ec34cb issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=839d68206de869b8cb4272c5ea10da2ef7ec34cb Signed-off-by: Yu Changchun -------------------------------- fasync_struct is used by almost all character device drivers to set up the fasync queue, and for regular files by the file lease code. This structure is quite small but long-living and it can be assigned for any open file. It makes sense to account for its allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/1b408625-d71c-0b26-b0b6-9baf00f93e69@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. 
Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/fcntl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 71b43538fa44..3eebcbead8bf 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1042,7 +1042,8 @@ static int __init fcntl_init(void) __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", - sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + sizeof(struct fasync_struct), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } -- Gitee From b5ff04ca7493cdf85e18c13bc4a429ab6e5d88b2 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:09 +0800 Subject: [PATCH 010/134] memcg: enable accounting for new namesapces and struct nsproxy mainline inclusion from mainline-v5.15-rc1 commit 30acd0bdfb86548172168a0cc71d455944de0683 issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=30acd0bdfb86548172168a0cc71d455944de0683 Signed-off-by: Yu Changchun -------------------------------- Container admin can create new namespaces and force kernel to allocate up to several pages of memory for the namespaces and its associated structures. Net and uts namespaces have enabled accounting for such allocations. It makes sense to account for rest ones to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/5525bcbf-533e-da27-79b7-158686c64e13@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Serge Hallyn Acked-by: Christian Brauner Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. 
Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: kernel/time/namespace.c Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/namespace.c | 2 +- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 4 ++-- kernel/user_namespace.c | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 7f1f89db511f..6e76f2a72cfc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3283,7 +3283,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); diff --git a/ipc/namespace.c b/ipc/namespace.c index 24e7b45320f7..c94c05846141 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -42,7 +42,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 812a61afd538..12c5110466bc 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) struct cgroup_namespace *new_ns; int ret; - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 12dd41b39a7f..3d5a5faf91b5 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -573,6 +573,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) int __init nsproxy_cache_init(void) { - nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); + nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT); return 0; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ef8733e2a476..52c017feabcb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -457,7 +457,7 @@ const struct proc_ns_operations pidns_for_children_operations = { static __init int pid_namespaces_init(void) { - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index afc65e6be33e..00c20f7fdc02 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL); + ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; kref_init(&ns->kref); - ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; diff --git 
a/kernel/user_namespace.c b/kernel/user_namespace.c index ce396ea4de60..2c15bf6680c3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1378,7 +1378,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { - user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); -- Gitee From 4361a3fb6e8229294b1fd2febb195cecb4afff07 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:10 +0800 Subject: [PATCH 011/134] memcg: enable accounting for signals mainline inclusion from mainline-v5.15-rc1 commit 5f58c39819ff78ca5ddbba2b3cd8ff4779b19bb5 issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5f58c39819ff78ca5ddbba2b3cd8ff4779b19bb5 Signed-off-by: Yu Changchun -------------------------------- When a user sends a signal to another process, it forces the kernel to allocate memory for 'struct sigqueue' objects. The number of signals is limited by the RLIMIT_SIGPENDING resource limit, but even the default settings allow each user to consume up to several megabytes of memory. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/e34e958c-e785-712e-a62a-2c7b66c646c7@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index ef8f2a28d37c..b125bc896138 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4595,7 +4595,7 @@ void __init signals_init(void) { siginfo_buildtime_checks(); - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB -- Gitee From 2e9dc96e1251320298b4e9d85b17e87268d6b492 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:11 +0800 Subject: [PATCH 012/134] memcg: enable accounting for posix_timers_cache slab mainline inclusion from mainline-v5.15-rc1 commit c509723ec27e925bb91a20682c448e95d4bc8c9f issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c509723ec27e925bb91a20682c448e95d4bc8c9f Signed-off-by: Yu Changchun -------------------------------- A program may create multiple interval timers using timer_create(). For each timer the kernel preallocates a "queued real-time signal". Consequently, the number of timers is limited by the RLIMIT_SIGPENDING resource limit. The allocated object is quite small, ~250 bytes, but even the default signal limits allow consuming up to 100 megabytes per user.
It makes sense to account for them to limit the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/57795560-025c-267c-6b1a-dea852d95530@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Thomas Gleixner Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/time/posix-timers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b..7363f81dc31a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); -- Gitee From 8dc576ac3d68be1a90ab9b633fc3e3a5f6844096 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 19 Oct 2021 18:04:12 +0800 Subject: [PATCH 013/134] memcg: enable accounting for ldt_struct objects mainline inclusion from mainline-v5.15-rc1 commit ec403e2ae0dfc85996aad6e944a98a16e6dfcc6d issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ec403e2ae0dfc85996aad6e944a98a16e6dfcc6d Signed-off-by: Yu Changchun -------------------------------- Each task can request its own LDT and force the kernel to allocate up to 64Kb of memory per mm. There are legitimate workloads with hundreds of processes and there can be hundreds of workloads running on large machines. The unaccounted memory can cause isolation issues between the workloads, particularly on highly utilized machines. It makes sense to account for these objects to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/38010594-50fe-c06d-7cb0-d1f77ca422f3@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Borislav Petkov Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J.
Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Li Ming Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- arch/x86/kernel/ldt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b8aee71840ae..7694c541e3d8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) if (num_entries > LDT_ENTRIES) return NULL; - new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; @@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) * than PAGE_SIZE. */ if (alloc_size > PAGE_SIZE) - new_ldt->entries = vzalloc(alloc_size); + new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else - new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); -- Gitee From 7b79af940842ef215779b0d8011784af73ff8729 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 19 Oct 2021 18:04:15 +0800 Subject: [PATCH 014/134] kyber: introduce kyber_depth_updated() mainline inclusion from mainline-5.12-rc1 commit ffa772cfe9356ce94d3061335c2681f60e7c1c5b category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ffa772cfe9356ce94d3061335c2681f60e7c1c5b Signed-off-by: Yu Changchun ------------------------------------------------- Hang occurs when user changes the scheduler queue depth, by writing to the 'nr_requests' sysfs file of that device. The details of the environment that we found the problem are as follows: an eMMC block device total driver tags: 16 default queue_depth: 32 kqd->async_depth initialized in kyber_init_sched() with queue_depth=32 Then we change queue_depth to 256, by writing to the 'nr_requests' sysfs file. But kqd->async_depth don't be updated after queue_depth changes. Now the value of async depth is too small for queue_depth=256, this may cause hang. This patch introduces kyber_depth_updated(), so that kyber can update async depth when queue depth changes. Signed-off-by: Yang Yang Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe Signed-off-by: Yu Kuai Reviewed-by: Hou Tao Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/kyber-iosched.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 7f9ef773bf44..d2648356e430 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -353,19 +353,9 @@ static void kyber_timer_fn(struct timer_list *t) } } -static unsigned int kyber_sched_tags_shift(struct request_queue *q) -{ - /* - * All of the hardware queues have the same depth, so we can just grab - * the shift of the first one. 
- */ - return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift; -} - static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) { struct kyber_queue_data *kqd; - unsigned int shift; int ret = -ENOMEM; int i; @@ -400,9 +390,6 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) kqd->latency_targets[i] = kyber_latency_targets[i]; } - shift = kyber_sched_tags_shift(q); - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; - return kqd; err_buckets: @@ -458,9 +445,19 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) INIT_LIST_HEAD(&kcq->rq_list[i]); } -static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags->sb.shift; + + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); +} + +static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ struct kyber_hctx_data *khd; int i; @@ -502,8 +499,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; - sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags, - kqd->async_depth); + kyber_depth_updated(hctx); return 0; @@ -1023,6 +1019,7 @@ static struct elevator_type kyber_sched = { .completed_request = kyber_completed_request, .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, + .depth_updated = kyber_depth_updated, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = kyber_queue_debugfs_attrs, -- Gitee From 18bf7fed5aff9bcf8aa461261dba0567862d5aec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Oct 2021 18:04:16 +0800 Subject: [PATCH 015/134] switch file_open_root() to struct path mainline inclusion from mainline-5.14-rc1 commit ffb37ca3bd16ce6ea2df2f87fde9a31e94ebb54b category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ffb37ca3bd16ce6ea2df2f87fde9a31e94ebb54b Signed-off-by: Yu Changchun --------------------------- ... and provide file_open_root_mnt(), using the root of given mount. Signed-off-by: Al Viro Conflicts: Documentation/filesystems/porting.rst [ Non-bugfix 14e43bf4356126("vfs: don't unnecessarily clone write access for writable fd") is not applied. ] Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Yu Changchun --- Documentation/filesystems/porting.rst | 9 +++++++++ arch/um/drivers/mconsole_kern.c | 2 +- fs/coredump.c | 4 ++-- fs/fhandle.c | 2 +- fs/internal.h | 2 +- fs/kernel_read_file.c | 2 +- fs/namei.c | 8 +++----- fs/open.c | 4 ++-- fs/proc/proc_sysctl.c | 2 +- include/linux/fs.h | 9 ++++++++- kernel/usermode_driver.c | 2 +- 11 files changed, 30 insertions(+), 16 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 867036aa90b8..b00fb6313b58 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -865,3 +865,12 @@ no matter what. Everything is handled by the caller. clone_private_mount() returns a longterm mount now, so the proper destructor of its result is kern_unmount() or kern_unmount_array(). 
+ +--- + +**mandatory** + +Calling conventions for file_open_root() changed; now it takes struct path * +instead of passing mount and dentry separately. For callers that used to +pass a <mnt->mnt_root, mnt> pair (i.e. the root of given mount), a new helper +is provided - file_open_root_mnt(). In-tree users adjusted. diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index a2e680f7d39f..6a22ead31c5b 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -140,7 +140,7 @@ void mconsole_proc(struct mc_request *req) mconsole_reply(req, "Proc not available", 1, 0); goto out; } - file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0); + file = file_open_root_mnt(mnt, ptr, O_RDONLY, 0); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file)); diff --git a/fs/coredump.c b/fs/coredump.c index c56a3bdce7cd..335c98787e66 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -755,8 +755,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); - cprm.file = file_open_root(root.dentry, root.mnt, - cn.corename, open_flags, 0600); + cprm.file = file_open_root(&root, cn.corename, + open_flags, 0600); path_put(&root); } else { cprm.file = filp_open(cn.corename, open_flags, 0600); diff --git a/fs/fhandle.c b/fs/fhandle.c index 01263ffbc4c0..718defdf1e0e 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -229,7 +229,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); + file = file_open_root(&path, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/internal.h b/fs/internal.h index 5155f6ce95c7..0d4f7e4e2f3a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -128,7 +128,7 @@ struct open_flags { }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); -extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, +extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index c84d87f558cb..1b07550485b9 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -160,7 +160,7 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset, get_fs_root(init_task.fs, &root); task_unlock(&init_task); - file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); + file = file_open_root(&root, path, O_RDONLY, 0); path_put(&root); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/fs/namei.c b/fs/namei.c index 4c9d0c36545d..130aa5694f48 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3393,7 +3393,7 @@ struct file *do_filp_open(int dfd, struct filename *pathname, return filp; } -struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; @@ -3401,16 +3401,14 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, struct filename *filename; int flags = op->lookup_flags | LOOKUP_ROOT; - nd.root.mnt = mnt; - nd.root.dentry = dentry; - - if (d_is_symlink(dentry) && op->intent &
LOOKUP_OPEN) + if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); + nd.root = *root; set_nameidata(&nd, -1, filename); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) diff --git a/fs/open.c b/fs/open.c index 3aaaad47d9ca..7ce64c08bb40 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1149,7 +1149,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) } EXPORT_SYMBOL(filp_open); -struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; @@ -1157,7 +1157,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); - return do_file_open_root(dentry, mnt, filename, &op); + return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 070d2df8ab9c..ffed75f833b7 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1803,7 +1803,7 @@ static int process_sysctl_arg(char *param, char *val, panic("%s: Failed to allocate path for %s\n", __func__, param); strreplace(path, '.', '/'); - file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); if (IS_ERR(file)) { err = PTR_ERR(file); if (err == -ENOENT) diff --git a/include/linux/fs.h b/include/linux/fs.h index c4e366a3226e..3de2e6c714aa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2527,8 +2528,14 @@ extern long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); -extern struct file *file_open_root(struct dentry *, struct vfsmount *, +extern struct file *file_open_root(const struct path *, const char *, int, umode_t); +static inline struct file *file_open_root_mnt(struct vfsmount *mnt, + const char *name, int flags, umode_t mode) +{ + return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, + name, flags, mode); +} extern struct file * dentry_open(const struct path *, int, const struct cred *); extern struct file * open_with_fake_path(const struct path *, int, struct inode*, const struct cred *); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index bb7bb3b478ab..9dae1f648713 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -26,7 +26,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na if (IS_ERR(mnt)) return mnt; - file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700); + file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); if (IS_ERR(file)) { mntput(mnt); return ERR_CAST(file); -- Gitee From 1c3315ab81dc4ae9599ed00ee3a574c1fccb35f1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Oct 2021 18:04:17 +0800 Subject: [PATCH 016/134] take LOOKUP_{ROOT,ROOT_GRABBED,JUMPED} out of LOOKUP_... 
space mainline inclusion from mainline-5.14-rc1 commit bcba1e7d0d520adba895d9e0800a056f734b0a6a category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bcba1e7d0d520adba895d9e0800a056f734b0a6a Signed-off-by: Yu Changchun --------------------------- Separate field in nameidata (nd->state) holding the flags that should be internal-only - that way we both get some spare bits in LOOKUP_... and get simpler rules for nd->root lifetime, since we can set the replacement of LOOKUP_ROOT (ND_ROOT_PRESET) at the same time we set nd->root. Signed-off-by: Al Viro Conflicts: fs/namei.c [ Bugfix 7d01ef7585c0("Make sure nd->path.mnt and nd->path.dentry are always valid pointers") is not applied, the problem it fixes does not exist here. Feature 6c6ec2b0a3e0("fs: add support for LOOKUP_CACHED") is not applied. ] Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- Documentation/filesystems/path-lookup.rst | 6 +-- fs/namei.c | 54 +++++++++++++---------- fs/nfs/nfstrace.h | 4 -- include/linux/namei.h | 3 -- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/Documentation/filesystems/path-lookup.rst b/Documentation/filesystems/path-lookup.rst index c482e1619e77..ede67f705787 100644 --- a/Documentation/filesystems/path-lookup.rst +++ b/Documentation/filesystems/path-lookup.rst @@ -1321,18 +1321,18 @@ to lookup: RCU-walk, REF-walk, and REF-walk with forced revalidation. yet. This is primarily used to tell the audit subsystem the full context of a particular access being audited. -``LOOKUP_ROOT`` indicates that the ``root`` field in the ``nameidata`` was +``ND_ROOT_PRESET`` indicates that the ``root`` field in the ``nameidata`` was provided by the caller, so it shouldn't be released when it is no longer needed. -``LOOKUP_JUMPED`` means that the current dentry was chosen not because +``ND_JUMPED`` means that the current dentry was chosen not because it had the right name but for some other reason. This happens when following "``..``", following a symlink to ``/``, crossing a mount point or accessing a "``/proc/$PID/fd/$FD``" symlink (also known as a "magic link"). In this case the filesystem has not been asked to revalidate the name (with ``d_revalidate()``). In such cases the inode may still need to be revalidated, so ``d_op->d_weak_revalidate()`` is called if -``LOOKUP_JUMPED`` is set when the look completes - which may be at the +``ND_JUMPED`` is set when the look completes - which may be at the final component or, when creating, unlinking, or renaming, at the penultimate component. Resolution-restriction flags diff --git a/fs/namei.c b/fs/namei.c index 130aa5694f48..c94a814e86b2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -504,7 +504,7 @@ struct nameidata { struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ - unsigned int flags; + unsigned int flags, state; unsigned seq, m_seq, r_seq; int last_type; unsigned depth; @@ -523,6 +523,10 @@ struct nameidata { umode_t dir_mode; } __randomize_layout; +#define ND_ROOT_PRESET 1 +#define ND_ROOT_GRABBED 2 +#define ND_JUMPED 4 + static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; @@ -531,6 +535,7 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->name = name; p->total_link_count = old ?
old->total_link_count : 0; p->saved = old; + p->state = 0; current->nameidata = p; } @@ -593,9 +598,9 @@ static void terminate_walk(struct nameidata *nd) path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); - if (nd->flags & LOOKUP_ROOT_GRABBED) { + if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); - nd->flags &= ~LOOKUP_ROOT_GRABBED; + nd->state &= ~ND_ROOT_GRABBED; } } else { nd->flags &= ~LOOKUP_RCU; @@ -651,9 +656,9 @@ static bool legitimize_root(struct nameidata *nd) if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) return false; /* Nothing to do if nd->root is zero or is managed by the VFS user. */ - if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT)) + if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } @@ -790,8 +795,9 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ - if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) - nd->root.mnt = NULL; + if (!(nd->state & ND_ROOT_PRESET)) + if (!(nd->flags & LOOKUP_IS_SCOPED)) + nd->root.mnt = NULL; if (!try_to_unlazy(nd)) return -ECHILD; } @@ -817,7 +823,7 @@ static int complete_walk(struct nameidata *nd) return -EXDEV; } - if (likely(!(nd->flags & LOOKUP_JUMPED))) + if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) @@ -855,7 +861,7 @@ static int set_root(struct nameidata *nd) } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } return 0; } @@ -888,7 +894,7 @@ static int nd_jump_root(struct nameidata *nd) path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; } @@ -916,7 +922,7 @@ int nd_jump_link(struct path *path) path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; err: @@ -1338,7 +1344,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; *seqp = read_seqcount_begin(&dentry->d_seq); *inode = dentry->d_inode; /* @@ -1383,7 +1389,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); @@ -2129,7 +2135,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) case 2: if (name[1] == '.') { type = LAST_DOTDOT; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } break; case 1: @@ -2137,7 +2143,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) } if (likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; - nd->flags &= ~LOOKUP_JUMPED; + nd->state &= ~ND_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); @@ -2207,14 +2213,15 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_RCU) rcu_read_lock(); - nd->flags = flags | LOOKUP_JUMPED; + nd->flags = flags; + nd->state |= ND_JUMPED; nd->depth = 0; nd->m_seq = 
__read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (flags & LOOKUP_ROOT) { + if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2291,7 +2298,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root_seq = nd->seq; } else { path_get(&nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } } return s; @@ -2330,7 +2337,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); - nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); @@ -2354,11 +2361,11 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); + set_nameidata(&nd, dfd, name); if (unlikely(root)) { nd.root = *root; - flags |= LOOKUP_ROOT; + nd.state = ND_ROOT_PRESET; } - set_nameidata(&nd, dfd, name); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); @@ -3399,7 +3406,7 @@ struct file *do_file_open_root(const struct path *root, struct nameidata nd; struct file *file; struct filename *filename; - int flags = op->lookup_flags | LOOKUP_ROOT; + int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); @@ -3408,8 +3415,9 @@ struct file *do_file_open_root(const struct path *root, if (IS_ERR(filename)) return ERR_CAST(filename); - nd.root = *root; set_nameidata(&nd, -1, filename); + nd.root = *root; + nd.state = ND_ROOT_PRESET; file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 5a59dcdce0b2..cb7f49723bbf 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -271,8 +271,6 @@ TRACE_DEFINE_ENUM(LOOKUP_OPEN); TRACE_DEFINE_ENUM(LOOKUP_CREATE); TRACE_DEFINE_ENUM(LOOKUP_EXCL); TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET); -TRACE_DEFINE_ENUM(LOOKUP_JUMPED); -TRACE_DEFINE_ENUM(LOOKUP_ROOT); TRACE_DEFINE_ENUM(LOOKUP_EMPTY); TRACE_DEFINE_ENUM(LOOKUP_DOWN); @@ -288,8 +286,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN); { LOOKUP_CREATE, "CREATE" }, \ { LOOKUP_EXCL, "EXCL" }, \ { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \ - { LOOKUP_JUMPED, "JUMPED" }, \ - { LOOKUP_ROOT, "ROOT" }, \ { LOOKUP_EMPTY, "EMPTY" }, \ { LOOKUP_DOWN, "DOWN" }) diff --git a/include/linux/namei.h b/include/linux/namei.h index a4bb992623c4..ca94eb5d2b16 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -36,9 +36,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* internal use only */ #define LOOKUP_PARENT 0x0010 -#define LOOKUP_JUMPED 0x1000 -#define LOOKUP_ROOT 0x2000 -#define LOOKUP_ROOT_GRABBED 0x0008 /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. 
*/ -- Gitee From baf3f0abd6b12d6044b4e8319a55bcae543614b5 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 19 Oct 2021 18:04:18 +0800 Subject: [PATCH 017/134] ext4: fix potential uninitialized access to retval in kmmpd mainline inclusion from mainline-5.14-rc5 commit b66541422824cf6cf20e9a35112e9cb5d82cdf62 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b66541422824cf6cf20e9a35112e9cb5d82cdf62 Signed-off-by: Yu Changchun ------------------------------------------------- if (!ext4_has_feature_mmp(sb)) then retval can be uninitialized before we jump to the wait_to_exit label. Fixes: 61bb4a1c417e ("ext4: fix possible UAF when remounting r/o a mmp-protected file system") Signed-off-by: Ye Bin Link: https://lore.kernel.org/r/20210713022728.2533770-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index bc364c119af6..cebea4270817 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -138,7 +138,7 @@ static int kmmpd(void *data) unsigned mmp_check_interval; unsigned long last_update_time; unsigned long diff; - int retval; + int retval = 0; mmp_block = le64_to_cpu(es->s_mmp_block); mmp = (struct mmp_struct *)(bh->b_data); -- Gitee From fd1e38dc97d4ed4a9682775d5848d540408bafb6 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 21 Oct 2021 21:05:50 +0800 Subject: [PATCH 018/134] blk-mq-sched: Fix blk_mq_sched_alloc_tags() error handling mainline inclusion from mainline-v5.14-rc1 commit b93af3055d6f32d3b0361cfdb110c9399c1241ba category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b93af3055d6f32d3b0361cfdb110c9399c1241ba Signed-off-by: Yu Changchun --------------------------- If the blk_mq_sched_alloc_tags() -> blk_mq_alloc_rqs() call fails, then we call blk_mq_sched_free_tags() -> blk_mq_free_rqs(). It is incorrect to do so, as any rqs would have already been freed in the blk_mq_alloc_rqs() call. Fix by calling blk_mq_free_rq_map() directly instead.
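For clarity, the two cleanup paths compare like this (a sketch of the call chains, relying on the behaviour described above, i.e. that blk_mq_alloc_rqs() has already freed any rqs it allocated by the time it returns an error):

    blk_mq_sched_alloc_tags()
        blk_mq_alloc_rqs()           /* fails; frees its own rqs */
        blk_mq_sched_free_tags()     /* old error path */
            blk_mq_free_rqs()        /* double free of the same rqs */
            blk_mq_free_rq_map()     /* the only cleanup still needed */

Hence the error path below drops blk_mq_sched_free_tags() entirely, calls blk_mq_free_rq_map() on its own and clears hctx->sched_tags.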
Fixes: 6917ff0b5bd41 ("blk-mq-sched: refactor scheduler initialization") Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1627378373-148090-1-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-mq-sched.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 24c08963890e..3fea30f0ea27 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -506,19 +506,6 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_put(&q->q_usage_counter); } -static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - - if (hctx->sched_tags) { - blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - blk_mq_free_rq_map(hctx->sched_tags, flags); - hctx->sched_tags = NULL; - } -} - static int blk_mq_sched_alloc_tags(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) @@ -534,8 +521,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, return -ENOMEM; ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); - if (ret) - blk_mq_sched_free_tags(set, hctx, hctx_idx); + if (ret) { + blk_mq_free_rq_map(hctx->sched_tags, set->flags); + hctx->sched_tags = NULL; + } return ret; } -- Gitee From f856741e4ea1481df28ad464481ed83fe2965fa1 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Tue, 28 Sep 2021 11:51:49 +0800 Subject: [PATCH 019/134] mm: fix page_owner initializing issue for arm32 mainline inclusion from mainline-v5.11-rc1 commit 7fb7ab6d618a4dc7ea3f3eafc92388a35b4f8894 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ------------------------------------------------- Page owner information for the pages used by page owner itself is missing on arm32 targets. The reason is that dummy_handle and failure_handle are not initialized correctly. The buddy allocator is used to initialize these two handles. However, the buddy allocator is not ready when page owner calls it. This change fixes that by initializing page owner after buddy initialization. The working flows before and after this change are: original logic: 1. allocate memory for page_ext (using memblock). 2. invoke the init callbacks of page_ext_ops, like page_owner (using the buddy allocator). 3. initialize buddy. after this change: 1. allocate memory for page_ext (using memblock). 2. initialize buddy. 3. invoke the init callbacks of page_ext_ops, like page_owner (using the buddy allocator).
With the change, failure_handle/dummy_handle get their correct values, and the page owner output, for example, now has the entry for page owner itself: Page allocated via order 2, mask 0x6202c0(GFP_USER|__GFP_NOWARN), pid 1006, ts 67278156558 ns PFN 543776 type Unmovable Block 531 type Unmovable Flags 0x0() init_page_owner+0x28/0x2f8 invoke_init_callbacks_flatmem+0x24/0x34 start_kernel+0x33c/0x5d8 Link: https://lkml.kernel.org/r/1603104925-5888-1-git-send-email-zhenhuah@codeaurora.org Signed-off-by: Zhenhua Huang Acked-by: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 7fb7ab6d618a4dc7ea3f3eafc92388a35b4f8894) Signed-off-by: Kefeng Wang Signed-off-by: Yuanzheng Song Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/page_ext.h | 8 ++++++++ init/main.c | 2 ++ mm/page_ext.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index cfce186f0c4e..aff81ba31bd8 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -44,8 +44,12 @@ static inline void page_ext_init_flatmem(void) { } extern void page_ext_init(void); +static inline void page_ext_init_flatmem_late(void) +{ +} #else extern void page_ext_init_flatmem(void); +extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } @@ -76,6 +80,10 @@ static inline void page_ext_init(void) { } +static inline void page_ext_init_flatmem_late(void) +{ +} + static inline void page_ext_init_flatmem(void) { } diff --git a/init/main.c b/init/main.c index dd26a42e80a8..63cdf49f7f82 100644 --- a/init/main.c +++ b/init/main.c @@ -828,6 +828,8 @@ static void __init mm_init(void) init_debug_pagealloc(); report_meminit(); mem_init(); + /* page_owner must be initialized after buddy is ready */ + page_ext_init_flatmem_late(); kmem_cache_init(); kmemleak_init(); pgtable_init(); diff --git a/mm/page_ext.c b/mm/page_ext.c index a3616f7a0e9e..16b161f28a31 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -99,12 +99,19 @@ static void __init invoke_init_callbacks(void) } } +#ifndef CONFIG_SPARSEMEM +void __init page_ext_init_flatmem_late(void) +{ + invoke_init_callbacks(); +} +#endif + static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } -#if !defined(CONFIG_SPARSEMEM) +#ifndef CONFIG_SPARSEMEM void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) @@ -177,7 +184,6 @@ void __init page_ext_init_flatmem(void) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); - invoke_init_callbacks(); return; fail: -- Gitee From b7e60f215e0d7160e5803db914874427a3e80d89 Mon Sep 17 00:00:00 2001 From: Liam Mark Date: Tue, 6 Jul 2021 11:33:56 +0000 Subject: [PATCH 020/134] mm/page_owner: record timestamp and pid mainline inclusion from mainline-5.11-rc1 commit 9cc7e96aa846f9086431d6c2d33ff9ab42d72b2d category: feature issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9cc7e96aa846f9086431d6c2d33ff9ab42d72b2d Signed-off-by: Yu Changchun ------------------------------------------------- Collect the time for each allocation recorded in page owner so that allocation "surges" can be measured. Record the pid for each allocation recorded in page owner so that the source of allocation "surges" can be better identified. The above is very useful when doing memory analysis.
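The two new records land in struct page_owner; a sketch of the resulting layout (earlier fields elided, see the hunk below):

    struct page_owner {
            /* ... existing fields (order, gfp_mask, stack handles) ... */
            u64 ts_nsec;    /* allocation timestamp, taken via local_clock() */
            pid_t pid;      /* pid of the allocating task (current->pid) */
    };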
On a crash for example, we can get this information from kdump (or ramdump) and parse it to figure out memory allocation problems. Please note that on x86_64 this increases the size of struct page_owner from 16 bytes to 32. Vlastimil: it's not a functionality intended for production, so unless somebody says they need to enable page_owner for debugging and this increase prevents them from fitting into available memory, let's not complicate things with making this optional. [lmark@codeaurora.org: v3] Link: https://lkml.kernel.org/r/20201210160357.27779-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20201209125153.10533-1-georgi.djakov@linaro.org Signed-off-by: Liam Mark Signed-off-by: Georgi Djakov Acked-by: Vlastimil Babka Acked-by: Joonsoo Kim Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 9cc7e96aa846f9086431d6c2d33ff9ab42d72b2d) Signed-off-by: Yongqiang Liu Reviewed-by: tong tiangen Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- Documentation/vm/page_owner.rst | 12 ++++++------ mm/page_owner.c | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 02deac76673f..4e67c2e9bbed 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -41,17 +41,17 @@ size change due to this facility. - Without page owner:: text data bss dec hex filename - 40662 1493 644 42799 a72f mm/page_alloc.o + 48392 2333 644 51369 c8a9 mm/page_alloc.o - With page owner:: text data bss dec hex filename - 40892 1493 644 43029 a815 mm/page_alloc.o - 1427 24 8 1459 5b3 mm/page_ext.o - 2722 50 0 2772 ad4 mm/page_owner.o + 48800 2445 644 51889 cab1 mm/page_alloc.o + 6574 108 29 6711 1a37 mm/page_owner.o + 1025 8 8 1041 411 mm/page_ext.o -Although, roughly, 4 KB code is added in total, page_alloc.o increase by -230 bytes and only half of it is in hotpath. Building the kernel with +Although, roughly, 8 KB code is added in total, page_alloc.o increase by +520 bytes and less than half of it is in hotpath. Building the kernel with page owner and turning it on if needed would be great option to debug kernel memory problem. 
diff --git a/mm/page_owner.c b/mm/page_owner.c index b735a8eafcdb..af464bb7fbe7 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" @@ -25,6 +26,8 @@ struct page_owner { gfp_t gfp_mask; depot_stack_handle_t handle; depot_stack_handle_t free_handle; + u64 ts_nsec; + pid_t pid; }; static bool page_owner_enabled = false; @@ -172,6 +175,8 @@ static inline void __set_page_owner_handle(struct page *page, page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; + page_owner->pid = current->pid; + page_owner->ts_nsec = local_clock(); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -236,6 +241,8 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->last_migrate_reason = old_page_owner->last_migrate_reason; new_page_owner->handle = old_page_owner->handle; + new_page_owner->pid = old_page_owner->pid; + new_page_owner->ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -349,9 +356,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg)\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n", page_owner->order, page_owner->gfp_mask, - &page_owner->gfp_mask); + &page_owner->gfp_mask, page_owner->pid, + page_owner->ts_nsec); if (ret >= count) goto err; @@ -427,8 +435,9 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n", + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, + page_owner->pid, page_owner->ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { -- Gitee From 0f69ec3839b52fb1fc968fc7186aab8b74da07ef Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Tue, 6 Jul 2021 11:33:57 +0000 Subject: [PATCH 021/134] mm/page_owner: record the timestamp of all pages during free mainline inclusion from mainline-5.13-rc1 commit 866b485262173a2b873386162b2ddcfbcb542b4a category: feature issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=866b485262173a2b873386162b2ddcfbcb542b4a Signed-off-by: Yu Changchun ------------------------------------------------- Collect the time when each allocation is freed, to help with memory analysis with kdump/ramdump. Also add the timestamp to the page_owner debugfs file and print it in dump_page(). Having another timestamp when we free the page helps for debugging page migration issues. For example, both alloc and free timestamps being the same can give hints that there is an issue with migrating memory, as opposed to a page just being dropped during migration.
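With both timestamps recorded, a page_owner report line gains a free_ts field; the format follows the snprintf in the diff below, and the values here are illustrative:

    Page allocated via order 0, mask 0x100dca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), pid 37, ts 8173444098740 ns, free_ts 8173444098740 ns

An alloc timestamp equal to the free timestamp of a supposedly migrated page is exactly the kind of hint mentioned above.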
Link: https://lkml.kernel.org/r/20210203175905.12267-1-georgi.djakov@linaro.org Signed-off-by: Georgi Djakov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 866b485262173a2b873386162b2ddcfbcb542b4a) Signed-off-by: Yongqiang Liu Reviewed-by: tong tiangen Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- Documentation/vm/page_owner.rst | 2 +- mm/page_owner.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 4e67c2e9bbed..2175465c9bf2 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -47,7 +47,7 @@ size change due to this facility. text data bss dec hex filename 48800 2445 644 51889 cab1 mm/page_alloc.o - 6574 108 29 6711 1a37 mm/page_owner.o + 6662 108 29 6799 1a8f mm/page_owner.o 1025 8 8 1041 411 mm/page_ext.o Although, roughly, 8 KB code is added in total, page_alloc.o increase by diff --git a/mm/page_owner.c b/mm/page_owner.c index af464bb7fbe7..5b93fc85dc73 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -27,6 +27,7 @@ struct page_owner { depot_stack_handle_t handle; depot_stack_handle_t free_handle; u64 ts_nsec; + u64 free_ts_nsec; pid_t pid; }; @@ -148,6 +149,7 @@ void __reset_page_owner(struct page *page, unsigned int order) struct page_ext *page_ext; depot_stack_handle_t handle = 0; struct page_owner *page_owner; + u64 free_ts_nsec = local_clock(); handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); @@ -158,6 +160,7 @@ void __reset_page_owner(struct page *page, unsigned int order) __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; + page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } } @@ -243,6 +246,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->handle = old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; + new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -356,10 +360,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, - page_owner->ts_nsec); + page_owner->ts_nsec, page_owner->free_ts_nsec); if (ret >= count) goto err; @@ -435,9 +439,9 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec); + page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { -- Gitee From 908db6a575229e548cf86607d2a019bd408f71f6 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 28 Sep 2021 11:51:51 +0800 Subject: [PATCH 022/134] mm: page_poison: print page info when corruption is caught mainline inclusion from mainline-v5.13-rc1 commit f58bd538e6a2deb2bcdfe527d9ed45643348a4e6 
category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ------------------------------------------------- When page_poison detects page corruption it's useful to see who freed a page recently to have a guess where write-after-free corruption happens. After this change the corruption report has extra page data. Example report from real corruption (includes only page_owner part): pagealloc: memory corruption e00000014cd61d10: 11 00 00 00 00 00 00 00 30 1d d2 ff ff 0f 00 60 ........0......` e00000014cd61d20: b0 1d d2 ff ff 0f 00 60 90 fe 1c 00 08 00 00 20 .......`....... ... CPU: 1 PID: 220402 Comm: cc1plus Not tainted 5.12.0-rc5-00107-g9720c6f59ecf #245 Hardware name: hp server rx3600, BIOS 04.03 04/08/2008 ... Call Trace: [] show_stack+0x90/0xc0 [] dump_stack+0x150/0x1c0 [] __kernel_unpoison_pages+0x410/0x440 [] get_page_from_freelist+0x1460/0x2ca0 [] __alloc_pages_nodemask+0x3c0/0x660 [] alloc_pages_vma+0xb0/0x500 [] __handle_mm_fault+0x1230/0x1fe0 [] handle_mm_fault+0x310/0x4e0 [] ia64_do_page_fault+0x1f0/0xb80 [] ia64_leave_kernel+0x0/0x270 page_owner tracks the page as freed page allocated via order 0, migratetype Movable, gfp_mask 0x100dca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), pid 37, ts 8173444098740 __reset_page_owner+0x40/0x200 free_pcp_prepare+0x4d0/0x600 free_unref_page+0x20/0x1c0 __put_page+0x110/0x1a0 migrate_pages+0x16d0/0x1dc0 compact_zone+0xfc0/0x1aa0 proactive_compact_node+0xd0/0x1e0 kcompactd+0x550/0x600 kthread+0x2c0/0x2e0 call_payload+0x50/0x80 Here we can see that page was freed by page migration but something managed to write to it afterwards. [slyfox@gentoo.org: s/dump_page_owner/dump_page/, per Vlastimil] Link: https://lkml.kernel.org/r/20210407230800.1086854-1-slyfox@gentoo.org Link: https://lkml.kernel.org/r/20210404141735.2152984-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f58bd538e6a2deb2bcdfe527d9ed45643348a4e6) Signed-off-by: Yuanzheng Song Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- mm/page_poison.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_poison.c b/mm/page_poison.c index a08423cb11ca..51f9cb913e0c 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -73,7 +74,7 @@ static bool single_bit_flip(unsigned char a, unsigned char b) return error && !(error & (error - 1)); } -static void check_poison_mem(unsigned char *mem, size_t bytes) +static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes) { static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); unsigned char *start; @@ -101,6 +102,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, end - start + 1, 1); dump_stack(); + dump_page(page, "pagealloc: corrupted page details"); } static void unpoison_page(struct page *page) @@ -114,7 +116,7 @@ static void unpoison_page(struct page *page) * that is freed to buddy. Thus no extra check is done to * see if a page was poisoned.
*/ - check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE); + check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE); kasan_enable_current(); kunmap_atomic(addr); } -- Gitee From f23a7b4f94a206cdfda24b6cd5029216f111323d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 21 Oct 2021 21:05:55 +0800 Subject: [PATCH 023/134] mm: vmscan: fix missing psi annotation for node_reclaim() mainline inclusion from mainline-5.14-rc7 commit 57f29762cdd4687a02f245d1b1e78de046388eac category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=57f29762cdd4687a02f245d1b1e78de046388eac Signed-off-by: Yu Changchun ------------------------------------------------ In a debugging session the other day, Rik noticed that node_reclaim() was missing memstall annotations. This means we'll miss pressure and lost productivity resulting from reclaim on an overloaded local NUMA node when vm.zone_reclaim_mode is enabled. There haven't been any reports, but that's likely because vm.zone_reclaim_mode hasn't been a commonly used feature recently, and the intersection between such setups and psi users is probably nil. But secondary memory such as CXL-connected DIMMS, persistent memory etc, and the page demotion patches that handle them (https://lore.kernel.org/lkml/20210401183216.443C4443@viggo.jf.intel.com/) could soon make this a more common codepath again. Link: https://lkml.kernel.org/r/20210818152457.35846-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Rik van Riel Reviewed-by: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Jing Xiangfeng Reviewed-by: Chen Wandun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6a6dd347bcb3..ebaa10d4dcad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4174,11 +4174,13 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in .may_swap = 1, .reclaim_idx = gfp_zone(gfp_mask), }; + unsigned long pflags; trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, sc.gfp_mask); cond_resched(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP @@ -4203,6 +4205,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); + psi_memstall_leave(&pflags); trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); -- Gitee From 8f26c74c12a9d5958b76b5cad735893a8973c3af Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 21 Oct 2021 21:05:56 +0800 Subject: [PATCH 024/134] mm: slub: fix slub_debug disabling for list of slabs mainline inclusion from mainline-5.14-rc6 commit a7f1d48585b34730765dcda09ead6edc4ac16a5c category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a7f1d48585b34730765dcda09ead6edc4ac16a5c Signed-off-by: Yu Changchun ------------------------------------------------ Vijayanand Jitta reports: Consider the scenario where CONFIG_SLUB_DEBUG_ON is set and we would want to disable slub_debug for a few slabs. Using the boot parameter with the slub_debug=-,slab_name syntax doesn't work as expected, i.e. only disabling debugging for the specified list of slabs. Instead it disables debugging for all slabs, which is wrong.
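For example, with CONFIG_SLUB_DEBUG_ON=y, a command line such as (the cache name is purely illustrative):

    slub_debug=-,dentry

is expected to switch debugging off for the dentry cache only and leave every other cache at DEBUG_DEFAULT_FLAGS; before this fix, however, the global slub_debug flags were reset to 0 as well, disabling debugging everywhere.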
This patch fixes it by delaying the moment when the global slub_debug flags variable is updated. In case a "slub_debug=-,slab_name" has been passed, the global flags remain as initialized (depending on CONFIG_SLUB_DEBUG_ON enabled or disabled) and are not simply reset to 0. Link: https://lkml.kernel.org/r/8a3d992a-473a-467b-28a0-4ad2ff60ab82@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Vijayanand Jitta Reviewed-by: Vijayanand Jitta Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/slub.c Signed-off-by: Jing Xiangfeng Reviewed-by: Chen Wandun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- mm/slub.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1384dc906833..e6c532123de1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1346,12 +1346,13 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) static int __init setup_slub_debug(char *str) { slab_flags_t flags; + slab_flags_t global_flags; char *saved_str; char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; - slub_debug = DEBUG_DEFAULT_FLAGS; + global_flags = DEBUG_DEFAULT_FLAGS; if (*str++ != '=' || !*str) /* * No options specified. Switch on full debugging. */ goto out; str = parse_slub_debug_flags(str, &flags, &slab_list, true); if (!slab_list) { - slub_debug = flags; + global_flags = flags; global_slub_debug_changed = true; } else { slab_list_specified = true; @@ -1372,16 +1373,18 @@ static int __init setup_slub_debug(char *str) /* * For backwards compatibility, a single list of flags with list of - * slabs means debugging is only enabled for those slabs, so the global - * slub_debug should be 0. We can extended that to multiple lists as + * slabs means debugging is only changed for those slabs, so the global + * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as * long as there is no option specifying flags without a slab list. */ if (slab_list_specified) { if (!global_slub_debug_changed) - slub_debug = 0; + global_flags = slub_debug; slub_debug_string = saved_str; } out: + slub_debug = global_flags; if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); if ((static_branch_unlikely(&init_on_alloc) || -- Gitee From cab36ab54790cecc58a9bc3d8458823360f1f396 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Thu, 21 Oct 2021 21:06:02 +0800 Subject: [PATCH 025/134] ext4: flush s_error_work before journal destroy in ext4_fill_super mainline inclusion from mainline-v5.15-rc4 commit bb9464e08309f6befe80866f5be51778ca355ee9 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bb9464e08309f6befe80866f5be51778ca355ee9 Signed-off-by: Yu Changchun --------------------------- The error path in ext4_fill_super forgets to flush s_error_work before journal destroy, and it may trigger the following bug since flush_stashed_error_work can run concurrently with journal destroy without any protection for sbi->s_journal. [32031.740193] EXT4-fs (loop66): get root inode failed [32031.740484] EXT4-fs (loop66): mount failed [32031.759805] ------------[ cut here ]------------ [32031.759807] kernel BUG at fs/jbd2/transaction.c:373!
[32031.760075] invalid opcode: 0000 [#1] SMP PTI [32031.760336] CPU: 5 PID: 1029268 Comm: kworker/5:1 Kdump: loaded 4.18.0 [32031.765112] Call Trace: [32031.765375] ? __switch_to_asm+0x35/0x70 [32031.765635] ? __switch_to_asm+0x41/0x70 [32031.765893] ? __switch_to_asm+0x35/0x70 [32031.766148] ? __switch_to_asm+0x41/0x70 [32031.766405] ? _cond_resched+0x15/0x40 [32031.766665] jbd2__journal_start+0xf1/0x1f0 [jbd2] [32031.766934] jbd2_journal_start+0x19/0x20 [jbd2] [32031.767218] flush_stashed_error_work+0x30/0x90 [ext4] [32031.767487] process_one_work+0x195/0x390 [32031.767747] worker_thread+0x30/0x390 [32031.768007] ? process_one_work+0x390/0x390 [32031.768265] kthread+0x10d/0x130 [32031.768521] ? kthread_flush_work_fn+0x10/0x10 [32031.768778] ret_from_fork+0x35/0x40 static int start_this_handle(...) BUG_ON(journal->j_flags & JBD2_UNMOUNT); <---- Trigger this Besides, after we enable fast commit, ext4_fc_replay can add work to s_error_work but return success, so the latter journal destroy in ext4_load_journal can trigger this problem too. Fix this problem with two steps: 1. Call ext4_commit_super directly in ext4_handle_error for the case that it is called from ext4_fc_replay 2. Since it's hard to pair the init and flush for s_error_work, we'd better add an extra flush_work before journal destroy in ext4_fill_super Besides, this patch will call ext4_commit_super in ext4_handle_error for any nojournal case too. But it seems safe since the reason we call schedule_work was that we should save error info to sb through journal if available. Conversely, for the nojournal case, it seems useless to delay committing the superblock to s_error_work. Fixes: c92dc856848f ("ext4: defer saving error info from atomic context") Fixes: 2d01ddc86606 ("ext4: save error info to sb through journal if available") Cc: stable@kernel.org Signed-off-by: yangerkun Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210924093917.1953239-1-yangerkun@huawei.com Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dd0846321c2f..16e228285967 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -661,7 +661,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ - if (continue_fs) + if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_error_work); else ext4_commit_super(sb); @@ -5143,12 +5143,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { + /* flush s_error_work before journal destroy.
*/ + flush_work(&sbi->s_error_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: + /* flush s_error_work before sbi destroy */ flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); -- Gitee From d6ccfdbf351b28338ae9df32649030643d2110cf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 15 Nov 2021 19:35:45 +0800 Subject: [PATCH 026/134] block: don't call rq_qos_ops->done_bio if the bio isn't tracked mainline inclusion from mainline-v5.15-rc3 commit a647a524a46736786c95cdb553a070322ca096e3 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a647a524a46736786c95cdb553a070322ca096e3 Signed-off-by: Yu Changchun --------------------------- The rq_qos framework is only applied to request based drivers, so: 1) rq_qos_done_bio() need not be called for bio based drivers 2) rq_qos_done_bio() need not be called for bios which aren't tracked, such as bios ended from error handling code. Especially in bio_endio(): 1) the request queue is referenced via bio->bi_bdev->bd_disk->queue, which may be gone since the request queue refcount may not be held in the above two cases 2) q->rq_qos may be freed in blk_cleanup_queue() when calling into __rq_qos_done_bio() Fix the potential kernel panic by not calling rq_qos_ops->done_bio if the bio isn't tracked. This way is safe because both ioc_rqos_done_bio() and blkcg_iolatency_done_bio() are nop if the bio isn't tracked. Reported-by: Yu Kuai Cc: tj@kernel.org Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210924110704.1541818-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Conflict: block/bio.c Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 0703a208ca24..b2e0f0c94c5f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1433,7 +1433,7 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) + if (bio->bi_disk && bio_flagged(bio, BIO_TRACKED)) rq_qos_done_bio(bio->bi_disk->queue, bio); /* -- Gitee From 2dc2ee15e2b4a77c8717a24ebbbe284f9e24a44f Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Mon, 15 Nov 2021 19:35:47 +0800 Subject: [PATCH 027/134] block: fix UAF from race of ioc_release_fn() and __ioc_clear_queue() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun -------------------------- KASAN reports a use-after-free report when doing block test: [293762.535116] ================================================================== [293762.535129] BUG: KASAN: use-after-free in queued_spin_lock_slowpath+0x78/0x4c8 [293762.535133] Write of size 2 at addr ffff8000d5f12bc8 by task sh/9148 [293762.535135] [293762.535145] CPU: 1 PID: 9148 Comm: sh Kdump: loaded Tainted: G W 4.19.90-vhulk2108.6.0.h824.kasan.eulerosv2r10.aarch64 #1 [293762.535148] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [293762.535150] Call trace: [293762.535154] dump_backtrace+0x0/0x310 [293762.535158] show_stack+0x28/0x38 [293762.535165] dump_stack+0xec/0x15c [293762.535172] print_address_description+0x68/0x2d0 [293762.535177] kasan_report+0x130/0x2f0 [293762.535182] __asan_store2+0x80/0xa8 [293762.535189] queued_spin_lock_slowpath+0x78/0x4c8 [293762.535194] __ioc_clear_queue+0x158/0x160
[293762.535198] ioc_clear_queue+0x194/0x258 [293762.535202] elevator_switch_mq+0x64/0x170 [293762.535206] elevator_switch+0x140/0x270 [293762.535211] elv_iosched_store+0x1a4/0x2a0 [293762.535215] queue_attr_store+0x90/0xe0 [293762.535219] sysfs_kf_write+0xa8/0xe8 [293762.535222] kernfs_fop_write+0x1f8/0x378 [293762.535227] __vfs_write+0xe0/0x360 [293762.535233] vfs_write+0xf0/0x270 [293762.535237] ksys_write+0xdc/0x1b8 [293762.535241] __arm64_sys_write+0x50/0x60 [293762.535245] el0_svc_common+0xc8/0x320 [293762.535250] el0_svc_handler+0xf8/0x160 [293762.535253] el0_svc+0x10/0x218 [293762.535254] [293762.535258] Allocated by task 3466763: [293762.535264] kasan_kmalloc+0xe0/0x190 [293762.535269] kasan_slab_alloc+0x14/0x20 [293762.535276] kmem_cache_alloc_node+0x1b4/0x420 [293762.535280] create_task_io_context+0x40/0x210 [293762.535284] generic_make_request_checks+0xc78/0xe38 [293762.535288] generic_make_request+0xf8/0x640 [293762.535394] generic_file_direct_write+0x100/0x268 [293762.535401] __generic_file_write_iter+0x128/0x370 [293762.535467] vfs_iter_write+0x64/0x90 [293762.535489] ovl_write_iter+0x2f8/0x458 [overlay] [293762.535493] __vfs_write+0x264/0x360 [293762.535497] vfs_write+0xf0/0x270 [293762.535501] ksys_write+0xdc/0x1b8 [293762.535505] __arm64_sys_write+0x50/0x60 [293762.535509] el0_svc_common+0xc8/0x320 [293762.535387] ext4_direct_IO+0x3c8/0xe80 [ext4] [293762.535394] generic_file_direct_write+0x100/0x268 [293762.535401] __generic_file_write_iter+0x128/0x370 [293762.535452] ext4_file_write_iter+0x610/0xa80 [ext4] [293762.535457] do_iter_readv_writev+0x28c/0x390 [293762.535463] do_iter_write+0xfc/0x360 [293762.535467] vfs_iter_write+0x64/0x90 [293762.535489] ovl_write_iter+0x2f8/0x458 [overlay] [293762.535493] __vfs_write+0x264/0x360 [293762.535497] vfs_write+0xf0/0x270 [293762.535501] ksys_write+0xdc/0x1b8 [293762.535505] __arm64_sys_write+0x50/0x60 [293762.535509] el0_svc_common+0xc8/0x320 [293762.535513] el0_svc_handler+0xf8/0x160 [293762.535517] el0_svc+0x10/0x218 [293762.535521] [293762.535523] Freed by task 3466763: [293762.535528] __kasan_slab_free+0x120/0x228 [293762.535532] kasan_slab_free+0x10/0x18 [293762.535536] kmem_cache_free+0x68/0x248 [293762.535540] put_io_context+0x104/0x190 [293762.535545] put_io_context_active+0x204/0x2c8 [293762.535549] exit_io_context+0x74/0xa0 [293762.535553] do_exit+0x658/0xae0 [293762.535557] do_group_exit+0x74/0x1a8 [293762.535561] get_signal+0x21c/0xf38 [293762.535564] do_signal+0x10c/0x450 [293762.535568] do_notify_resume+0x224/0x4b0 [293762.535573] work_pending+0x8/0x10 [293762.535574] [293762.535578] The buggy address belongs to the object at ffff8000d5f12bb8 which belongs to the cache blkdev_ioc of size 136 [293762.535582] The buggy address is located 16 bytes inside of 136-byte region [ffff8000d5f12bb8, ffff8000d5f12c40) [293762.535583] The buggy address belongs to the page: [293762.535588] page:ffff7e000357c480 count:1 mapcount:0 mapping:ffff8000d8563c00 index:0x0 [293762.536201] flags: 0x7ffff0000000100(slab) [293762.536540] raw: 07ffff0000000100 ffff7e0003118588 ffff8000d8adb530 ffff8000d8563c00 [293762.536546] raw: 0000000000000000 0000000000140014 00000001ffffffff 0000000000000000 [293762.536551] page dumped because: kasan: bad access detected [293762.536552] [293762.536554] Memory state around the buggy address: [293762.536558] ffff8000d5f12a80: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fb fb [293762.536562] ffff8000d5f12b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc [293762.536566] >ffff8000d5f12b80: fc fc fc fc fc fc 
fc fb fb fb fb fb fb fb fb fb [293762.536568] ^ [293762.536572] ffff8000d5f12c00: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [293762.536576] ffff8000d5f12c80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [293762.536577] ================================================================== ioc_release_fn() will destroy icq from ioc->icq_list and __ioc_clear_queue() will destroy icq from request_queue->icq_list. However, ioc_release_fn() will hold ioc_lock first, and free the ioc at the end. Then __ioc_clear_queue() will get the ioc from the icq and hold ioc_lock, but the ioc has been released, which will result in a use-after-free. CPU0 CPU1 put_io_context elevator_switch_mq queue_work &ioc->release_work ioc_clear_queue ^^^ splice q->icq_list __ioc_clear_queue ^^^get icq from icq_list get ioc from icq->ioc ioc_release_fn spin_lock(ioc->lock) ioc_destroy_icq(icq) spin_unlock(ioc->lock) free(ioc) spin_lock(ioc->lock) <= UAF Fix by grabbing the request_queue->queue_lock in ioc_clear_queue() to avoid this race. Signed-off-by: Laibin Qiu Link: https://lore.kernel.org/lkml/1c9ad9f2-c487-c793-1ffc-5c3ec0fcc0ae@kernel.dk/ Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-ioc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 57299f860d41..1d6ba8ff5a66 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -242,9 +242,9 @@ void ioc_clear_queue(struct request_queue *q) spin_lock_irq(&q->queue_lock); list_splice_init(&q->icq_list, &icq_list); - spin_unlock_irq(&q->queue_lock); __ioc_clear_queue(&icq_list); + spin_unlock_irq(&q->queue_lock); } int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) -- Gitee From c516b426c31a1d50dddf4b3a6e1c924734897672 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 15 Nov 2021 19:35:48 +0800 Subject: [PATCH 028/134] nbd: add the check to prevent overflow in __nbd_ioctl() mainline inclusion from mainline-5.14 commit fad7cd3310db3099f95dd34312c77740fbc455e5 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fad7cd3310db3099f95dd34312c77740fbc455e5 Signed-off-by: Yu Changchun ------------------------------------------------- If the user specifies a large enough value for the NBD blocks option, it may trigger a signed integer overflow, which may lead to nbd->config->bytesize becoming a large or small value, zero in particular. UBSAN: Undefined behaviour in drivers/block/nbd.c:325:31 signed integer overflow: 1024 * 4611686155866341414 cannot be represented in type 'long long int' [...] Call trace: [...] handle_overflow+0x188/0x1dc lib/ubsan.c:192 __ubsan_handle_mul_overflow+0x34/0x44 lib/ubsan.c:213 nbd_size_set drivers/block/nbd.c:325 [inline] __nbd_ioctl drivers/block/nbd.c:1342 [inline] nbd_ioctl+0x998/0xa10 drivers/block/nbd.c:1395 __blkdev_driver_ioctl block/ioctl.c:311 [inline] [...] Although it is not a big deal, still silence the UBSAN by limiting the input value.
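For reference, check_mul_overflow() from linux/overflow.h computes the product into its third argument and returns true when the multiplication overflows, so the new check (sketched here with the values from the report above) rejects the request before nbd_size_set() runs:

    loff_t bytesize;

    /* e.g. arg = 4611686155866341414 blocks, blksize = 1024: wraps loff_t */
    if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize))
            return -EINVAL;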
Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210804021212.990223-1-libaokun1@huawei.com [axboe: dropped unlikely()] Signed-off-by: Jens Axboe Conflicts: drivers/block/nbd.c Signed-off-by: Baokun Li Reviewed-by: Hou Tao Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f7f1d9dbdc80..f4b0637bb155 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1385,6 +1385,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { struct nbd_config *config = nbd->config; + loff_t bytesize; switch (cmd) { case NBD_DISCONNECT: @@ -1407,6 +1408,8 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, div_s64(arg, config->blksize)); return 0; case NBD_SET_SIZE_BLOCKS: + if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize)) + return -EINVAL; nbd_size_set(nbd, config->blksize, arg); return 0; case NBD_SET_TIMEOUT: -- Gitee From db88ad78a216ba7092b1ae3584fb28dbbb16b1da Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Nov 2021 19:35:49 +0800 Subject: [PATCH 029/134] sched/topology: Warn when NUMA diameter > 2 mainline inclusion from mainline-v5.11-rc1 commit b5b217346de85ed1b03fdecd5c5076b34fbb2f0b category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b5b217346de85ed1b03fdecd5c5076b34fbb2f0b Signed-off-by: Yu Changchun ---------------------------------------------------------- NUMA topologies where the shortest path between some two nodes requires three or more hops (i.e. diameter > 2) end up being misrepresented in the scheduler topology structures. This is currently detected when booting a kernel with CONFIG_SCHED_DEBUG=y + sched_debug on the cmdline, although this will only yield a warning about sched_group spans not matching sched_domain spans: ERROR: groups don't span domain->span Add an explicit warning for that case, triggered regardless of CONFIG_SCHED_DEBUG, and decorate it with an appropriate comment. 
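For reference, the "diameter" is the maximum hop count between any two nodes, and it can be computed directly from a distance table. A standalone sketch (illustration only, not part of the patch) using the smallest diameter=3 table quoted in the comment below, with the REMOTE_DISTANCE entries (20) taken as direct links:

    #include <stdio.h>

    #define N 4

    /* Distance table of the smallest diameter=3 topology (a 0-1-2-3 line). */
    static const int dist[N][N] = {
        { 10, 20, 30, 40 },
        { 20, 10, 20, 30 },
        { 30, 20, 10, 20 },
        { 40, 30, 20, 10 },
    };

    int main(void)
    {
        int hops[N][N], i, j, k, diameter = 0;

        /* direct neighbours are one hop apart; everything else starts "far" */
        for (i = 0; i < N; i++)
            for (j = 0; j < N; j++)
                hops[i][j] = dist[i][j] == 20 ? 1 : (i == j ? 0 : N);

        /* Floyd-Warshall over hop counts */
        for (k = 0; k < N; k++)
            for (i = 0; i < N; i++)
                for (j = 0; j < N; j++)
                    if (hops[i][k] + hops[k][j] < hops[i][j])
                        hops[i][j] = hops[i][k] + hops[k][j];

        for (i = 0; i < N; i++)
            for (j = 0; j < N; j++)
                if (hops[i][j] > diameter)
                    diameter = hops[i][j];

        printf("diameter = %d\n", diameter);  /* prints 3, so the new WARN fires */
        return 0;
    }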
The topology described in the comment can be booted up on QEMU by appending the following to your usual QEMU incantation: -smp cores=4 \ -numa node,cpus=0,nodeid=0 -numa node,cpus=1,nodeid=1, \ -numa node,cpus=2,nodeid=2, -numa node,cpus=3,nodeid=3, \ -numa dist,src=0,dst=1,val=20, -numa dist,src=0,dst=2,val=30, \ -numa dist,src=0,dst=3,val=40, -numa dist,src=1,dst=2,val=20, \ -numa dist,src=1,dst=3,val=30, -numa dist,src=2,dst=3,val=20 A somewhat more realistic topology (6-node mesh) with the same affliction can be conjured with: -smp cores=6 \ -numa node,cpus=0,nodeid=0 -numa node,cpus=1,nodeid=1, \ -numa node,cpus=2,nodeid=2, -numa node,cpus=3,nodeid=3, \ -numa node,cpus=4,nodeid=4, -numa node,cpus=5,nodeid=5, \ -numa dist,src=0,dst=1,val=20, -numa dist,src=0,dst=2,val=30, \ -numa dist,src=0,dst=3,val=40, -numa dist,src=0,dst=4,val=30, \ -numa dist,src=0,dst=5,val=20, \ -numa dist,src=1,dst=2,val=20, -numa dist,src=1,dst=3,val=30, \ -numa dist,src=1,dst=4,val=20, -numa dist,src=1,dst=5,val=30, \ -numa dist,src=2,dst=3,val=20, -numa dist,src=2,dst=4,val=30, \ -numa dist,src=2,dst=5,val=40, \ -numa dist,src=3,dst=4,val=20, -numa dist,src=3,dst=5,val=30, \ -numa dist,src=4,dst=5,val=20 Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/lkml/jhjtux5edo2.mognet@arm.com Signed-off-by: Jialin Zhang Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/sched/topology.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dd7770226086..796f6c62e59a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -674,6 +674,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + int numa_distance = 0; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -705,6 +706,38 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + for (tmp = sd; tmp; tmp = tmp->parent) + numa_distance += !!(tmp->flags & SD_NUMA); + + /* + * FIXME: Diameter >=3 is misrepresented. + * + * Smallest diameter=3 topology is: + * + * node 0 1 2 3 + * 0: 10 20 30 40 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 40 30 20 10 + * + * 0 --- 1 --- 2 --- 3 + * + * NUMA-3 0-3 N/A N/A 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-2 0-2 0-3 0-3 1-3 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} + * + * NUMA-1 0-1 0-2 1-3 2-3 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} + * + * NUMA-0 0 1 2 3 + * + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the + * group span isn't a subset of the domain span. 
+ */ + WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n"); + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); -- Gitee From 55aaafa474e541bd879834cd2128d73a99d5f1ba Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 15 Nov 2021 19:35:50 +0800 Subject: [PATCH 030/134] sched/topology: fix the issue groups don't span domain->span for NUMA diameter > 2 mainline inclusion from mainline-v5.13-rc1 commit 585b6d2723dc927ebc4ad884c4e879e4da8bc21f category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=585b6d2723dc927ebc4ad884c4e879e4da8bc21f Signed-off-by: Yu Changchun ---------------------------------------------------------- As long as NUMA diameter > 2, building sched_domain by sibling's child domain will definitely create a sched_domain with sched_group which will span out of the sched_domain: +------+ +------+ +-------+ +------+ | node | 12 |node | 20 | node | 12 |node | | 0 +---------+1 +--------+ 2 +-------+3 | +------+ +------+ +-------+ +------+ domain0 node0 node1 node2 node3 domain1 node0+1 node0+1 node2+3 node2+3 + domain2 node0+1+2 | group: node0+1 | group:node2+3 <-------------------+ when node2 is added into the domain2 of node0, the kernel uses the child domain of node2's domain2, which is domain1(node2+3). Node 3 is outside the span of the domain including node0+1+2. This will make load_balance() run based on screwed avg_load and group_type in the sched_group spanning out of the sched_domain, and it also makes select_task_rq_fair() pick an idle CPU outside the sched_domain. Real servers which suffer from this problem include Kunpeng920 and 8-node Sun Fire X4600-M2, at least. Here we move to use the *child* domain of the *child* domain of node2's domain2 as the newly added sched_group. At the same time, we re-use the lower level sgc directly. +------+ +------+ +-------+ +------+ | node | 12 |node | 20 | node | 12 |node | | 0 +---------+1 +--------+ 2 +-------+3 | +------+ +------+ +-------+ +------+ domain0 node0 node1 +- node2 node3 | domain1 node0+1 node0+1 | node2+3 node2+3 | domain2 node0+1+2 | group: node0+1 | group:node2 <-------------------+ While the lower level sgc is re-used, this patch only changes the remote sched_groups for those sched_domains playing grandchild trick, therefore, sgc->next_update is still safe since it's only touched by CPUs that have the group span as local group. And sgc->imbalance is also safe because sd_parent remains the same in load_balance and LB only tries other CPUs from the local group. Moreover, since local groups are not touched, they are still getting roughly equal size in a TL. And should_we_balance() only matters with local groups, so the pull probability of those groups is still roughly equal.
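The descent to the proper descendant can be pictured with plain bitmasks. A rough standalone sketch of the idea (simplified: it omits the kernel's extra walk past degenerate equal-span domains), using node2's hierarchy and node0's domain2 from the diagram above:

    #include <stdio.h>

    struct domain {
        unsigned int span;      /* one bit per node */
        struct domain *child;
    };

    /* Walk down until the child's span no longer leaks out of sd's span. */
    static struct domain *descend(struct domain *sd, struct domain *sibling)
    {
        while (sibling->child && (sibling->child->span & ~sd->span))
            sibling = sibling->child;
        return sibling;
    }

    int main(void)
    {
        /* node2's hierarchy: domain0 {node2}, domain1 {node2+3} */
        struct domain n2_d0 = { 0x4, NULL };
        struct domain n2_d1 = { 0xc, &n2_d0 };
        struct domain n2_d2 = { 0xe, &n2_d1 };  /* node2's domain2; its own span is not examined */
        /* node0's domain2 being built spans node0+1+2 */
        struct domain sd = { 0x7, NULL };

        /* the group is then built from the returned domain's child: {node2} */
        printf("group span = %#x\n", descend(&sd, &n2_d2)->child->span);
        return 0;
    }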
Tested by the below topology: qemu-system-aarch64 -M virt -nographic \ -smp cpus=8 \ -numa node,cpus=0-1,nodeid=0 \ -numa node,cpus=2-3,nodeid=1 \ -numa node,cpus=4-5,nodeid=2 \ -numa node,cpus=6-7,nodeid=3 \ -numa dist,src=0,dst=1,val=12 \ -numa dist,src=0,dst=2,val=20 \ -numa dist,src=0,dst=3,val=22 \ -numa dist,src=1,dst=2,val=22 \ -numa dist,src=2,dst=3,val=12 \ -numa dist,src=1,dst=3,val=24 \ -m 4G -cpu cortex-a57 -kernel arch/arm64/boot/Image w/o patch, we get lots of "groups don't span domain->span": [ 0.802139] CPU0 attaching sched-domain(s): [ 0.802193] domain-0: span=0-1 level=MC [ 0.802443] groups: 0:{ span=0 cap=1013 }, 1:{ span=1 cap=979 } [ 0.802693] domain-1: span=0-3 level=NUMA [ 0.802731] groups: 0:{ span=0-1 cap=1992 }, 2:{ span=2-3 cap=1943 } [ 0.802811] domain-2: span=0-5 level=NUMA [ 0.802829] groups: 0:{ span=0-3 cap=3935 }, 4:{ span=4-7 cap=3937 } [ 0.802881] ERROR: groups don't span domain->span [ 0.803058] domain-3: span=0-7 level=NUMA [ 0.803080] groups: 0:{ span=0-5 mask=0-1 cap=5843 }, 6:{ span=4-7 mask=6-7 cap=4077 } [ 0.804055] CPU1 attaching sched-domain(s): [ 0.804072] domain-0: span=0-1 level=MC [ 0.804096] groups: 1:{ span=1 cap=979 }, 0:{ span=0 cap=1013 } [ 0.804152] domain-1: span=0-3 level=NUMA [ 0.804170] groups: 0:{ span=0-1 cap=1992 }, 2:{ span=2-3 cap=1943 } [ 0.804219] domain-2: span=0-5 level=NUMA [ 0.804236] groups: 0:{ span=0-3 cap=3935 }, 4:{ span=4-7 cap=3937 } [ 0.804302] ERROR: groups don't span domain->span [ 0.804520] domain-3: span=0-7 level=NUMA [ 0.804546] groups: 0:{ span=0-5 mask=0-1 cap=5843 }, 6:{ span=4-7 mask=6-7 cap=4077 } [ 0.804677] CPU2 attaching sched-domain(s): [ 0.804687] domain-0: span=2-3 level=MC [ 0.804705] groups: 2:{ span=2 cap=934 }, 3:{ span=3 cap=1009 } [ 0.804754] domain-1: span=0-3 level=NUMA [ 0.804772] groups: 2:{ span=2-3 cap=1943 }, 0:{ span=0-1 cap=1992 } [ 0.804820] domain-2: span=0-5 level=NUMA [ 0.804836] groups: 2:{ span=0-3 mask=2-3 cap=3991 }, 4:{ span=0-1,4-7 mask=4-5 cap=5985 } [ 0.804944] ERROR: groups don't span domain->span [ 0.805108] domain-3: span=0-7 level=NUMA [ 0.805134] groups: 2:{ span=0-5 mask=2-3 cap=5899 }, 6:{ span=0-1,4-7 mask=6-7 cap=6125 } [ 0.805223] CPU3 attaching sched-domain(s): [ 0.805232] domain-0: span=2-3 level=MC [ 0.805249] groups: 3:{ span=3 cap=1009 }, 2:{ span=2 cap=934 } [ 0.805319] domain-1: span=0-3 level=NUMA [ 0.805336] groups: 2:{ span=2-3 cap=1943 }, 0:{ span=0-1 cap=1992 } [ 0.805383] domain-2: span=0-5 level=NUMA [ 0.805399] groups: 2:{ span=0-3 mask=2-3 cap=3991 }, 4:{ span=0-1,4-7 mask=4-5 cap=5985 } [ 0.805458] ERROR: groups don't span domain->span [ 0.805605] domain-3: span=0-7 level=NUMA [ 0.805626] groups: 2:{ span=0-5 mask=2-3 cap=5899 }, 6:{ span=0-1,4-7 mask=6-7 cap=6125 } [ 0.805712] CPU4 attaching sched-domain(s): [ 0.805721] domain-0: span=4-5 level=MC [ 0.805738] groups: 4:{ span=4 cap=984 }, 5:{ span=5 cap=924 } [ 0.805787] domain-1: span=4-7 level=NUMA [ 0.805803] groups: 4:{ span=4-5 cap=1908 }, 6:{ span=6-7 cap=2029 } [ 0.805851] domain-2: span=0-1,4-7 level=NUMA [ 0.805867] groups: 4:{ span=4-7 cap=3937 }, 0:{ span=0-3 cap=3935 } [ 0.805915] ERROR: groups don't span domain->span [ 0.806108] domain-3: span=0-7 level=NUMA [ 0.806130] groups: 4:{ span=0-1,4-7 mask=4-5 cap=5985 }, 2:{ span=0-3 mask=2-3 cap=3991 } [ 0.806214] CPU5 attaching sched-domain(s): [ 0.806222] domain-0: span=4-5 level=MC [ 0.806240] groups: 5:{ span=5 cap=924 }, 4:{ span=4 cap=984 } [ 0.806841] domain-1: span=4-7 level=NUMA [ 0.806866] groups: 4:{ span=4-5 cap=1908 }, 6:{ 
span=6-7 cap=2029 } [ 0.806934] domain-2: span=0-1,4-7 level=NUMA [ 0.806953] groups: 4:{ span=4-7 cap=3937 }, 0:{ span=0-3 cap=3935 } [ 0.807004] ERROR: groups don't span domain->span [ 0.807312] domain-3: span=0-7 level=NUMA [ 0.807386] groups: 4:{ span=0-1,4-7 mask=4-5 cap=5985 }, 2:{ span=0-3 mask=2-3 cap=3991 } [ 0.807686] CPU6 attaching sched-domain(s): [ 0.807710] domain-0: span=6-7 level=MC [ 0.807750] groups: 6:{ span=6 cap=1017 }, 7:{ span=7 cap=1012 } [ 0.807840] domain-1: span=4-7 level=NUMA [ 0.807870] groups: 6:{ span=6-7 cap=2029 }, 4:{ span=4-5 cap=1908 } [ 0.807952] domain-2: span=0-1,4-7 level=NUMA [ 0.807985] groups: 6:{ span=4-7 mask=6-7 cap=4077 }, 0:{ span=0-5 mask=0-1 cap=5843 } [ 0.808045] ERROR: groups don't span domain->span [ 0.808257] domain-3: span=0-7 level=NUMA [ 0.808571] groups: 6:{ span=0-1,4-7 mask=6-7 cap=6125 }, 2:{ span=0-5 mask=2-3 cap=5899 } [ 0.808848] CPU7 attaching sched-domain(s): [ 0.808860] domain-0: span=6-7 level=MC [ 0.808880] groups: 7:{ span=7 cap=1012 }, 6:{ span=6 cap=1017 } [ 0.808953] domain-1: span=4-7 level=NUMA [ 0.808974] groups: 6:{ span=6-7 cap=2029 }, 4:{ span=4-5 cap=1908 } [ 0.809034] domain-2: span=0-1,4-7 level=NUMA [ 0.809055] groups: 6:{ span=4-7 mask=6-7 cap=4077 }, 0:{ span=0-5 mask=0-1 cap=5843 } [ 0.809128] ERROR: groups don't span domain->span [ 0.810361] domain-3: span=0-7 level=NUMA [ 0.810400] groups: 6:{ span=0-1,4-7 mask=6-7 cap=5961 }, 2:{ span=0-5 mask=2-3 cap=5903 } w/ patch, we don't get "groups don't span domain->span" any more: [ 1.486271] CPU0 attaching sched-domain(s): [ 1.486820] domain-0: span=0-1 level=MC [ 1.500924] groups: 0:{ span=0 cap=980 }, 1:{ span=1 cap=994 } [ 1.515717] domain-1: span=0-3 level=NUMA [ 1.515903] groups: 0:{ span=0-1 cap=1974 }, 2:{ span=2-3 cap=1989 } [ 1.516989] domain-2: span=0-5 level=NUMA [ 1.517124] groups: 0:{ span=0-3 cap=3963 }, 4:{ span=4-5 cap=1949 } [ 1.517369] domain-3: span=0-7 level=NUMA [ 1.517423] groups: 0:{ span=0-5 mask=0-1 cap=5912 }, 6:{ span=4-7 mask=6-7 cap=4054 } [ 1.520027] CPU1 attaching sched-domain(s): [ 1.520097] domain-0: span=0-1 level=MC [ 1.520184] groups: 1:{ span=1 cap=994 }, 0:{ span=0 cap=980 } [ 1.520429] domain-1: span=0-3 level=NUMA [ 1.520487] groups: 0:{ span=0-1 cap=1974 }, 2:{ span=2-3 cap=1989 } [ 1.520687] domain-2: span=0-5 level=NUMA [ 1.520744] groups: 0:{ span=0-3 cap=3963 }, 4:{ span=4-5 cap=1949 } [ 1.520948] domain-3: span=0-7 level=NUMA [ 1.521038] groups: 0:{ span=0-5 mask=0-1 cap=5912 }, 6:{ span=4-7 mask=6-7 cap=4054 } [ 1.522068] CPU2 attaching sched-domain(s): [ 1.522348] domain-0: span=2-3 level=MC [ 1.522606] groups: 2:{ span=2 cap=1003 }, 3:{ span=3 cap=986 } [ 1.522832] domain-1: span=0-3 level=NUMA [ 1.522885] groups: 2:{ span=2-3 cap=1989 }, 0:{ span=0-1 cap=1974 } [ 1.523043] domain-2: span=0-5 level=NUMA [ 1.523092] groups: 2:{ span=0-3 mask=2-3 cap=4037 }, 4:{ span=4-5 cap=1949 } [ 1.523302] domain-3: span=0-7 level=NUMA [ 1.523352] groups: 2:{ span=0-5 mask=2-3 cap=5986 }, 6:{ span=0-1,4-7 mask=6-7 cap=6102 } [ 1.523748] CPU3 attaching sched-domain(s): [ 1.523774] domain-0: span=2-3 level=MC [ 1.523825] groups: 3:{ span=3 cap=986 }, 2:{ span=2 cap=1003 } [ 1.524009] domain-1: span=0-3 level=NUMA [ 1.524086] groups: 2:{ span=2-3 cap=1989 }, 0:{ span=0-1 cap=1974 } [ 1.524281] domain-2: span=0-5 level=NUMA [ 1.524331] groups: 2:{ span=0-3 mask=2-3 cap=4037 }, 4:{ span=4-5 cap=1949 } [ 1.524534] domain-3: span=0-7 level=NUMA [ 1.524586] groups: 2:{ span=0-5 mask=2-3 cap=5986 }, 6:{ span=0-1,4-7 mask=6-7 cap=6102 
} [ 1.524847] CPU4 attaching sched-domain(s): [ 1.524873] domain-0: span=4-5 level=MC [ 1.524954] groups: 4:{ span=4 cap=958 }, 5:{ span=5 cap=991 } [ 1.525105] domain-1: span=4-7 level=NUMA [ 1.525153] groups: 4:{ span=4-5 cap=1949 }, 6:{ span=6-7 cap=2006 } [ 1.525368] domain-2: span=0-1,4-7 level=NUMA [ 1.525428] groups: 4:{ span=4-7 cap=3955 }, 0:{ span=0-1 cap=1974 } [ 1.532726] domain-3: span=0-7 level=NUMA [ 1.532811] groups: 4:{ span=0-1,4-7 mask=4-5 cap=6003 }, 2:{ span=0-3 mask=2-3 cap=4037 } [ 1.534125] CPU5 attaching sched-domain(s): [ 1.534159] domain-0: span=4-5 level=MC [ 1.534303] groups: 5:{ span=5 cap=991 }, 4:{ span=4 cap=958 } [ 1.534490] domain-1: span=4-7 level=NUMA [ 1.534572] groups: 4:{ span=4-5 cap=1949 }, 6:{ span=6-7 cap=2006 } [ 1.534734] domain-2: span=0-1,4-7 level=NUMA [ 1.534783] groups: 4:{ span=4-7 cap=3955 }, 0:{ span=0-1 cap=1974 } [ 1.536057] domain-3: span=0-7 level=NUMA [ 1.536430] groups: 4:{ span=0-1,4-7 mask=4-5 cap=6003 }, 2:{ span=0-3 mask=2-3 cap=3896 } [ 1.536815] CPU6 attaching sched-domain(s): [ 1.536846] domain-0: span=6-7 level=MC [ 1.536934] groups: 6:{ span=6 cap=1005 }, 7:{ span=7 cap=1001 } [ 1.537144] domain-1: span=4-7 level=NUMA [ 1.537262] groups: 6:{ span=6-7 cap=2006 }, 4:{ span=4-5 cap=1949 } [ 1.537553] domain-2: span=0-1,4-7 level=NUMA [ 1.537613] groups: 6:{ span=4-7 mask=6-7 cap=4054 }, 0:{ span=0-1 cap=1805 } [ 1.537872] domain-3: span=0-7 level=NUMA [ 1.537998] groups: 6:{ span=0-1,4-7 mask=6-7 cap=6102 }, 2:{ span=0-5 mask=2-3 cap=5845 } [ 1.538448] CPU7 attaching sched-domain(s): [ 1.538505] domain-0: span=6-7 level=MC [ 1.538586] groups: 7:{ span=7 cap=1001 }, 6:{ span=6 cap=1005 } [ 1.538746] domain-1: span=4-7 level=NUMA [ 1.538798] groups: 6:{ span=6-7 cap=2006 }, 4:{ span=4-5 cap=1949 } [ 1.539048] domain-2: span=0-1,4-7 level=NUMA [ 1.539111] groups: 6:{ span=4-7 mask=6-7 cap=4054 }, 0:{ span=0-1 cap=1805 } [ 1.539571] domain-3: span=0-7 level=NUMA [ 1.539610] groups: 6:{ span=0-1,4-7 mask=6-7 cap=6102 }, 2:{ span=0-5 mask=2-3 cap=5845 } Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Tested-by: Meelis Roos Link: https://lkml.kernel.org/r/20210224030944.15232-1-song.bao.hua@hisilicon.com Signed-off-by: Jialin Zhang Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/sched/topology.c | 91 +++++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 30 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 796f6c62e59a..afe95ede9861 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -709,35 +709,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) for (tmp = sd; tmp; tmp = tmp->parent) numa_distance += !!(tmp->flags & SD_NUMA); - /* - * FIXME: Diameter >=3 is misrepresented. - * - * Smallest diameter=3 topology is: - * - * node 0 1 2 3 - * 0: 10 20 30 40 - * 1: 20 10 20 30 - * 2: 30 20 10 20 - * 3: 40 30 20 10 - * - * 0 --- 1 --- 2 --- 3 - * - * NUMA-3 0-3 N/A N/A 0-3 - * groups: {0-2},{1-3} {1-3},{0-2} - * - * NUMA-2 0-2 0-3 0-3 1-3 - * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} - * - * NUMA-1 0-1 0-2 1-3 2-3 - * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} - * - * NUMA-0 0 1 2 3 - * - * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the - * group span isn't a subset of the domain span. 
- */ - WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n"); - sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); @@ -968,6 +939,31 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } +static struct sched_domain * +find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) +{ + /* + * The proper descendant would be the one whose child won't span out + * of sd + */ + while (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), + sched_domain_span(sd))) + sibling = sibling->child; + + /* + * As we are referencing sgc across different topology level, we need + * to go down to skip those sched_domains which don't contribute to + * scheduling because they will be degenerated in cpu_attach_domain + */ + while (sibling->child && + cpumask_equal(sched_domain_span(sibling->child), + sched_domain_span(sibling))) + sibling = sibling->child; + + return sibling; +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -1001,6 +997,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; + /* + * Usually we build sched_group by sibling's child sched_domain + * But for machines whose NUMA diameter are 3 or above, we move + * to build sched_group by sibling's proper descendant's child + * domain because sibling's child sched_domain will span out of + * the sched_domain being built as below. + * + * Smallest diameter=3 topology is: + * + * node 0 1 2 3 + * 0: 10 20 30 40 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 40 30 20 10 + * + * 0 --- 1 --- 2 --- 3 + * + * NUMA-3 0-3 N/A N/A 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-2 0-2 0-3 0-3 1-3 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} + * + * NUMA-1 0-1 0-2 1-3 2-3 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} + * + * NUMA-0 0 1 2 3 + * + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the + * group span isn't a subset of the domain span. + */ + if (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), span)) + sibling = find_descended_sibling(sd, sibling); + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; @@ -1008,7 +1039,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); - init_overlap_sched_group(sd, sg); + init_overlap_sched_group(sibling, sg); if (!first) first = sg; -- Gitee From f0fb7c6695a9bd98aa6a08fd0269206472c0aa0d Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Nov 2021 19:35:51 +0800 Subject: [PATCH 031/134] sched/topology: Make sched_init_numa() use a set for the deduplicating sort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.12-rc1 commit 620a6dc40754dc218f5b6389b5d335e9a107fd29 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=620a6dc40754dc218f5b6389b5d335e9a107fd29 Signed-off-by: Yu Changchun ---------------------------------------------------------- The deduplicating sort in sched_init_numa() assumes that the first line in the distance table contains all unique values in the entire table. I've been trying to pen what this exactly means for the topology, but it's not straightforward. 
For instance, topology.c uses this example: node 0 1 2 3 0: 10 20 20 30 1: 20 10 20 20 2: 20 20 10 20 3: 30 20 20 10 0 ----- 1 | / | | / | | / | 2 ----- 3 Which works out just fine. However, if we swap nodes 0 and 1: 1 ----- 0 | / | | / | | / | 2 ----- 3 we get this distance table: node 0 1 2 3 0: 10 20 20 20 1: 20 10 20 30 2: 20 20 10 20 3: 20 30 20 10 Which breaks the deduplicating sort (non-representative first line). In this case this would just be a renumbering exercise, but it so happens that we can have a deduplicating sort that goes through the whole table in O(n²) at the extra cost of a temporary memory allocation (i.e. any form of set). The ACPI spec (SLIT) mentions distances are encoded on 8 bits. Following this, implement the set as a 256-bits bitmap. Should this not be satisfactory (i.e. we want to support 32-bit values), then we'll have to go for some other sparse set implementation. This has the added benefit of letting us allocate just the right amount of memory for sched_domains_numa_distance[], rather than an arbitrary (nr_node_ids + 1). Note: DT binding equivalent (distance-map) decodes distances as 32-bit values. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210122123943.1217-2-valentin.schneider@arm.com Signed-off-by: Jialin Zhang Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/topology.h | 1 + kernel/sched/topology.c | 98 +++++++++++++++++++--------------------- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/include/linux/topology.h b/include/linux/topology.h index ad03df1cc266..7634cd737061 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -48,6 +48,7 @@ int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 +#define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index afe95ede9861..84cf39842efe 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1613,66 +1613,57 @@ static void init_numa_topology_type(void) } } +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) + void sched_init_numa(void) { - int next_distance, curr_distance = node_distance(0, 0); struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* Includes NUMA identity node at level 0. */ - sched_domains_numa_distance[level++] = curr_distance; - sched_domains_numa_levels = level; + unsigned long *distance_map; + int nr_levels = 0; + int i, j; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. 
*/ - next_distance = curr_distance; + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); + if (!distance_map) + return; + + bitmap_zero(distance_map, NR_DISTANCE_VALUES); for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + int distance = node_distance(i, j); - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); - - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { + sched_numa_warn("Invalid distance value range"); + return; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + bitmap_set(distance_map, distance, 1); } + } + /* + * We can now figure out how many unique distance values there are and + * allocate memory accordingly. + */ + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!sched_domains_numa_distance) { + bitmap_free(distance_map); + return; + } + + for (i = 0, j = 0; i < nr_levels; i++, j++) { + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); + sched_domains_numa_distance[i] = j; } + bitmap_free(distance_map); + /* - * 'level' contains the number of unique distances + * 'nr_levels' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1681,15 +1672,15 @@ void sched_init_numa(void) /* * Here, we should temporarily reset sched_domains_numa_levels to 0. * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be + * the array will contain less then 'nr_levels' members. This could be * dangerous when we use it to iterate array sched_domains_numa_masks[][] * in other functions. * - * We reset it to 'level' at the end of this function. + * We reset it to 'nr_levels' at the end of this function. */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -1697,7 +1688,7 @@ void sched_init_numa(void) * Now for each level, construct a mask per node which contains all * CPUs of nodes that are that many hops away from us. 
*/ - for (i = 0; i < level; i++) { + for (i = 0; i < nr_levels; i++) { sched_domains_numa_masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); if (!sched_domains_numa_masks[i]) @@ -1705,12 +1696,17 @@ void sched_init_numa(void) for (j = 0; j < nr_node_ids; j++) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + int k; + if (!mask) return; sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + sched_numa_warn("Node-distance not symmetric"); + if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -1722,7 +1718,7 @@ void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level + 1) * + tl = kzalloc((i + nr_levels) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1745,7 +1741,7 @@ void sched_init_numa(void) /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 1; j < level; i++, j++) { + for (j = 1; j < nr_levels; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1757,8 +1753,8 @@ void sched_init_numa(void) sched_domain_topology = tl; - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + sched_domains_numa_levels = nr_levels; + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); } -- Gitee From e29a80f77a3fbcb052939eae904ed48ca0a786e7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Nov 2021 19:35:52 +0800 Subject: [PATCH 032/134] ia64: ensure proper NUMA distance and possible map initialization mainline inclusion from mainline-v5.13-rc1 commit b22a8f7b4bde4e4ab73b64908ffd5d90ecdcdbfd category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b22a8f7b4bde4e4ab73b64908ffd5d90ecdcdbfd Signed-off-by: Yu Changchun ---------------------------------------------------------- John Paul reported a warning about bogus NUMA distance values spurred by commit: 620a6dc40754 ("sched/topology: Make sched_init_numa() use a set for the deduplicating sort") In this case, the afflicted machine comes up with a reported 256 possible nodes, all of which are 0 distance away from one another. This was previously silently ignored, but is now caught by the aforementioned commit. The culprit is ia64's node_possible_map which remains unchanged from its initialization value of NODE_MASK_ALL. In John's case, the machine doesn't have any SRAT nor SLIT table, but AIUI the possible map remains untouched regardless of what ACPI tables end up being parsed. Thus, !online && possible nodes remain with a bogus distance of 0 (distances \in [0, 9] are "reserved and have no meaning" as per the ACPI spec). Follow x86 / drivers/base/arch_numa's example and set the possible map to the parsed map, which in this case seems to be the online map. 
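To make the interaction concrete: a standalone sketch (illustration only) of the O(n²) bitmap scan introduced by 620a6dc40754, run here on the swapped-node table from the earlier example; an ia64-style bogus distance of 0 would be caught by the same range check:

    #include <stdio.h>

    #define LOCAL_DISTANCE      10
    #define DISTANCE_BITS       8
    #define NR_DISTANCE_VALUES  (1 << DISTANCE_BITS)
    #define N                   4

    /* the "swapped nodes 0 and 1" table whose first row hides distance 30 */
    static const int dist[N][N] = {
        { 10, 20, 20, 20 },
        { 20, 10, 20, 30 },
        { 20, 20, 10, 20 },
        { 20, 30, 20, 10 },
    };

    int main(void)
    {
        unsigned char map[NR_DISTANCE_VALUES / 8] = { 0 };
        int i, j, nr_levels = 0;

        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                int d = dist[i][j];

                /* same range check that flags the bogus ia64 distance 0 */
                if (d < LOCAL_DISTANCE || d >= NR_DISTANCE_VALUES) {
                    printf("invalid distance %d\n", d);
                    return 1;
                }
                map[d / 8] |= 1 << (d % 8);
            }
        }
        /* whole-table scan finds 10, 20 and 30; a first-row-only sort would miss 30 */
        for (i = 0; i < NR_DISTANCE_VALUES; i++)
            if (map[i / 8] & (1 << (i % 8)))
                printf("level %d: distance %d\n", nr_levels++, i);
        return 0;
    }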
Link: http://lore.kernel.org/r/255d6b5d-194e-eb0e-ecdd-97477a534441@physik.fu-berlin.de Link: https://lkml.kernel.org/r/20210318130617.896309-1-valentin.schneider@arm.com Fixes: 620a6dc40754 ("sched/topology: Make sched_init_numa() use a set for the deduplicating sort") Signed-off-by: Valentin Schneider Reported-by: John Paul Adrian Glaubitz Tested-by: John Paul Adrian Glaubitz Tested-by: Sergei Trofimovich Cc: "Peter Zijlstra (Intel)" Cc: Ingo Molnar Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Anatoly Pugachev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Jialin Zhang Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- arch/ia64/kernel/acpi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index a5636524af76..e2af6b172200 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -446,7 +446,8 @@ void __init acpi_numa_fixup(void) if (srat_num_cpus == 0) { node_set_online(0); node_cpuid[0].phys_id = hard_smp_processor_id(); - return; + slit_distance(0, 0) = LOCAL_DISTANCE; + goto out; } /* @@ -489,7 +490,7 @@ void __init acpi_numa_fixup(void) for (j = 0; j < MAX_NUMNODES; j++) slit_distance(i, j) = i == j ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return; + goto out; } memset(numa_slit, -1, sizeof(numa_slit)); @@ -514,6 +515,8 @@ void __init acpi_numa_fixup(void) printk("\n"); } #endif +out: + node_possible_map = node_online_map; } #endif /* CONFIG_ACPI_NUMA */ -- Gitee From 501cfcd826bc0979d940d35886ff3031f9b6656e Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Mon, 15 Nov 2021 19:35:53 +0800 Subject: [PATCH 033/134] block: fix memory leak for mq shared sbitmap maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- If the blk_mq_sched_alloc_tags() -> blk_mq_alloc_rqs() call fails, then we call blk_mq_free_rq_map(). But if BLK_MQ_F_TAG_HCTX_SHARED has been set in hctx->flags, tags->bitmap_tags and tags->breserved_tags will not be released, causing a memory leak. Fixes: 31044c2e5b8de ("blk-mq-sched: Fix blk_mq_sched_alloc_tags() error handling") Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-mq-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 3fea30f0ea27..a3266541bd06 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -522,7 +522,7 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); if (ret) { - blk_mq_free_rq_map(hctx->sched_tags, set->flags); + blk_mq_free_rq_map(hctx->sched_tags, flags); hctx->sched_tags = NULL; } -- Gitee From e30e422336856649e063bec70b16edae0984c342 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 15 Nov 2021 19:35:55 +0800 Subject: [PATCH 034/134] fs: add vfs_parse_fs_param_source() helper mainline inclusion from mainline-5.14-rc2 commit d1d488d813703618f0dd93f0e4c4a05928114aa8 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d1d488d813703618f0dd93f0e4c4a05928114aa8 Signed-off-by: Yu Changchun --------------------------- Add a simple helper that filesystems can use in their parameter parser to parse the "source" parameter.
A few places open-coded this function and that already caused a bug in the cgroup v1 parser that we fixed. Let's make it harder to get this wrong by introducing a helper which performs all necessary checks. Link: https://syzkaller.appspot.com/bug?id=6312526aba5beae046fdae8f00399f87aab48b12 Cc: Christoph Hellwig Cc: Alexander Viro Cc: Dmitry Vyukov Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds Conflicts: include/linux/fs_context.h Signed-off-by: yangerkun Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/fs_context.c | 54 +++++++++++++++++++++++++------------- include/linux/fs_context.h | 2 ++ kernel/cgroup/cgroup-v1.c | 14 ++++------ 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index 4858645ca620..b7e43a780a62 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -79,6 +79,35 @@ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) return -ENOPARAM; } +/** + * vfs_parse_fs_param_source - Handle setting "source" via parameter + * @fc: The filesystem context to modify + * @param: The parameter + * + * This is a simple helper for filesystems to verify that the "source" they + * accept is sane. + * + * Returns 0 on success, -ENOPARAM if this is not "source" parameter, and + * -EINVAL otherwise. In the event of failure, supplementary error information + * is logged. + */ +int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param) +{ + if (strcmp(param->key, "source") != 0) + return -ENOPARAM; + + if (param->type != fs_value_is_string) + return invalf(fc, "Non-string source"); + + if (fc->source) + return invalf(fc, "Multiple sources"); + + fc->source = param->string; + param->string = NULL; + return 0; +} +EXPORT_SYMBOL(vfs_parse_fs_param_source); + /** * vfs_parse_fs_param - Add a single parameter to a superblock config * @fc: The filesystem context to modify @@ -122,15 +151,9 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) /* If the filesystem doesn't take any arguments, give it the * default handling of source. 
*/ - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Legacy: Non-string source"); - if (fc->source) - return invalf(fc, "VFS: Legacy: Multiple sources"); - fc->source = param->string; - param->string = NULL; - return 0; - } + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 5b44b0195a28..6b54982fc5f3 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -139,6 +139,8 @@ extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, extern int generic_parse_monolithic(struct fs_context *fc, void *data); extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); +extern int vfs_parse_fs_param_source(struct fs_context *fc, + struct fs_parameter *param); extern void fc_drop_locked(struct fs_context *fc); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 812dd4ed0129..88b72815aa9d 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -915,15 +915,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "Non-string source"); - if (fc->source) - return invalf(fc, "Multiple sources not supported"); - fc->source = param->string; - param->string = NULL; - return 0; - } + int ret; + + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; -- Gitee From 848a2ff9d89acf7f7bc31e3c8ee418365d4accdc Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 15 Nov 2021 19:35:56 +0800 Subject: [PATCH 035/134] ramfs: fix mount source show for ramfs maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- ramfs_parse_param() does not parse the "source" key and converts -ENOPARAM to 0. This skips vfs_parse_fs_param_source() in vfs_parse_fs_param(), so the mount source for ramfs always shows up as "none". Fix it by parsing "source" in ramfs_parse_param() the way cgroup1_parse_param() does.
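The calling convention that the hunks above (and the ramfs fix below) rely on can be modeled standalone; in this sketch (hypothetical stand-ins, not the kernel API) the helper claims only the "source" key and reports -ENOPARAM for everything else, so the caller can fall through to its own handling:

    #include <stdio.h>
    #include <string.h>

    #define ENOPARAM 1000           /* stand-in for the kernel errno value */

    static const char *source;      /* models fc->source */

    /* Models vfs_parse_fs_param_source(): only ever claims the "source" key. */
    static int parse_source(const char *key, const char *val)
    {
        if (strcmp(key, "source") != 0)
            return -ENOPARAM;
        if (source)
            return -1;              /* "Multiple sources": -EINVAL in the kernel */
        source = val;
        return 0;
    }

    /* Models a parser like ramfs_parse_param() after fs_parse() said -ENOPARAM. */
    static int parse_param(const char *key, const char *val)
    {
        int ret = parse_source(key, val);

        if (ret != -ENOPARAM)
            return ret;             /* handled (0) or rejected (error) */
        return 0;                   /* ramfs ignores genuinely unknown keys */
    }

    int main(void)
    {
        parse_param("source", "mytmpfs");
        printf("mount source = %s\n", source ? source : "none");
        return 0;
    }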
Link: https://lkml.kernel.org/r/20210924091756.1906118-1-yangerkun@huawei.com Signed-off-by: yangerkun Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ramfs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index ee179a81b3da..0163dcff9318 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -193,17 +193,20 @@ static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param) int opt; opt = fs_parse(fc, ramfs_fs_parameters, param, &result); - if (opt < 0) { + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; /* * We might like to report bad mount options here; * but traditionally ramfs has ignored all mount options, * and as it is used as a !CONFIG_SHMEM simple substitute * for tmpfs, better continue to ignore other mount options. */ - if (opt == -ENOPARAM) - opt = 0; - return opt; + return 0; } + if (opt < 0) + return opt; switch (opt) { case Opt_mode: -- Gitee From 5e8716e337144cac62d2f2daa9be8b423c717b03 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 15 Nov 2021 19:36:00 +0800 Subject: [PATCH 036/134] ext4: factor out write end code of inline file mainline inclusion from mainline-5.15-rc4 commit 6984aef59814fb5c47b0e30c56e101186b5ebf8c category: perf issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6984aef59814fb5c47b0e30c56e101186b5ebf8c Signed-off-by: Yu Changchun --------------------------- The write-end procedure for inline_data files is currently folded into the common write-end functions, which makes it hard to follow. Factor it out and do some cleanup. This patch also drops ext4_da_write_inline_data_end() and switches to ext4_write_inline_data_end() instead, because the same error processing is also needed when we fail to write data into an inline entry.
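One rule the factored-out function keeps from the common path is visible in the diff below: a short copy into a page that is not uptodate counts as having copied nothing. A tiny standalone model of just that decision (illustrative, not the ext4 code):

    #include <stdio.h>

    /* A short copy into a non-uptodate page must be treated as copied == 0,
     * since the rest of the page contents are not valid. */
    static unsigned int effective_copied(unsigned int len, unsigned int copied,
                                         int page_uptodate)
    {
        if (copied < len && !page_uptodate)
            return 0;
        return copied;
    }

    int main(void)
    {
        printf("%u\n", effective_copied(4096, 4096, 1));  /* full copy: 4096 */
        printf("%u\n", effective_copied(4096, 100, 0));   /* short + stale: 0 */
        return 0;
    }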
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210716122024.1105856-4-yi.zhang@huawei.com Conflicts: fs/ext4/inline.c Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/ext4.h | 3 -- fs/ext4/inline.c | 128 +++++++++++++++++++++++++---------------------- fs/ext4/inode.c | 73 +++++++++------------------ 3 files changed, 90 insertions(+), 114 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c859ec4544bc..bb3adca53d93 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3437,9 +3437,6 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, unsigned flags, struct page **pagep, void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a96b688a0410..cb42b2245c21 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -729,40 +729,83 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret, no_expand; + handle_t *handle = ext4_journal_current_handle(); + int no_expand; void *kaddr; struct ext4_iloc iloc; + int ret = 0, ret2; if (unlikely(copied < len) && !PageUptodate(page)) - return 0; + copied = 0; - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - return ret; - } + if (likely(copied)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + unlock_page(page); + put_page(page); + ext4_std_error(inode->i_sb, ret); + goto out; + } + ext4_write_lock_xattr(inode, &no_expand); + BUG_ON(!ext4_has_inline_data(inode)); - ext4_write_lock_xattr(inode, &no_expand); - BUG_ON(!ext4_has_inline_data(inode)); + /* + * ei->i_inline_off may have changed since + * ext4_write_begin() called + * ext4_try_to_write_inline_data() + */ + (void) ext4_find_inline_data_nolock(inode); - /* - * ei->i_inline_off may have changed since ext4_write_begin() - * called ext4_try_to_write_inline_data() - */ - (void) ext4_find_inline_data_nolock(inode); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); + ext4_write_unlock_xattr(inode, &no_expand); + brelse(iloc.bh); - ext4_write_unlock_xattr(inode, &no_expand); - brelse(iloc.bh); - mark_inode_dirty(inode); + /* + * It's important to update i_size while still holding page + * lock: page writeout could otherwise come in and zero + * beyond i_size. + */ + ext4_update_inode_size(inode, pos + copied); + } + unlock_page(page); + put_page(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. 
+ */ + if (likely(copied)) + mark_inode_dirty(inode); +out: + /* + * If we didn't copy as much data as expected, we need to trim back + * size of xattr containing inline data. + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); - return copied; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret ? ret : copied; } struct buffer_head * @@ -943,43 +986,6 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, return ret; } -int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - int ret; - - ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - return ret; - } - copied = ret; - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. - */ - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); - unlock_page(page); - put_page(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - mark_inode_dirty(inode); - - return copied; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 16bc65609496..36663a118b6c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1283,23 +1283,14 @@ static int ext4_write_end(struct file *file, loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - ret = 0; - } else - copied = block_write_end(file, mapping, pos, - len, copied, page, fsdata); + + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1320,10 +1311,9 @@ static int ext4_write_end(struct file *file, * ordering of page lock and transaction start for journaling * filesystems. */ - if (i_size_changed || inline_data) + if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); -errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. 
We will have blocks allocated outside @@ -1395,7 +1385,6 @@ static int ext4_journalled_write_end(struct file *file, int partial = 0; unsigned from, to; int size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); @@ -1404,17 +1393,10 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - ret = 0; - } else if (unlikely(copied < len) && !PageUptodate(page)) { + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; ext4_journalled_zero_new_buffers(handle, page, from, to); } else { @@ -1437,13 +1419,12 @@ static int ext4_journalled_write_end(struct file *file, if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - if (size_changed || inline_data) { + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } -errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -3078,7 +3059,7 @@ static int ext4_da_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - int ret = 0, ret2; + int ret; handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; @@ -3089,6 +3070,12 @@ static int ext4_da_write_end(struct file *file, len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; @@ -3108,26 +3095,12 @@ static int ext4_da_write_end(struct file *file, * ext4_da_write_inline_data_end(). */ new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size) { - if (ext4_has_inline_data(inode) || - ext4_da_should_update_i_disksize(page, end)) - ext4_update_i_disksize(inode, new_i_size); - } - - if (write_mode != CONVERT_INLINE_DATA && - ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && - ext4_has_inline_data(inode)) - ret = ext4_da_write_inline_data_end(inode, pos, len, copied, - page); - else - ret = generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - - copied = ret; - ret2 = ext4_journal_stop(handle); - if (unlikely(ret2 && !ret)) - ret = ret2; + if (copied && new_i_size > inode->i_size && + ext4_da_should_update_i_disksize(page, end)) + ext4_update_i_disksize(inode, new_i_size); + copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = ext4_journal_stop(handle); return ret ? 
ret : copied; } -- Gitee From cf86aec227618b09fbfc366f2fdd1f33c4e0e1d1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 15 Nov 2021 19:36:01 +0800 Subject: [PATCH 037/134] ext4: drop unnecessary journal handle in delalloc write mainline inclusion from mainline-5.15-rc4 commit cc883236b79297f6266ca6f4e7f24f3fd3c736c1 category: perf issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cc883236b79297f6266ca6f4e7f24f3fd3c736c1 Signed-off-by: Yu Changchun --------------------------- After we factor out the inline data write procedure from ext4_da_write_end(), we don't need to start a journal handle for either the buffer overwrite case or the append-write case. If we need to update i_disksize, mark_inode_dirty() does start a handle and update the inode buffer. So we can just remove all the journal handle code in the delalloc write procedure. After this patch, we get a lot of performance improvement. Below is the UnixBench comparison, tested on my machine with an 'Intel Xeon Gold 5120' CPU and an NVMe SSD backend. Test cmd: ./Run -c 56 -i 3 fstime fsbuffer fsdisk Before this patch: System Benchmarks Partial Index BASELINE RESULT INDEX File Copy 1024 bufsize 2000 maxblocks 3960.0 422965.0 1068.1 File Copy 256 bufsize 500 maxblocks 1655.0 105077.0 634.9 File Copy 4096 bufsize 8000 maxblocks 5800.0 1429092.0 2464.0 ====== System Benchmarks Index Score (Partial Only) 1186.6 After this patch: System Benchmarks Partial Index BASELINE RESULT INDEX File Copy 1024 bufsize 2000 maxblocks 3960.0 732716.0 1850.3 File Copy 256 bufsize 500 maxblocks 1655.0 184940.0 1117.5 File Copy 4096 bufsize 8000 maxblocks 5800.0 2427152.0 4184.7 ====== System Benchmarks Index Score (Partial Only) 2053.0 Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210716122024.1105856-5-yi.zhang@huawei.com Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 60 +++++-------------------------------------------- 1 file changed, 5 insertions(+), 55 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 36663a118b6c..dd137a1c648f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2916,19 +2916,6 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -/* We always reserve for an inode update; the superblock could be there too */ -static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) -{ - if (likely(ext4_has_feature_large_file(inode->i_sb))) - return 1; - - if (pos + len <= 0x7fffffffULL) - return 1; - - /* We might need to update the superblock to set LARGE_FILE */ - return 2; -} - static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2937,7 +2924,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page *page; pgoff_t index; struct inode *inode = mapping->host; - handle_t *handle; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -2963,41 +2949,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, return 0; } - /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page - * is being written back. So grab it first before we start - * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS.
- */ -retry_grab: +retry: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; - unlock_page(page); - - /* - * With delayed allocation, we don't log the i_disksize update - * if there is delayed block allocation. But we still need - * to journalling the i_disksize update if writes to the end - * of file which has an already mapped buffer. - */ -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_da_write_credits(inode, pos, len)); - if (IS_ERR(handle)) { - put_page(page); - return PTR_ERR(handle); - } - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); - ext4_journal_stop(handle); - goto retry_grab; - } /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); @@ -3009,20 +2965,18 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, #endif if (ret < 0) { unlock_page(page); - ext4_journal_stop(handle); + put_page(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. + * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - - put_page(page); + goto retry; return ret; } @@ -3059,8 +3013,6 @@ static int ext4_da_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - int ret; - handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; @@ -3099,9 +3051,7 @@ static int ext4_da_write_end(struct file *file, ext4_da_should_update_i_disksize(page, end)) ext4_update_i_disksize(inode, new_i_size); - copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - ret = ext4_journal_stop(handle); - return ret ? ret : copied; + return generic_write_end(file, mapping, pos, len, copied, page, fsdata); } /* -- Gitee From 382e8f3d498541f32b002802deb556512c44b77c Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 15 Nov 2021 19:36:03 +0800 Subject: [PATCH 038/134] powerpc/powernv/pci: fix a RCU-list lock mainline inclusion from mainline-v5.12-rc1 commit c9790fb5df461c91d3fff1d864c1acb8baf5ad5c category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c9790fb5df461c91d3fff1d864c1acb8baf5ad5c Signed-off-by: Yu Changchun --------------------------- It is unsafe to traverse tbl->it_group_list without the RCU read lock. WARNING: suspicious RCU usage 5.7.0-rc4-next-20200508 #1 Not tainted ----------------------------- arch/powerpc/platforms/powernv/pci-ioda-tce.c:355 RCU-list traversed in non-reader section!! 
other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by qemu-kvm/4305: #0: c000000bc3fe6988 (&container->group_lock){++++}-{3:3}, at: vfio_fops_unl_ioctl+0x108/0x410 [vfio] #1: c00800000fcc7400 (&vfio.iommu_drivers_lock){+.+.}-{3:3}, at: vfio_fops_unl_ioctl+0x148/0x410 [vfio] #2: c000000bc3fe4d68 (&container->lock){+.+.}-{3:3}, at: tce_iommu_attach_group+0x3c/0x4f0 [vfio_iommu_spapr_tce] stack backtrace: CPU: 4 PID: 4305 Comm: qemu-kvm Not tainted 5.7.0-rc4-next-20200508 #1 Call Trace: [c0000010f29afa60] [c0000000007154c8] dump_stack+0xfc/0x174 (unreliable) [c0000010f29afab0] [c0000000001d8ff0] lockdep_rcu_suspicious+0x140/0x164 [c0000010f29afb30] [c0000000000dae2c] pnv_pci_unlink_table_and_group+0x11c/0x200 [c0000010f29afb70] [c0000000000d4a34] pnv_pci_ioda2_unset_window+0xc4/0x190 [c0000010f29afbf0] [c0000000000d4b4c] pnv_ioda2_take_ownership+0x4c/0xd0 [c0000010f29afc30] [c00800000fd60ee0] tce_iommu_attach_group+0x2c8/0x4f0 [vfio_iommu_spapr_tce] [c0000010f29afcd0] [c00800000fcc11a0] vfio_fops_unl_ioctl+0x238/0x410 [vfio] [c0000010f29afd50] [c0000000005430a8] ksys_ioctl+0xd8/0x130 [c0000010f29afda0] [c000000000543128] sys_ioctl+0x28/0x40 [c0000010f29afdc0] [c000000000038af4] system_call_exception+0x114/0x1e0 [c0000010f29afe20] [c00000000000c8f0] system_call_common+0xf0/0x278 Signed-off-by: Qian Cai Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200510051347.1906-1-cai@lca.pw Signed-off-by: Chen Lifu Reviewed-by: Zhang Jianhua Reviewed-by: He Ying Reviewed-by: Yihang Xu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 5218f5da2737..30551bbd7988 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -380,6 +380,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, /* Remove link to a group from table's list of attached groups */ found = false; + + rcu_read_lock(); list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { if (tgl->table_group == table_group) { list_del_rcu(&tgl->next); @@ -388,6 +390,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, break; } } + rcu_read_unlock(); + if (WARN_ON(!found)) return; -- Gitee From f75e5a6455f0749885ee4cf500c5fa29d5ba50a8 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Mon, 15 Nov 2021 19:36:04 +0800 Subject: [PATCH 039/134] powerpc/numa: Update cpu_cpu_map on CPU online/offline mainline inclusion from mainline-v5.15-rc1 commit 9a245d0e1f006bc7ccf0285d0d520ed304d00c4a category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9a245d0e1f006bc7ccf0285d0d520ed304d00c4a Signed-off-by: Yu Changchun --------------------------- cpu_cpu_map holds all the CPUs in the DIE. However in PowerPC, when onlining/offlining of CPUs, this mask doesn't get updated. This mask is however updated when CPUs are added/removed. So when both operations like online/offline of CPUs and adding/removing of CPUs are done simultaneously, then cpumaps end up broken. 
WARNING: CPU: 13 PID: 1142 at kernel/sched/topology.c:898 build_sched_domains+0xd48/0x1720 Modules linked in: rpadlpar_io rpaphp mptcp_diag xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag bonding tls nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set rfkill nf_tables nfnetlink pseries_rng xts vmx_crypto uio_pdrv_genirq uio binfmt_misc ip_tables xfs libcrc32c dm_service_time sd_mod t10_pi sg ibmvfc scsi_transport_fc ibmveth dm_multipath dm_mirror dm_region_hash dm_log dm_mod fuse CPU: 13 PID: 1142 Comm: kworker/13:2 Not tainted 5.13.0-rc6+ #28 Workqueue: events cpuset_hotplug_workfn NIP: c0000000001caac8 LR: c0000000001caac4 CTR: 00000000007088ec REGS: c00000005596f220 TRAP: 0700 Not tainted (5.13.0-rc6+) MSR: 8000000000029033 CR: 48828222 XER: 00000009 CFAR: c0000000001ea698 IRQMASK: 0 GPR00: c0000000001caac4 c00000005596f4c0 c000000001c4a400 0000000000000036 GPR04: 00000000fffdffff c00000005596f1d0 0000000000000027 c0000018cfd07f90 GPR08: 0000000000000023 0000000000000001 0000000000000027 c0000018fe68ffe8 GPR12: 0000000000008000 c00000001e9d1880 c00000013a047200 0000000000000800 GPR16: c000000001d3c7d0 0000000000000240 0000000000000048 c000000010aacd18 GPR20: 0000000000000001 c000000010aacc18 c00000013a047c00 c000000139ec2400 GPR24: 0000000000000280 c000000139ec2520 c000000136c1b400 c000000001c93060 GPR28: c00000013a047c20 c000000001d3c6c0 c000000001c978a0 000000000000000d NIP [c0000000001caac8] build_sched_domains+0xd48/0x1720 LR [c0000000001caac4] build_sched_domains+0xd44/0x1720 Call Trace: [c00000005596f4c0] [c0000000001caac4] build_sched_domains+0xd44/0x1720 (unreliable) [c00000005596f670] [c0000000001cc5ec] partition_sched_domains_locked+0x3ac/0x4b0 [c00000005596f710] [c0000000002804e4] rebuild_sched_domains_locked+0x404/0x9e0 [c00000005596f810] [c000000000283e60] rebuild_sched_domains+0x40/0x70 [c00000005596f840] [c000000000284124] cpuset_hotplug_workfn+0x294/0xf10 [c00000005596fc60] [c000000000175040] process_one_work+0x290/0x590 [c00000005596fd00] [c0000000001753c8] worker_thread+0x88/0x620 [c00000005596fda0] [c000000000181704] kthread+0x194/0x1a0 [c00000005596fe10] [c00000000000ccec] ret_from_kernel_thread+0x5c/0x70 Instruction dump: 485af049 60000000 2fa30800 409e0028 80fe0000 e89a00f8 e86100e8 38da0120 7f88e378 7ce53b78 4801fb91 60000000 <0fe00000> 39000000 38e00000 38c00000 Fix this by updating cpu_cpu_map aka cpumask_of_node() on every CPU online/offline. 
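As a hedged illustration of the invariant this change restores (a
minimal user-space model with made-up masks and a fixed CPU-to-node
table, not the kernel's cpumask API): the node mask must be updated on
every online/offline event, or a later topology rebuild sees CPUs that
are no longer there.

    #include <stdio.h>

    #define MAX_CPUS 8

    /* bit i set => CPU i is currently online in that node */
    static unsigned int node_mask[2];
    static const int cpu_node[MAX_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };

    static void online_cpu(int cpu)
    {
        node_mask[cpu_node[cpu]] |= 1u << cpu;
    }

    static void offline_cpu(int cpu)
    {
        /* without this update the mask goes stale */
        node_mask[cpu_node[cpu]] &= ~(1u << cpu);
    }

    int main(void)
    {
        online_cpu(4);
        online_cpu(5);
        offline_cpu(5);
        printf("node 1 mask: 0x%x\n", node_mask[1]); /* 0x10: CPU 4 only */
        return 0;
    }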
Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-5-srikar@linux.vnet.ibm.com Signed-off-by: Chen Lifu Reviewed-by: Zhang Jianhua Reviewed-by: Linruizhe Reviewed-by: He Ying Reviewed-by: Yihang Xu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- arch/powerpc/include/asm/topology.h | 13 +++++++++++++ arch/powerpc/kernel/smp.c | 3 +++ arch/powerpc/mm/numa.c | 7 ++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 3beeb030cd78..424635b24b52 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -65,6 +65,11 @@ static inline int early_cpu_to_node(int cpu) int of_drconf_to_nid_single(struct drmem_lmb *lmb); +extern void map_cpu_to_node(int cpu, int node); +#ifdef CONFIG_HOTPLUG_CPU +extern void unmap_cpu_from_node(unsigned long cpu); +#endif /* CONFIG_HOTPLUG_CPU */ + #else static inline int early_cpu_to_node(int cpu) { return 0; } @@ -93,6 +98,14 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) return first_online_node; } + +#ifdef CONFIG_SMP +static inline void map_cpu_to_node(int cpu, int node) {} +#ifdef CONFIG_HOTPLUG_CPU +static inline void unmap_cpu_from_node(unsigned long cpu) {} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ + #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 452cbf98bfd7..b7bed52a5a73 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1300,6 +1300,8 @@ static void remove_cpu_from_masks(int cpu) struct cpumask *(*mask_fn)(int) = cpu_sibling_mask; int i; + unmap_cpu_from_node(cpu); + if (shared_caches) mask_fn = cpu_l2_cache_mask; @@ -1384,6 +1386,7 @@ static void add_cpu_to_masks(int cpu) * This CPU will not be in the online mask yet so we need to manually * add it to it's own thread sibling mask. 
*/ + map_cpu_to_node(cpu, cpu_to_node(cpu)); cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(cpu)); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 094a1076fd1f..23bfe00811ae 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -137,7 +137,7 @@ static void reset_numa_cpu_lookup_table(void) numa_cpu_lookup_table[cpu] = -1; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); @@ -148,7 +148,7 @@ static void map_cpu_to_node(int cpu, int node) } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; @@ -598,9 +598,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu) static int ppc_numa_cpu_dead(unsigned int cpu) { -#ifdef CONFIG_HOTPLUG_CPU - unmap_cpu_from_node(cpu); -#endif return 0; } -- Gitee From c9ed5be0f1a10e6b2c40330f2af4f99b4d286983 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 Nov 2021 19:36:05 +0800 Subject: [PATCH 040/134] mm, vmscan: guarantee drop_slab_node() termination mainline inclusion from mainline-v5.15-rc1 commit 1399af7e54896c774d67f1c1acc491b07149421d category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1399af7e54896c774d67f1c1acc491b07149421d Signed-off-by: Yu Changchun ----------------------------------------------- drop_slab_node() is called as part of echo 2>/proc/sys/vm/drop_caches operation. It iterates over all memcgs and calls shrink_slab() which in turn iterates over all slab shrinkers. Freed objects are counted and as long as the total number of freed objects from all memcgs and shrinkers is higher than 10, drop_slab_node() loops for another full memcgs*shrinkers iteration. This arbitrary constant threshold of 10 can result in effectively an infinite loop on a system with large number of memcgs and/or parallel activity that allocates new objects. This has been reported previously by Chunxin Zang [1] and recently by our customer. The previous report [1] has resulted in commit 069c411de40a ("mm/vmscan: fix infinite loop in drop_slab_node") which added a check for signals allowing the user to terminate the command writing to drop_caches. At the time it was also considered to make the threshold grow with each iteration to guarantee termination, but such patch hasn't been formally proposed yet. This patch implements the dynamically growing threshold. At first iteration it's enough to free one object to continue, and this threshold effectively doubles with each iteration. Our customer's feedback was positive. There is always a risk that this change will result on some system in a previously terminating drop_caches operation to terminate sooner and free fewer objects. Ideally the semantics would guarantee freeing all freeable objects that existed at the moment of starting the operation, while not looping forever for newly allocated objects, but that's not feasible to track. In the less ideal solution based on thresholds, arguably the termination guarantee is more important than the exhaustiveness guarantee. If there are reports of large regression wrt being exhaustive, we can tune how fast the threshold grows. 
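To see why the loop now terminates, here is a minimal user-space model
of the new loop condition (the constant per-pass reclaim rate is an
assumption made purely for illustration):

    #include <stdio.h>

    int main(void)
    {
        unsigned long freed_per_pass = 1000; /* old "> 10" check never ends */
        unsigned long freed;
        int shift = 0, passes = 0;

        do {
            freed = freed_per_pass;
            passes++;
        } while ((freed >> shift++) > 1);

        /* threshold doubles each pass: 1000 >> 9 == 1, so 10 passes */
        printf("terminated after %d passes\n", passes);
        return 0;
    }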
[1] https://lore.kernel.org/lkml/20200909152047.27905-1-zangchunxin@bytedance.com/T/#u

[vbabka@suse.cz: avoid undefined shift behaviour]
  Link: https://lkml.kernel.org/r/2f034e6f-a753-550a-f374-e4e23899d3d5@suse.cz

Link: https://lkml.kernel.org/r/20210818152239.25502-1-vbabka@suse.cz
Signed-off-by: Vlastimil Babka
Reported-by: Chunxin Zang
Cc: Muchun Song
Cc: Chris Down
Cc: Michal Hocko
Cc: Matthew Wilcox
Cc: Vlastimil Babka
Cc: Kefeng Wang
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Liu Shixin
Reviewed-by: Kefeng Wang
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 mm/vmscan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ebaa10d4dcad..9f292132ed88 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -689,6 +689,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 void drop_slab_node(int nid)
 {
 	unsigned long freed;
+	int shift = 0;
 
 	do {
 		struct mem_cgroup *memcg = NULL;
@@ -701,7 +702,7 @@ void drop_slab_node(int nid)
 		do {
 			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
 		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
-	} while (freed > 10);
+	} while ((freed >> shift++) > 1);
 }
 
 void drop_slab(void)

-- Gitee

From e60d685e36f102c405183d2dbac196ba658cfb7f Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Mon, 15 Nov 2021 19:36:06 +0800
Subject: [PATCH 041/134] mm: memcontrol: set the correct memcg swappiness restriction

mainline inclusion
from mainline-5.15-rc1
commit 37bc3cb9bbef86d1ddbbc789e55b588c8a2cac26
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=37bc3cb9bbef86d1ddbbc789e55b588c8a2cac26

Signed-off-by: Yu Changchun

--------------------------------

Commit c843966c556d ("mm: allow swappiness that prefers reclaiming anon
over the file workingset") extended the swappiness range so that swap
can be preferred on some systems. We should also relax the memcg
swappiness restriction accordingly, so that a memcg can be made
swap-preferred as well.
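A sketch of the relaxed bound (a hypothetical user-space mirror of the
check in mem_cgroup_swappiness_write(), not the kernel code itself):

    #include <errno.h>
    #include <stdio.h>

    static int set_swappiness(unsigned long val)
    {
        if (val > 200)      /* was: val > 100 */
            return -EINVAL;
        return 0;
    }

    int main(void)
    {
        printf("150 -> %d\n", set_swappiness(150)); /* 0: now accepted */
        printf("250 -> %d\n", set_swappiness(250)); /* -EINVAL: still rejected */
        return 0;
    }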
Link: https://lkml.kernel.org/r/d77469b90c45c49953ccbc51e54a1d465bc18f70.1627626255.git.baolin.wang@linux.alibaba.com Fixes: c843966c556d ("mm: allow swappiness that prefers reclaiming anon over the file workingset") Signed-off-by: Baolin Wang Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/memcontrol.c Signed-off-by: Lu Jialin Reviewed-by: Kefeng Wang Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00156aebbcad..167169b3907d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4184,7 +4184,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val > 100) + if (val > 200) return -EINVAL; if (css->parent) -- Gitee From 6a07e27e948035e4b75ab802abb26b4c33bfc760 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 3 Dec 2021 18:15:16 +0800 Subject: [PATCH 042/134] hugetlb: before freeing hugetlb page set dtor to appropriate value mainline inclusion from mainline-v5.15-rc1 commit e32d20c0c88b1cd0a44f882c4f0eb2f536363d1b category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e32d20c0c88b1cd0a44f882c4f0eb2f536363d1b Signed-off-by: Yu Changchun ---------------------------------------------------------------------- When removing a hugetlb page from the pool the ref count is set to one (as the free page has no ref count) and compound page destructor is set to NULL_COMPOUND_DTOR. Since a subsequent call to free the hugetlb page will call __free_pages for non-gigantic pages and free_gigantic_page for gigantic pages the destructor is not used. However, consider the following race with code taking a speculative reference on the page: Thread 0 Thread 1 -------- -------- remove_hugetlb_page set_page_refcounted(page); set_compound_page_dtor(page, NULL_COMPOUND_DTOR); get_page_unless_zero(page) __update_and_free_page __free_pages(page, huge_page_order(h)); /* Note that __free_pages() will simply drop the reference to the page. */ put_page(page) __put_compound_page() destroy_compound_page NULL_COMPOUND_DTOR BUG: kernel NULL pointer dereference, address: 0000000000000000 To address this race, set the dtor to the normal compound page dtor for non-gigantic pages. The dtor for gigantic pages does not matter as gigantic pages are changed from a compound page to 'just a group of pages' before freeing. Hence, the destructor is not used. 
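The heart of the fix, as a hedged mock-up (plain user space; the mock
'page' below stands in for the real compound page and its refcount):
if a speculative reference can delay the final free, the destructor
slot must still hold a valid handler when the last reference drops.

    #include <stdio.h>

    struct mock_page {
        int refcount;
        void (*dtor)(void);   /* NULL here is the crash in the changelog */
    };

    static void compound_dtor(void)
    {
        puts("freed via compound dtor");
    }

    static void put_page(struct mock_page *p)
    {
        if (--p->refcount == 0)
            p->dtor();
    }

    int main(void)
    {
        struct mock_page p = { .refcount = 1, .dtor = compound_dtor };

        p.refcount++;    /* racing get_page_unless_zero() */
        put_page(&p);    /* __free_pages() just drops a reference */
        put_page(&p);    /* last ref dropped: dtor must be valid by now */
        return 0;
    }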
Link: https://lkml.kernel.org/r/20210809184832.18342-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Michal Hocko Cc: Oscar Salvador Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Mina Almasry Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Chen Wandun Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai Conflicts: mm/hugetlb.c [use update_and_free_page instead of remove_hugetlb_page] Signed-off-by: Yu Changchun --- mm/hugetlb.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 17679e80fbc4..4ecf9a0622df 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1353,9 +1353,26 @@ static void update_and_free_page(struct hstate *h, struct page *page) } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + /* + * Very subtle + * + * For non-gigantic pages set the destructor to the normal compound + * page dtor. This is needed in case someone takes an additional + * temporary ref to the page, and freeing is delayed until they drop + * their reference. + * + * For gigantic pages set the destructor to the null dtor. This + * destructor will never be called. Before freeing the gigantic + * page destroy_compound_gigantic_page will turn the compound page + * into a simple group of pages. After this the destructor does not + * apply. + * + * This handles the case where more than one ref is held when and + * after update_and_free_page is called. + */ set_page_refcounted(page); if (hstate_is_gigantic(h)) { + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); /* * Temporarily drop the hugetlb_lock, because * we might block in free_gigantic_page(). @@ -1365,6 +1382,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) free_gigantic_page(page, huge_page_order(h)); spin_lock(&hugetlb_lock); } else { + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); __free_pages(page, huge_page_order(h)); } } -- Gitee From 0f7a6061e38f45eb537121453a27d7fe9f428052 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 15 Nov 2021 19:39:01 +0800 Subject: [PATCH 043/134] sched/topology: Fix sched_domain_topology_level alloc in sched_init_numa() mainline inclusion from mainline-v5.12-rc1 commit 71e5f6644fb2f3304fcb310145ded234a37e7cc1 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=71e5f6644fb2f3304fcb310145ded234a37e7cc1 Signed-off-by: Yu Changchun ---------------------------------------------------------- Commit "sched/topology: Make sched_init_numa() use a set for the deduplicating sort" allocates 'i + nr_levels (level)' instead of 'i + nr_levels + 1' sched_domain_topology_level. This led to an Oops (on Arm64 juno with CONFIG_SCHED_DEBUG): sched_init_domains build_sched_domains() __free_domain_allocs() __sdt_free() { ... for_each_sd_topology(tl) ... sd = *per_cpu_ptr(sdd->sd, j); <-- ... 
  }

Signed-off-by: Dietmar Eggemann
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Tested-by: Vincent Guittot
Tested-by: Barry Song
Link: https://lkml.kernel.org/r/6000e39e-7d28-c360-9cd6-8798fd22a9bf@arm.com
Fixes: 620a6dc40754 ("sched/topology: Make sched_init_numa() use a set for the deduplicating sort")
Signed-off-by: Jialin Zhang
Reviewed-by: Cheng Jian
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 kernel/sched/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 84cf39842efe..004e9505f7ad 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1718,7 +1718,7 @@ void sched_init_numa(void)
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);
 
-	tl = kzalloc((i + nr_levels) *
+	tl = kzalloc((i + nr_levels + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;

-- Gitee

From ffcd1a14601969fed352e2f89da2432ea1111d6b Mon Sep 17 00:00:00 2001
From: Li Xinhai
Date: Mon, 15 Nov 2021 19:40:32 +0800
Subject: [PATCH 044/134] mm: rmap: explicitly reset vma->anon_vma in unlink_anon_vmas()

mainline inclusion
from mainline-v5.12-rc1
commit ee8ab1903e3d912d8f10bedbf96c3b6a1c8cbede
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ee8ab1903e3d912d8f10bedbf96c3b6a1c8cbede

Signed-off-by: Yu Changchun

-------------------------------------------------

If the vma will continue to be used after unlinking its relevant
anon_vma, we need to reset the vma->anon_vma pointer to NULL. Then,
when a fault later happens within this vma, a new anon_vma will be
prepared. This way, the vma will only be checked for reverse mapping
of pages which were faulted in after the unlink_anon_vmas call.

Currently, the mremap with MREMAP_DONTUNMAP scenario continues to use
the vma after moving its page table entries to a new vma. In all other
scenarios, the vma itself is freed after the call to unlink_anon_vmas.

Link: https://lkml.kernel.org/r/20210119075126.3513154-1-lixinhai.lxh@gmail.com
Signed-off-by: Li Xinhai
Cc: Andrea Arcangeli
Cc: Brian Geffon
Cc: Kirill A. Shutemov
Cc: Lokesh Gidra
Cc: Minchan Kim
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Nanyong Sun
Reviewed-by: tong tiangen
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 mm/rmap.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 14f84f70c557..cdf549f6f617 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -413,8 +413,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
 		list_del(&avc->same_vma);
 		anon_vma_chain_free(avc);
 	}
-	if (vma->anon_vma)
+	if (vma->anon_vma) {
 		vma->anon_vma->degree--;
+
+		/*
+		 * vma would still be needed after unlink, and anon_vma will be prepared
+		 * when handle fault.
+		 */
+		vma->anon_vma = NULL;
+	}
 	unlock_anon_vma_root(root);
 
 	/*

-- Gitee

From 9c440f6bb3ff9994fb38773aca15b25f64edb7af Mon Sep 17 00:00:00 2001
From: Zhang Yi
Date: Mon, 15 Nov 2021 19:40:33 +0800
Subject: [PATCH 045/134] quota: check block number when reading the block in quota file

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

The block number in the quota tree on disk should be smaller than the
v2_disk_dqinfo.dqi_blocks.
If the quota file was corrupted, we may be allocating an 'allocated' block and that would lead to a loop in a tree, which will probably trigger oops later. This patch adds a check for the block number in the quota tree to prevent such potential issue. Link: https://lore.kernel.org/r/20211008093821.1001186-2-yi.zhang@huawei.com Signed-off-by: Zhang Yi Cc: stable@kernel.org Signed-off-by: Jan Kara Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/quota/quota_tree.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c5562c871c8b..c459ac0cb2c7 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -488,6 +488,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + newblk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); newblk = 0; @@ -587,6 +594,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; + if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + blk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); else -- Gitee From 76b5c1db4b91ff66910b8240e68babe2d0d708b7 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 15 Nov 2021 19:40:34 +0800 Subject: [PATCH 046/134] quota: correct error number in free_dqentry() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Fix the error path in free_dqentry(), pass out the error number if the block to free is not correct. Fixes: 1ccd14b9c271 ("quota: Split off quota tree handling into a separate file") Link: https://lore.kernel.org/r/20211008093821.1001186-3-yi.zhang@huawei.com Signed-off-by: Zhang Yi Cc: stable@kernel.org Signed-off-by: Jan Kara Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/quota/quota_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c459ac0cb2c7..1a188fbdf34e 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -423,6 +423,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, quota_error(dquot->dq_sb, "Quota structure has offset to " "other block (%u) than it should (%u)", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + ret = -EIO; goto out_buf; } ret = read_blk(info, blk, buf); -- Gitee From 223ad2904adcf704fe7694af2bc068404321eacb Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 15 Nov 2021 19:40:35 +0800 Subject: [PATCH 047/134] ext4: check for out-of-order index extents in ext4_valid_extent_entries() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- After commit 5946d089379a ("ext4: check for overlapping extents in ext4_valid_extent_entries()"), we can check out the overlapping extent entry in leaf extent blocks. 
But an out-of-order extent entry in an index extent block could also
trigger bad things if the filesystem is inconsistent. So this patch
adds a check to detect out-of-order index extents and returns an
error.

Signed-off-by: Zhang Yi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20210908120850.4012324-2-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o
Reviewed-by: Yang Erkun
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ext4/extents.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aa4d74f9d162..1d09a4c295f7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -356,6 +356,9 @@ static int ext4_valid_extent_entries(struct inode *inode,
 				ext4_fsblk_t *pblk, int depth)
 {
 	unsigned short entries;
+	ext4_lblk_t lblock = 0;
+	ext4_lblk_t prev = 0;
+
 	if (eh->eh_entries == 0)
 		return 1;
 
@@ -364,31 +367,35 @@ static int ext4_valid_extent_entries(struct inode *inode,
 	if (depth == 0) {
 		/* leaf entries */
 		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
-		ext4_lblk_t lblock = 0;
-		ext4_lblk_t prev = 0;
-		int len = 0;
 		while (entries) {
 			if (!ext4_valid_extent(inode, ext))
 				return 0;
 
 			/* Check for overlapping extents */
 			lblock = le32_to_cpu(ext->ee_block);
-			len = ext4_ext_get_actual_len(ext);
 			if ((lblock <= prev) && prev) {
 				*pblk = ext4_ext_pblock(ext);
 				return 0;
 			}
+			prev = lblock + ext4_ext_get_actual_len(ext) - 1;
 			ext++;
 			entries--;
-			prev = lblock + len - 1;
 		}
 	} else {
 		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
 		while (entries) {
 			if (!ext4_valid_extent_idx(inode, ext_idx))
 				return 0;
+
+			/* Check for overlapping index extents */
+			lblock = le32_to_cpu(ext_idx->ei_block);
+			if ((lblock <= prev) && prev) {
+				*pblk = ext4_idx_pblock(ext_idx);
+				return 0;
+			}
 			ext_idx++;
 			entries--;
+			prev = lblock;
 		}
 	}
 	return 1;

-- Gitee

From 2472299e578d8e5540d9384d97f31cc8bb855b20 Mon Sep 17 00:00:00 2001
From: Zhang Yi
Date: Mon, 15 Nov 2021 19:40:36 +0800
Subject: [PATCH 048/134] ext4: check for inconsistent extents between index and leaf block

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

Now we can detect overlapping extents in leaf blocks and out-of-order
index extents in index blocks. But the .ee_block in the first extent
of a leaf block should also be equal to the .ei_block in its parent
index extent entry. This patch adds a check to verify this consistency
between the index and leaf block.
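The invariant being enforced, reduced to a hedged user-space check
(simplified structs; the field names follow the on-disk ext4 extent
format, but nothing else is taken from the real code):

    #include <stdint.h>
    #include <stdio.h>

    struct idx_entry  { uint32_t ei_block; };  /* parent index entry */
    struct leaf_entry { uint32_t ee_block; };  /* first entry of child block */

    /* The child block must start at the logical block its parent records. */
    static int child_consistent(const struct idx_entry *idx,
                                const struct leaf_entry *first)
    {
        return idx->ei_block == first->ee_block;
    }

    int main(void)
    {
        struct idx_entry  idx  = { .ei_block = 100 };
        struct leaf_entry good = { .ee_block = 100 };
        struct leaf_entry bad  = { .ee_block = 96 };

        printf("good: %d\n", child_consistent(&idx, &good)); /* 1 */
        printf("bad:  %d\n", child_consistent(&idx, &bad));  /* 0 -> corrupted */
        return 0;
    }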
Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20210908120850.4012324-3-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Reviewed-by: Yang Erkun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/extents.c | 59 +++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1d09a4c295f7..39ea4f0b240c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -353,7 +353,8 @@ static int ext4_valid_extent_idx(struct inode *inode, static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, - ext4_fsblk_t *pblk, int depth) + ext4_lblk_t lblk, ext4_fsblk_t *pblk, + int depth) { unsigned short entries; ext4_lblk_t lblock = 0; @@ -367,6 +368,14 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + + /* + * The logical block in the first entry should equal to + * the number in the index block. + */ + if (depth != ext_depth(inode) && + lblk != le32_to_cpu(ext->ee_block)) + return 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; @@ -383,6 +392,14 @@ static int ext4_valid_extent_entries(struct inode *inode, } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); + + /* + * The logical block in the first entry should equal to + * the number in the parent index block. + */ + if (depth != ext_depth(inode) && + lblk != le32_to_cpu(ext_idx->ei_block)) + return 0; while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; @@ -403,7 +420,7 @@ static int ext4_valid_extent_entries(struct inode *inode, static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, - int depth, ext4_fsblk_t pblk) + int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; @@ -429,7 +446,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } - if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) { + if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } @@ -459,7 +476,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, } #define ext4_ext_check(inode, eh, depth, pblk) \ - __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) int ext4_ext_check_inode(struct inode *inode) { @@ -492,16 +509,18 @@ static void ext4_cache_extents(struct inode *inode, static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, - struct inode *inode, ext4_fsblk_t pblk, int depth, - int flags) + struct inode *inode, struct ext4_extent_idx *idx, + int depth, int flags) { struct buffer_head *bh; int err; gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; + ext4_fsblk_t pblk; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; + pblk = ext4_idx_pblock(idx); bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -514,8 +533,8 @@ __read_extent_tree_block(const char *function, unsigned int line, } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; - err = __ext4_ext_check(function, line, inode, - ext_block_hdr(bh), depth, pblk); + err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), + depth, pblk, 
le32_to_cpu(idx->ei_block));
 	if (err)
 		goto errout;
 	set_buffer_verified(bh);
@@ -533,8 +552,8 @@ __read_extent_tree_block(const char *function, unsigned int line,
 
 }
 
-#define read_extent_tree_block(inode, pblk, depth, flags) \
-	__read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+#define read_extent_tree_block(inode, idx, depth, flags) \
+	__read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
 				 (depth), (flags))
 
 /*
@@ -584,8 +603,7 @@ int ext4_ext_precache(struct inode *inode)
 				i--;
 				continue;
 			}
-			bh = read_extent_tree_block(inode,
-						    ext4_idx_pblock(path[i].p_idx++),
+			bh = read_extent_tree_block(inode, path[i].p_idx++,
 						    depth - i - 1,
 						    EXT4_EX_FORCE_CACHE);
 			if (IS_ERR(bh)) {
@@ -890,8 +908,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_depth = i;
 		path[ppos].p_ext = NULL;
 
-		bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
-					    flags);
+		bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
 		if (IS_ERR(bh)) {
 			ret = PTR_ERR(bh);
 			goto err;
@@ -1496,7 +1513,6 @@ static int ext4_ext_search_right(struct inode *inode,
 	struct ext4_extent_header *eh;
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
-	ext4_fsblk_t block;
 	int depth;	/* Note, NOT eh_depth; depth from top of tree */
 	int ee_len;
 
@@ -1563,20 +1579,17 @@ static int ext4_ext_search_right(struct inode *inode,
 	 * follow it and find the closest allocated
 	 * block to the right */
 	ix++;
-	block = ext4_idx_pblock(ix);
 	while (++depth < path->p_depth) {
 		/* subtract from p_depth to get proper eh_depth */
-		bh = read_extent_tree_block(inode, block,
-					    path->p_depth - depth, 0);
+		bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
 		if (IS_ERR(bh))
 			return PTR_ERR(bh);
 		eh = ext_block_hdr(bh);
 		ix = EXT_FIRST_INDEX(eh);
-		block = ext4_idx_pblock(ix);
 		put_bh(bh);
 	}
 
-	bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+	bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
 	if (IS_ERR(bh))
 		return PTR_ERR(bh);
 	eh = ext_block_hdr(bh);
@@ -2955,9 +2968,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 			ext_debug(inode, "move to level %d (block %llu)\n",
 				  i + 1, ext4_idx_pblock(path[i].p_idx));
 			memset(path + i + 1, 0, sizeof(*path));
-			bh = read_extent_tree_block(inode,
-				ext4_idx_pblock(path[i].p_idx), depth - i - 1,
-				EXT4_EX_NOCACHE);
+			bh = read_extent_tree_block(inode, path[i].p_idx,
+						    depth - i - 1,
+						    EXT4_EX_NOCACHE);
 			if (IS_ERR(bh)) {
 				/* should we reset i_size? */
 				err = PTR_ERR(bh);

-- Gitee

From ca97a933d7c51e5f9b453c7a7622756868945091 Mon Sep 17 00:00:00 2001
From: Zhang Yi
Date: Mon, 15 Nov 2021 19:40:37 +0800
Subject: [PATCH 049/134] ext4: prevent partial update of the extent blocks

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

Most error paths of the current extent-updating operations do not roll
back partial updates properly when something bad happens (e.g. in
ext4_ext_insert_extent()). So we may be left with an inconsistent
extents tree if the journal has been aborted due to an IO error, which
may well lead to a BUG_ON later when we access these extent entries in
"errors=continue" mode.

This patch drops the extent buffer's verified flag before updating the
contents in ext4_ext_get_access(), and sets it again after updating in
__ext4_ext_dirty(). After this patch we can force a re-check of the
extent buffer if the extents tree update was interrupted, making sure
the extents are consistent.
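The update pattern, in a hedged user-space sketch (a bool stands in for
the buffer's verified bit, and the I/O failure flag is invented for the
example): clear the flag before mutating, set it again only after the
update completes, so an interrupted update forces re-verification.

    #include <stdbool.h>
    #include <stdio.h>

    struct buf { bool verified; int data; };

    static void update_begin(struct buf *b)
    {
        b->verified = false;    /* mirrors clear_buffer_verified() */
    }

    static int update_commit(struct buf *b, int v, bool io_ok)
    {
        if (!io_ok)
            return -1;          /* verified stays false: rechecked later */
        b->data = v;
        b->verified = true;     /* mirrors set_buffer_verified() */
        return 0;
    }

    int main(void)
    {
        struct buf b = { .verified = true, .data = 1 };

        update_begin(&b);
        if (update_commit(&b, 2, false))
            printf("aborted, verified=%d -> forced recheck\n", b.verified);
        return 0;
    }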
Signed-off-by: Zhang Yi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20210908120850.4012324-4-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o

conflict:
  fs/ext4/extents.c

Reviewed-by: Yang Erkun
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ext4/extents.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 39ea4f0b240c..618675a41efb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -136,14 +136,24 @@ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *path)
 {
+	int err = 0;
+
 	if (path->p_bh) {
 		/* path points to block */
 		BUFFER_TRACE(path->p_bh, "get_write_access");
-		return ext4_journal_get_write_access(handle, path->p_bh);
+		err = ext4_journal_get_write_access(handle, path->p_bh);
+		/*
+		 * The extent buffer's verified bit will be set again in
+		 * __ext4_ext_dirty(). We could leave an inconsistent
+		 * buffer if the extents updating procudure break off du
+		 * to some error happens, force to check it again.
+		 */
+		if (!err)
+			clear_buffer_verified(path->p_bh);
 	}
 	/* path points to leaf/index in inode body */
 	/* we use in-core data, no need to protect them */
-	return 0;
+	return err;
 }
 
 /*
@@ -164,6 +174,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
 		/* path points to block */
 		err = __ext4_handle_dirty_metadata(where, line, handle,
 						   inode, path->p_bh);
+		/* Extents updating done, re-set verified flag */
+		if (!err)
+			set_buffer_verified(path->p_bh);
 	} else {
 		/* path points to leaf/index in inode body */
 		err = ext4_mark_inode_dirty(handle, inode);

-- Gitee

From 580d4c359a5c7069636b1418d1015f4b335c1fdf Mon Sep 17 00:00:00 2001
From: Zhihao Cheng
Date: Mon, 15 Nov 2021 19:40:38 +0800
Subject: [PATCH 050/134] ubifs: Limit dumping length by size of memory which is allocated for the node

mainline inclusion
from mainline-v5.11-rc1
commit c4c0d19d39d26c5f58633f8fcca75f03b2854fc0
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c4c0d19d39d26c5f58633f8fcca75f03b2854fc0

Signed-off-by: Yu Changchun

-----------------------------------------------

To prevent out-of-bounds memory accesses in ubifs_dump_node(), the
actual dumping length should also be restricted by a second condition:
the size of the memory that was allocated for the node. This patch
handles the following situations (which may be caused by bit flips due
to hardware errors, writes that bypass ubifs, unknown bugs in ubifs,
etc.):

1. bad node_len: data is dumped according to 'ch->len', which may
   exceed the size of the memory allocated for the node.
2. bad node content: some kinds of node can record additional data,
   e.g. index nodes and orphan nodes; make sure the size of the
   additional data does not go beyond the node length.
3. node_type changes: data is read according to type A, but type B was
   expected; the node was allocated according to type B's size, and a
   node of type A is longer than one of type B.
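The clamping rule in isolation (a user-space sketch with invented
sizes; the names loosely mirror the patch): the dump length is bounded
both by what the node header claims and by the memory actually
allocated for the node.

    #include <stdio.h>

    static int min3i(int a, int b, int c)
    {
        int m = a < b ? a : b;
        return m < c ? m : c;
    }

    int main(void)
    {
        int ch_len = 4096;       /* possibly corrupted on-disk length */
        int max_node_len = 512;  /* per-type maximum */
        int node_len = 256;      /* size of the buffer we actually hold */
        int safe_len = min3i(ch_len > 0 ? ch_len : 0, max_node_len, node_len);

        printf("dump %d bytes, not %d\n", safe_len, ch_len); /* 256 */
        return 0;
    }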
Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/debug.c | 63 ++++++++++++++++++++++++++++++++++++++---------- fs/ubifs/debug.h | 3 ++- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index ebff43f8009c..1cfc2855673e 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -291,9 +291,9 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) kfree(pdent); } -void ubifs_dump_node(const struct ubifs_info *c, const void *node) +void ubifs_dump_node(const struct ubifs_info *c, const void *node, int node_len) { - int i, n; + int i, n, type, safe_len, max_node_len, min_node_len; union ubifs_key key; const struct ubifs_ch *ch = node; char key_buf[DBG_KEY_BUF_LEN]; @@ -306,10 +306,40 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) return; } + /* Skip dumping unknown type node */ + type = ch->node_type; + if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { + pr_err("node type %d was not recognized\n", type); + return; + } + spin_lock(&dbg_lock); dump_ch(node); - switch (ch->node_type) { + if (c->ranges[type].max_len == 0) { + max_node_len = min_node_len = c->ranges[type].len; + } else { + max_node_len = c->ranges[type].max_len; + min_node_len = c->ranges[type].min_len; + } + safe_len = le32_to_cpu(ch->len); + safe_len = safe_len > 0 ? safe_len : 0; + safe_len = min3(safe_len, max_node_len, node_len); + if (safe_len < min_node_len) { + pr_err("node len(%d) is too short for %s, left %d bytes:\n", + safe_len, dbg_ntype(type), + safe_len > UBIFS_CH_SZ ? + safe_len - (int)UBIFS_CH_SZ : 0); + if (safe_len > UBIFS_CH_SZ) + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1, + (void *)node + UBIFS_CH_SZ, + safe_len - UBIFS_CH_SZ, 0); + goto out_unlock; + } + if (safe_len != le32_to_cpu(ch->len)) + pr_err("\ttruncated node length %d\n", safe_len); + + switch (type) { case UBIFS_PAD_NODE: { const struct ubifs_pad_node *pad = node; @@ -453,7 +483,8 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) pr_err("\tnlen %d\n", nlen); pr_err("\tname "); - if (nlen > UBIFS_MAX_NLEN) + if (nlen > UBIFS_MAX_NLEN || + nlen > safe_len - UBIFS_DENT_NODE_SZ) pr_err("(bad name length, not printing, bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) @@ -467,7 +498,6 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) case UBIFS_DATA_NODE: { const struct ubifs_data_node *dn = node; - int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; key_read(c, &dn->key, &key); pr_err("\tkey %s\n", @@ -475,10 +505,13 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) pr_err("\tsize %u\n", le32_to_cpu(dn->size)); pr_err("\tcompr_typ %d\n", (int)le16_to_cpu(dn->compr_type)); - pr_err("\tdata size %d\n", dlen); - pr_err("\tdata:\n"); + pr_err("\tdata size %u\n", + le32_to_cpu(ch->len) - (unsigned int)UBIFS_DATA_NODE_SZ); + pr_err("\tdata (length = %d):\n", + safe_len - (int)UBIFS_DATA_NODE_SZ); print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, - (void *)&dn->data, dlen, 0); + (void *)&dn->data, + safe_len - (int)UBIFS_DATA_NODE_SZ, 0); break; } case UBIFS_TRUN_NODE: @@ -495,9 +528,12 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) case UBIFS_IDX_NODE: { const struct ubifs_idx_node *idx = node; + int max_child_cnt = (safe_len - UBIFS_IDX_NODE_SZ) / + (ubifs_idx_node_sz(c, 1) - + UBIFS_IDX_NODE_SZ); - n = 
le16_to_cpu(idx->child_cnt); - pr_err("\tchild_cnt %d\n", n); + n = min_t(int, le16_to_cpu(idx->child_cnt), max_child_cnt); + pr_err("\tchild_cnt %d\n", (int)le16_to_cpu(idx->child_cnt)); pr_err("\tlevel %d\n", (int)le16_to_cpu(idx->level)); pr_err("\tBranches:\n"); @@ -525,7 +561,7 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) le64_to_cpu(orph->cmt_no) & LLONG_MAX); pr_err("\tlast node flag %llu\n", (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); - n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; + n = (safe_len - UBIFS_ORPH_NODE_SZ) >> 3; pr_err("\t%d orphan inode numbers:\n", n); for (i = 0; i < n; i++) pr_err("\t ino %llu\n", @@ -537,9 +573,10 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) break; } default: - pr_err("node type %d was not recognized\n", - (int)ch->node_type); + pr_err("node type %d was not recognized\n", type); } + +out_unlock: spin_unlock(&dbg_lock); } diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 7763639a426b..42610fa5f3a7 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -242,7 +242,8 @@ const char *dbg_get_key_dump(const struct ubifs_info *c, const char *dbg_snprintf_key(const struct ubifs_info *c, const union ubifs_key *key, char *buffer, int len); void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode); -void ubifs_dump_node(const struct ubifs_info *c, const void *node); +void ubifs_dump_node(const struct ubifs_info *c, const void *node, + int node_len); void ubifs_dump_budget_req(const struct ubifs_budget_req *req); void ubifs_dump_lstats(const struct ubifs_lp_stats *lst); void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); -- Gitee From 1a34c8287e19dce74a3eabed6476eca2b16edd44 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:40:39 +0800 Subject: [PATCH 051/134] Revert "ubifs: Fix out-of-bounds memory access caused by abnormal value of node_len" mainline inclusion from mainline-v5.11-rc1 commit c8be097530a82e004f98378c3afc5cd35efc4f57 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c8be097530a82e004f98378c3afc5cd35efc4f57 Signed-off-by: Yu Changchun ----------------------------------------------- This reverts commit acc5af3efa30 ("ubifs: Fix out-of-bounds memory access caused by abnormal value of node_len") No need to avoid memory oob in dumping for data node alone. Later, node length will be passed into function 'ubifs_dump_node()' which replaces all node dumping places. 
Signed-off-by: Zhihao Cheng
Signed-off-by: Richard Weinberger
Reviewed-by: Zhang Xiaoxu
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ubifs/io.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index eae9cf5a57b0..d71aabd992fe 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -225,7 +225,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		     int offs, int quiet, int must_chk_crc)
 {
-	int err = -EINVAL, type, node_len, dump_node = 1;
+	int err = -EINVAL, type, node_len;
 	uint32_t crc, node_crc, magic;
 	const struct ubifs_ch *ch = buf;
 
@@ -278,22 +278,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 out_len:
 	if (!quiet)
 		ubifs_err(c, "bad node length %d", node_len);
-	if (type == UBIFS_DATA_NODE && node_len > UBIFS_DATA_NODE_SZ)
-		dump_node = 0;
 out:
 	if (!quiet) {
 		ubifs_err(c, "bad node at LEB %d:%d", lnum, offs);
-		if (dump_node) {
-			ubifs_dump_node(c, buf);
-		} else {
-			int safe_len = min3(node_len, c->leb_size - offs,
-					    (int)UBIFS_MAX_DATA_NODE_SZ);
-			pr_err("\tprevent out-of-bounds memory access\n");
-			pr_err("\ttruncated data node length %d\n", safe_len);
-			pr_err("\tcorrupted data node:\n");
-			print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
-				       buf, safe_len, 0);
-		}
+		ubifs_dump_node(c, buf);
 		dump_stack();
 	}
 	return err;

-- Gitee

From 2a1a0937b60cca64d048e33570045f817fd61068 Mon Sep 17 00:00:00 2001
From: Chengsong Ke
Date: Mon, 15 Nov 2021 19:40:40 +0800
Subject: [PATCH 052/134] ubifs: Remove the redundant return in dbg_check_nondata_nodes_order

mainline inclusion
from mainline-v5.11-rc1
commit 32f6ccc743b89bb4c51d4a868ffdb6ebda2909cf
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=32f6ccc743b89bb4c51d4a868ffdb6ebda2909cf

Signed-off-by: Yu Changchun

-----------------------------------------------

There is a redundant return in dbg_check_nondata_nodes_order() which
will never be reached. In addition, an error code should be returned
instead of zero in this branch.
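For illustration only (a hypothetical minimal reproduction, not the
ubifs source): the second return is dead code, and on the error path
the function must report failure rather than fall through to success.

    #include <stdio.h>

    static int check_order(int bad)
    {
        if (bad) {
            fprintf(stderr, "dumping second node\n");
            return -22;   /* -EINVAL */
            return 0;     /* never reached; dropped by the patch */
        }
        return 0;
    }

    int main(void)
    {
        return check_order(1) ? 1 : 0;
    }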
Signed-off-by: Chengsong Ke Signed-off-by: Richard Weinberger Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/debug.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 1cfc2855673e..e4cd67508b07 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2479,7 +2479,6 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) ubifs_msg(c, "dumping second node"); ubifs_dump_node(c, sb->node); return -EINVAL; - return 0; } static inline int chance(unsigned int n, unsigned int out_of) -- Gitee From 553e5179469fe0b5a1483ab58b54561b206620ff Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:40:41 +0800 Subject: [PATCH 053/134] ubifs: Pass node length in all node dumping callers mainline inclusion from mainline-v5.11-rc1 commit a33e30a0e023e9d1866866ca895c7789f48445e7 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a33e30a0e023e9d1866866ca895c7789f48445e7 Signed-off-by: Yu Changchun ----------------------------------------------- Function ubifs_dump_node() has been modified to avoid memory oob accessing while dumping node, node length (corresponding to the size of allocated memory for node) should be passed into all node dumping callers. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/commit.c | 4 ++-- fs/ubifs/debug.c | 30 +++++++++++++++--------------- fs/ubifs/file.c | 2 +- fs/ubifs/io.c | 23 +++++++++++------------ fs/ubifs/journal.c | 3 ++- fs/ubifs/master.c | 4 ++-- fs/ubifs/orphan.c | 6 ++++-- fs/ubifs/recovery.c | 6 +++--- fs/ubifs/replay.c | 4 ++-- fs/ubifs/sb.c | 2 +- fs/ubifs/scan.c | 4 ++-- fs/ubifs/super.c | 2 +- fs/ubifs/tnc.c | 8 ++++---- fs/ubifs/tnc_misc.c | 4 ++-- fs/ubifs/ubifs.h | 4 ++-- 15 files changed, 54 insertions(+), 52 deletions(-) diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index b5cdac9b0368..c4fc1047fc07 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -701,13 +701,13 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) out_dump: ubifs_err(c, "dumping index node (iip=%d)", i->iip); - ubifs_dump_node(c, idx); + ubifs_dump_node(c, idx, ubifs_idx_node_sz(c, c->fanout)); list_del(&i->list); kfree(i); if (!list_empty(&list)) { i = list_entry(list.prev, struct idx_node, list); ubifs_err(c, "dumping parent index node"); - ubifs_dump_node(c, &i->idx); + ubifs_dump_node(c, &i->idx, ubifs_idx_node_sz(c, c->fanout)); } out_free: while (!list_empty(&list)) { diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index e4cd67508b07..927d880bef17 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -871,7 +871,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) cond_resched(); pr_err("Dumping node at LEB %d:%d len %d\n", lnum, snod->offs, snod->len); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, c->leb_size - snod->offs); } pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); @@ -1249,7 +1249,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, ubifs_err(c, "but it should have key %s according to tnc", dbg_snprintf_key(c, &zbr1->key, key_buf, DBG_KEY_BUF_LEN)); - ubifs_dump_node(c, dent1); + ubifs_dump_node(c, dent1, UBIFS_MAX_DENT_NODE_SZ); goto out_free; } @@ -1261,7 +1261,7 
@@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, ubifs_err(c, "but it should have key %s according to tnc", dbg_snprintf_key(c, &zbr2->key, key_buf, DBG_KEY_BUF_LEN)); - ubifs_dump_node(c, dent2); + ubifs_dump_node(c, dent2, UBIFS_MAX_DENT_NODE_SZ); goto out_free; } @@ -1280,9 +1280,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); ubifs_msg(c, "first node at %d:%d\n", zbr1->lnum, zbr1->offs); - ubifs_dump_node(c, dent1); + ubifs_dump_node(c, dent1, UBIFS_MAX_DENT_NODE_SZ); ubifs_msg(c, "second node at %d:%d\n", zbr2->lnum, zbr2->offs); - ubifs_dump_node(c, dent2); + ubifs_dump_node(c, dent2, UBIFS_MAX_DENT_NODE_SZ); out_free: kfree(dent2); @@ -2147,7 +2147,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, out_dump: ubifs_msg(c, "dump of node at LEB %d:%d", zbr->lnum, zbr->offs); - ubifs_dump_node(c, node); + ubifs_dump_node(c, node, zbr->len); out_free: kfree(node); return err; @@ -2280,7 +2280,7 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) ubifs_msg(c, "dump of the inode %lu sitting in LEB %d:%d", (unsigned long)fscki->inum, zbr->lnum, zbr->offs); - ubifs_dump_node(c, ino); + ubifs_dump_node(c, ino, zbr->len); kfree(ino); return -EINVAL; } @@ -2351,12 +2351,12 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_DATA_NODE) { ubifs_err(c, "bad node type %d", sa->type); - ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); return -EINVAL; } if (sb->type != UBIFS_DATA_NODE) { ubifs_err(c, "bad node type %d", sb->type); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } @@ -2387,8 +2387,8 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) return 0; error_dump: - ubifs_dump_node(c, sa->node); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } @@ -2419,13 +2419,13 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && sa->type != UBIFS_XENT_NODE) { ubifs_err(c, "bad node type %d", sa->type); - ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); return -EINVAL; } if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE && sb->type != UBIFS_XENT_NODE) { ubifs_err(c, "bad node type %d", sb->type); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } @@ -2475,9 +2475,9 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) error_dump: ubifs_msg(c, "dumping first node"); - ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node, c->leb_size - sa->offs); ubifs_msg(c, "dumping second node"); - ubifs_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node, c->leb_size - sb->offs); return -EINVAL; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f4826b6da682..d44c8e14810c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -92,7 +92,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, dump: ubifs_err(c, "bad data node (block %u, inode %lu)", block, inode->i_ino); - ubifs_dump_node(c, dn); + ubifs_dump_node(c, dn, UBIFS_MAX_DATA_NODE_SZ); return -EINVAL; } diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 
d71aabd992fe..937bd3fdad40 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -198,6 +198,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) * ubifs_check_node - check node. * @c: UBIFS file-system description object * @buf: node to check + * @len: node length * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock * @quiet: print no messages @@ -222,8 +223,8 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) * This function returns zero in case of success and %-EUCLEAN in case of bad * CRC or magic. */ -int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet, int must_chk_crc) +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, + int lnum, int offs, int quiet, int must_chk_crc) { int err = -EINVAL, type, node_len; uint32_t crc, node_crc, magic; @@ -281,7 +282,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, out: if (!quiet) { ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); dump_stack(); } return err; @@ -718,7 +719,7 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c) int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) { struct ubifs_info *c = wbuf->c; - int err, written, n, aligned_len = ALIGN(len, 8); + int err, n, written = 0, aligned_len = ALIGN(len, 8); dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, dbg_ntype(((struct ubifs_ch *)buf)->node_type), @@ -785,8 +786,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - written = 0; - if (wbuf->used) { /* * The node is large enough and does not fit entirely within @@ -887,7 +886,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) out: ubifs_err(c, "cannot write %d bytes to LEB %d:%d, error %d", len, wbuf->lnum, wbuf->offs, err); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, written + len); dump_stack(); ubifs_dump_leb(c, wbuf->lnum); return err; @@ -930,7 +929,7 @@ int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum, err = ubifs_leb_write(c, lnum, buf, offs, buf_len); if (err) - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); return err; } @@ -1013,7 +1012,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + err = ubifs_check_node(c, buf, len, lnum, offs, 0, 0); if (err) { ubifs_err(c, "expected node type %d", type); return err; @@ -1029,7 +1028,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, out: ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); dump_stack(); return -EINVAL; } @@ -1069,7 +1068,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + err = ubifs_check_node(c, buf, len, lnum, offs, 0, 0); if (err) { ubifs_errc(c, "expected node type %d", type); return err; @@ -1087,7 +1086,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum, offs, ubi_is_mapped(c->ubi, lnum)); if (!c->probing) { - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, len); dump_stack(); } return -EINVAL; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 7274bd23881b..230717384a38 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ 
-1560,7 +1560,8 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn); + ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - + UBIFS_TRUN_NODE_SZ); goto out_free; } diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 911d0555b9f2..0df9a3dd0aaa 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -314,7 +314,7 @@ static int validate_master(const struct ubifs_info *c) out: ubifs_err(c, "bad master node at offset %d error %d", c->mst_offs, err); - ubifs_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node, c->mst_node_alsz); return -EINVAL; } @@ -392,7 +392,7 @@ int ubifs_read_master(struct ubifs_info *c) if (c->leb_cnt < old_leb_cnt || c->leb_cnt < UBIFS_MIN_LEB_CNT) { ubifs_err(c, "bad leb_cnt on master node"); - ubifs_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node, c->mst_node_alsz); return -EINVAL; } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 0fb61956146d..4909321d84cf 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -646,7 +646,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (snod->type != UBIFS_ORPH_NODE) { ubifs_err(c, "invalid node type %d in orphan area at %d:%d", snod->type, sleb->lnum, snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, + c->leb_size - snod->offs); err = -EINVAL; goto out_free; } @@ -674,7 +675,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (!first) { ubifs_err(c, "out of order commit number %llu in orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, + c->leb_size - snod->offs); err = -EINVAL; goto out_free; } diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index f116f7b3f9e5..f0d51dd21c9e 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -352,11 +352,11 @@ int ubifs_recover_master_node(struct ubifs_info *c) ubifs_err(c, "failed to recover master node"); if (mst1) { ubifs_err(c, "dumping first master node"); - ubifs_dump_node(c, mst1); + ubifs_dump_node(c, mst1, c->leb_size - ((void *)mst1 - buf1)); } if (mst2) { ubifs_err(c, "dumping second master node"); - ubifs_dump_node(c, mst2); + ubifs_dump_node(c, mst2, c->leb_size - ((void *)mst2 - buf2)); } vfree(buf2); vfree(buf1); @@ -469,7 +469,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, * The area after the common header size is not empty, so the common * header must be intact. Check it. 
*/ - if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + if (ubifs_check_node(c, buf, len, lnum, offs, 1, 0) != -EUCLEAN) { dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); return 0; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b2f5563d1489..6283adeb21e0 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -830,7 +830,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) out_dump: ubifs_err(c, "bad node is at LEB %d:%d", lnum, snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, c->leb_size - snod->offs); ubifs_scan_destroy(sleb); return -EINVAL; } @@ -1126,7 +1126,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) out_dump: ubifs_err(c, "log error detected while replaying the log at LEB %d:%d", lnum, offs + snod->offs); - ubifs_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node, c->leb_size - snod->offs); ubifs_scan_destroy(sleb); return -EINVAL; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index c0d3e4008d23..c160f718c288 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -503,7 +503,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) failed: ubifs_err(c, "bad superblock, error %d", err); - ubifs_dump_node(c, sup); + ubifs_dump_node(c, sup, ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size)); return -EINVAL; } diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index c69cdb5e65bc..84a9157dcc32 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -76,7 +76,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, dbg_scan("scanning %s at LEB %d:%d", dbg_ntype(ch->node_type), lnum, offs); - if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) + if (ubifs_check_node(c, buf, len, lnum, offs, quiet, 1)) return SCANNED_A_CORRUPT_NODE; if (ch->node_type == UBIFS_PAD_NODE) { @@ -90,7 +90,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, if (!quiet) { ubifs_err(c, "bad pad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, pad); + ubifs_dump_node(c, pad, len); } return SCANNED_A_BAD_PAD_NODE; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index cfd46753a685..98cbd9cac246 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -253,7 +253,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) out_invalid: ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err); - ubifs_dump_node(c, ino); + ubifs_dump_node(c, ino, UBIFS_MAX_INO_NODE_SZ); ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 894f1ab14616..414266d4d1ba 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -316,7 +316,7 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, dent); if (err) { dump_stack(); - ubifs_dump_node(c, dent); + ubifs_dump_node(c, dent, zbr->len); return err; } @@ -349,7 +349,7 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, node); if (err) { dump_stack(); - ubifs_dump_node(c, node); + ubifs_dump_node(c, node, zbr->len); return err; } @@ -1699,7 +1699,7 @@ static int validate_data_node(struct ubifs_info *c, void *buf, goto out_err; } - err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + err = ubifs_check_node(c, buf, zbr->len, zbr->lnum, zbr->offs, 0, 0); if (err) { ubifs_err(c, "expected node type %d", UBIFS_DATA_NODE); goto out; @@ -1733,7 +1733,7 @@ static int validate_data_node(struct ubifs_info *c, void 
*buf, err = -EINVAL; out: ubifs_err(c, "bad node at LEB %d:%d", zbr->lnum, zbr->offs); - ubifs_dump_node(c, buf); + ubifs_dump_node(c, buf, zbr->len); dump_stack(); return err; } diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index ccaf94ea5be3..5517a56b4138 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -390,7 +390,7 @@ static int read_znode(struct ubifs_info *c, struct ubifs_zbranch *zzbr, out_dump: ubifs_err(c, "bad indexing node at LEB %d:%d, error %d", lnum, offs, err); - ubifs_dump_node(c, idx); + ubifs_dump_node(c, idx, c->max_idx_node_sz); kfree(idx); return -EINVAL; } @@ -489,7 +489,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, zbr->lnum, zbr->offs); dbg_tnck(key, "looked for key "); dbg_tnck(&key1, "but found node's key "); - ubifs_dump_node(c, node); + ubifs_dump_node(c, node, zbr->len); return -EINVAL; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e7e48f3b179a..daad22263cb2 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1721,8 +1721,8 @@ int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, int offs); int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum, int offs, int hmac_offs); -int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet, int must_chk_crc); +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, + int lnum, int offs, int quiet, int must_chk_crc); void ubifs_init_node(struct ubifs_info *c, void *buf, int len, int pad); void ubifs_crc_node(struct ubifs_info *c, void *buf, int len); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); -- Gitee From b08b792e75345d7d13ac39d5145c0c73f3ebd135 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:40:42 +0800 Subject: [PATCH 054/134] ubifs: ubifs_dump_sleb: Remove unused function mainline inclusion from mainline-v5.11-rc1 commit bf6dab7a6ce79c56764623b970be10fc6edd8a68 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bf6dab7a6ce79c56764623b970be10fc6edd8a68 Signed-off-by: Yu Changchun ----------------------------------------------- Function ubifs_dump_sleb() is defined but unused, it can be removed. 
Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/debug.c | 16 ---------------- fs/ubifs/debug.h | 2 -- 2 files changed, 18 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 927d880bef17..1c376ff0ec13 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -828,22 +828,6 @@ void ubifs_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } -void ubifs_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs) -{ - struct ubifs_scan_node *snod; - - pr_err("(pid %d) start dumping scanned data from LEB %d:%d\n", - current->pid, sleb->lnum, offs); - - list_for_each_entry(snod, &sleb->nodes, list) { - cond_resched(); - pr_err("Dumping node at LEB %d:%d len %d\n", - sleb->lnum, snod->offs, snod->len); - ubifs_dump_node(c, snod->node); - } -} - void ubifs_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 42610fa5f3a7..ed966108da80 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -252,8 +252,6 @@ void ubifs_dump_lprop(const struct ubifs_info *c, void ubifs_dump_lprops(struct ubifs_info *c); void ubifs_dump_lpt_info(struct ubifs_info *c); void ubifs_dump_leb(const struct ubifs_info *c, int lnum); -void ubifs_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs); void ubifs_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, -- Gitee From f04e6c6154181e0acf0606da4968ffa9289bb7ef Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:40:43 +0800 Subject: [PATCH 055/134] ubifs: ubifs_dump_node: Dump all branches of the index node mainline inclusion from mainline-v5.11-rc1 commit b80a974b8c58164ed57b0f025a47b8f003198d9e category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b80a974b8c58164ed57b0f025a47b8f003198d9e Signed-off-by: Yu Changchun ----------------------------------------------- An index node can have up to c->fanout branches, all branches should be displayed while dumping index node. 
Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger Reviewed-by: Zhang Xiaoxu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 1c376ff0ec13..89320c89cf0d 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -537,7 +537,7 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node, int node_len) pr_err("\tlevel %d\n", (int)le16_to_cpu(idx->level)); pr_err("\tBranches:\n"); - for (i = 0; i < n && i < c->fanout - 1; i++) { + for (i = 0; i < n && i < c->fanout; i++) { const struct ubifs_branch *br; br = ubifs_idx_branch(c, idx, i); -- Gitee From 7f2c55ca1934820d4b30568484436b6e8196582a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:42:59 +0800 Subject: [PATCH 056/134] nbd: don't handle response without a corresponding request message mainline inclusion from mainline-next-20211018 commit b5644a3a79bf3be5f1238db1b2f241374b27b0f0 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b5644a3a79bf3be5f1238db1b2f241374b27b0f0 Signed-off-by: Yu Changchun --------------------------- While handling a response message from server, nbd_read_stat() will try to get request by tag, and then complete the request. However, this is problematic if nbd haven't sent a corresponding request message: t1 t2 submit_bio nbd_queue_rq blk_mq_start_request recv_work nbd_read_stat blk_mq_tag_to_rq blk_mq_complete_request nbd_send_cmd Thus add a new cmd flag 'NBD_CMD_INFLIGHT', it will be set in nbd_send_cmd() and checked in nbd_read_stat(). Noted that this patch can't fix that blk_mq_tag_to_rq() might return a freed request, and this will be fixed in following patches. Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-2-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f4b0637bb155..1b3718238588 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -122,6 +122,12 @@ struct nbd_device { }; #define NBD_CMD_REQUEUED 1 +/* + * This flag will be set if nbd_queue_rq() succeed, and will be checked and + * cleared in completion. Both setting and clearing of the flag are protected + * by cmd->lock. 
+ */ +#define NBD_CMD_INFLIGHT 2 struct nbd_cmd { struct nbd_device *nbd; @@ -389,6 +395,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; mutex_unlock(&cmd->lock); @@ -718,6 +725,12 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)", + tag, cmd->status, cmd->flags); + ret = -ENOENT; + goto out; + } if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", req, cmd->cmd_cookie, nbd_handle_to_cookie(handle)); @@ -817,6 +830,7 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved) return true; mutex_lock(&cmd->lock); + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); @@ -953,7 +967,13 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) * returns EAGAIN can be retried on a different socket. */ ret = nbd_send_cmd(nbd, cmd, index); - if (ret == -EAGAIN) { + /* + * Access to this flag is protected by cmd->lock, thus it's safe to set + * the flag after nbd_send_cmd() succeed to send request to server. + */ + if (!ret) + __set_bit(NBD_CMD_INFLIGHT, &cmd->flags); + else if (ret == -EAGAIN) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Request send failed, requeueing\n"); nbd_mark_nsock_dead(nbd, nsock, 1); -- Gitee From 4438c56e17d903d94c438144d7993a641f0cb28d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:43:00 +0800 Subject: [PATCH 057/134] nbd: make sure request completion won't concurrent mainline inclusion from mainline-next-20211018 commit d14b304f558f8c8f53da3a8d0c0b671f14a9c2f4 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d14b304f558f8c8f53da3a8d0c0b671f14a9c2f4 Signed-off-by: Yu Changchun --------------------------- commit cddce0116058 ("nbd: Aovid double completion of a request") try to fix that nbd_clear_que() and recv_work() can complete a request concurrently. However, the problem still exists: t1 t2 t3 nbd_disconnect_and_put flush_workqueue recv_work blk_mq_complete_request blk_mq_complete_request_remote -> this is true WRITE_ONCE(rq->state, MQ_RQ_COMPLETE) blk_mq_raise_softirq blk_done_softirq blk_complete_reqs nbd_complete_rq blk_mq_end_request blk_mq_free_request WRITE_ONCE(rq->state, MQ_RQ_IDLE) nbd_clear_que blk_mq_tagset_busy_iter nbd_clear_req __blk_mq_free_request blk_mq_put_tag blk_mq_complete_request -> complete again There are three places where request can be completed in nbd: recv_work(), nbd_clear_que() and nbd_xmit_timeout(). Since they all hold cmd->lock before completing the request, it's easy to avoid the problem by setting and checking a cmd flag. 
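The protocol can be sketched as a hypothetical helper (not the literal driver code): whichever path clears NBD_CMD_INFLIGHT under cmd->lock owns the completion, so a request can only be completed once.

	/* hypothetical illustration of the flag protocol */
	static bool nbd_complete_once(struct nbd_cmd *cmd, blk_status_t status)
	{
		mutex_lock(&cmd->lock);
		if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
			/* another path already completed this request */
			mutex_unlock(&cmd->lock);
			return false;
		}
		cmd->status = status;
		mutex_unlock(&cmd->lock);
		blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
		return true;
	}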
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-3-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- drivers/block/nbd.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 1b3718238588..7271541558b6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -395,7 +395,11 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; - __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return BLK_EH_DONE; + } + if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; mutex_unlock(&cmd->lock); @@ -830,7 +834,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved) return true; mutex_lock(&cmd->lock); - __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return true; + } cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock);
-- Gitee From f966f133626d57a2a7889b2ff02e2edac7712b2a Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:43:01 +0800
Subject: [PATCH 058/134] nbd: check sock index in nbd_read_stat()
mainline inclusion from mainline-next-20211018 commit dbd73178da676945d8bbcf6afe731623f683ce0a category: bugfix issue: #I4PYGY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dbd73178da676945d8bbcf6afe731623f683ce0a
Signed-off-by: Yu Changchun
---------------------------
The sock on which the client sends a request in nbd_send_cmd() and the sock on which it receives the reply in nbd_read_stat() should be the same.
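A minimal sketch of the invariant (the send side is assumed, as in the existing driver, to record the socket index when the request is queued; the receive-side check is the one added by the hunk below):

	/* send side: remember which socket carried the request */
	cmd->index = index;

	/* receive side: a reply arriving on a different socket is suspicious */
	if (cmd->index != index)
		dev_err(disk_to_dev(nbd->disk),
			"Unexpected reply %d from different sock %d (expected %d)",
			tag, index, cmd->index);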
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-4-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- drivers/block/nbd.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7271541558b6..f3747cdf5dd1 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -735,6 +735,10 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) ret = -ENOENT; goto out; } + if (cmd->index != index) { + dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)", + tag, index, cmd->index); + } if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
-- Gitee From a4720fbe0b1ad1dd170a7506fe6052d21ca9de16 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:43:02 +0800
Subject: [PATCH 059/134] nbd: don't start request if nbd_queue_rq() failed
mainline inclusion from mainline-next-20211018 commit a83fdc85365586dc5c0f3ff91680e18e37a66f19 category: bugfix issue: #I4PYGY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a83fdc85365586dc5c0f3ff91680e18e37a66f19
Signed-off-by: Yu Changchun
---------------------------
commit 6a468d5990ec ("nbd: don't start req until after the dead connection logic") moved blk_mq_start_request() from nbd_queue_rq() to nbd_handle_cmd() to skip starting the request if the connection is dead. However, the request is still started in other error paths. Currently, blk_mq_end_request() will be called immediately if nbd_queue_rq() fails, so starting the request in that situation is useless. So remove blk_mq_start_request() from the error paths in nbd_handle_cmd().
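The reasoning is easiest to see in a simplified, hypothetical sketch of the queue path: when nbd_handle_cmd() fails, the caller returns an error status and the block layer ends the request immediately, so starting it first buys nothing.

	static blk_status_t nbd_queue_rq_sketch(struct blk_mq_hw_ctx *hctx,
						const struct blk_mq_queue_data *bd)
	{
		struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
		int ret = nbd_handle_cmd(cmd, hctx->queue_num);

		/*
		 * On failure the request is ended right away via
		 * blk_mq_end_request(), so blk_mq_start_request() in the
		 * error paths was pure overhead.
		 */
		return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK;
	}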
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-5-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f3747cdf5dd1..11a936151b93 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -923,7 +923,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) if (!refcount_inc_not_zero(&nbd->config_refs)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); - blk_mq_start_request(req); return -EINVAL; } config = nbd->config; @@ -932,7 +931,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted send on invalid socket\n"); nbd_config_put(nbd); - blk_mq_start_request(req); return -EINVAL; } cmd->status = BLK_STS_OK; @@ -956,7 +954,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) */ sock_shutdown(nbd); nbd_config_put(nbd); - blk_mq_start_request(req); return -EIO; } goto again; -- Gitee From a4720fbe0b1ad1dd170a7506fe6052d21ca9de16 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:43:03 +0800 Subject: [PATCH 060/134] nbd: clean up return value checking of sock_xmit() mainline inclusion from mainline-next-20211018 commit 6157a8f489909db00151a4e361903b9099b03b75 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6157a8f489909db00151a4e361903b9099b03b75 Signed-off-by: Yu Changchun --------------------------- Check if sock_xmit() return 0 is useless because it'll never return 0, comment it and remove such checkings. Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-6-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 11a936151b93..8bfade1c4ee6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -479,7 +479,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } /* - * Send or receive packet. + * Send or receive packet. Return a positive value on success and + * negtive value on failue, and never return 0. */ static int sock_xmit(struct nbd_device *nbd, int index, int send, struct iov_iter *iter, int msg_flags, int *sent) @@ -605,7 +606,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) result = sock_xmit(nbd, index, 1, &from, (type == NBD_CMD_WRITE) ? 
MSG_MORE : 0, &sent); trace_nbd_header_sent(req, handle); - if (result <= 0) { + if (result < 0) { if (was_interrupted(result)) { /* If we havne't sent anything we can just return BUSY, * however if we have sent something we need to make @@ -649,7 +650,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) skip = 0; } result = sock_xmit(nbd, index, 1, &from, flags, &sent); - if (result <= 0) { + if (result < 0) { if (was_interrupted(result)) { /* We've already sent the header, we * have no choice but to set pending and @@ -701,7 +702,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) reply.magic = 0; iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply)); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); - if (result <= 0) { + if (result < 0) { if (!nbd_disconnected(config)) dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); @@ -772,7 +773,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) rq_for_each_segment(bvec, req, iter) { iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); - if (result <= 0) { + if (result < 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); /* @@ -1218,7 +1219,7 @@ static void send_disconnects(struct nbd_device *nbd) iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); mutex_lock(&nsock->tx_lock); ret = sock_xmit(nbd, i, 1, &from, 0, NULL); - if (ret <= 0) + if (ret < 0) dev_err(disk_to_dev(nbd->disk), "Send disconnect failed %d\n", ret); mutex_unlock(&nsock->tx_lock); -- Gitee From 2eb290002f0b6598c53c4c0269e8aca35c852735 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:43:04 +0800 Subject: [PATCH 061/134] nbd: partition nbd_read_stat() into nbd_read_reply() and nbd_handle_reply() mainline inclusion from mainline-next-20211018 commit 961e9f50be9bb47835b0ac7e08d55d2d0a45e493 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=961e9f50be9bb47835b0ac7e08d55d2d0a45e493 Signed-off-by: Yu Changchun --------------------------- Prepare to fix uaf in nbd_read_stat(), no functional changes. 
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210916093350.1410403-7-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 74 +++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 8bfade1c4ee6..622ea52ebbdd 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -684,38 +684,45 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) return 0; } -/* NULL returned = something went wrong, inform userspace */ -static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) +static int nbd_read_reply(struct nbd_device *nbd, int index, + struct nbd_reply *reply) { - struct nbd_config *config = nbd->config; - int result; - struct nbd_reply reply; - struct nbd_cmd *cmd; - struct request *req = NULL; - u64 handle; - u16 hwq; - u32 tag; - struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)}; + struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)}; struct iov_iter to; - int ret = 0; + int result; - reply.magic = 0; - iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply)); + reply->magic = 0; + iov_iter_kvec(&to, READ, &iov, 1, sizeof(*reply)); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); if (result < 0) { - if (!nbd_disconnected(config)) + if (!nbd_disconnected(nbd->config)) dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); - return ERR_PTR(result); + return result; } - if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { + if (ntohl(reply->magic) != NBD_REPLY_MAGIC) { dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", - (unsigned long)ntohl(reply.magic)); - return ERR_PTR(-EPROTO); + (unsigned long)ntohl(reply->magic)); + return -EPROTO; } - memcpy(&handle, reply.handle, sizeof(handle)); + return 0; +} + +/* NULL returned = something went wrong, inform userspace */ +static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, + struct nbd_reply *reply) +{ + int result; + struct nbd_cmd *cmd; + struct request *req = NULL; + u64 handle; + u16 hwq; + u32 tag; + int ret = 0; + + memcpy(&handle, reply->handle, sizeof(handle)); tag = nbd_handle_to_tag(handle); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < nbd->tag_set.nr_hw_queues) @@ -758,9 +765,9 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) ret = -ENOENT; goto out; } - if (ntohl(reply.error)) { + if (ntohl(reply->error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", - ntohl(reply.error)); + ntohl(reply->error)); cmd->status = BLK_STS_IOERR; goto out; } @@ -769,6 +776,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; + struct iov_iter to; rq_for_each_segment(bvec, req, iter) { iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); @@ -782,7 +790,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) * and let the timeout stuff handle resubmitting * this request onto another connection. 
*/ - if (nbd_disconnected(config)) { + if (nbd_disconnected(nbd->config)) { cmd->status = BLK_STS_IOERR; goto out; } @@ -806,24 +814,30 @@ static void recv_work(struct work_struct *work) work); struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; + struct nbd_sock *nsock; struct nbd_cmd *cmd; struct request *rq; while (1) { - cmd = nbd_read_stat(nbd, args->index); - if (IS_ERR(cmd)) { - struct nbd_sock *nsock = config->socks[args->index]; + struct nbd_reply reply; - mutex_lock(&nsock->tx_lock); - nbd_mark_nsock_dead(nbd, nsock, 1); - mutex_unlock(&nsock->tx_lock); + if (nbd_read_reply(nbd, args->index, &reply)) + break; + + cmd = nbd_handle_reply(nbd, args->index, &reply); + if (IS_ERR(cmd)) break; - } rq = blk_mq_rq_from_pdu(cmd); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } + + nsock = config->socks[args->index]; + mutex_lock(&nsock->tx_lock); + nbd_mark_nsock_dead(nbd, nsock, 1); + mutex_unlock(&nsock->tx_lock); + nbd_config_put(nbd); atomic_dec(&config->recv_threads); wake_up(&config->recv_wq); -- Gitee From bf70aab1e49bbf60ea0731736dea81f307c4a3a3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 15 Nov 2021 19:43:05 +0800 Subject: [PATCH 062/134] nbd: fix uaf in nbd_handle_reply() mainline inclusion from mainline-next-20211018 commit 52c90e0184f67eecb00b53b79bfdf75e0274f8fd category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=52c90e0184f67eecb00b53b79bfdf75e0274f8fd Signed-off-by: Yu Changchun --------------------------- There is a problem that nbd_handle_reply() might access freed request: 1) At first, a normal io is submitted and completed with scheduler: internal_tag = blk_mq_get_tag -> get tag from sched_tags blk_mq_rq_ctx_init sched_tags->rq[internal_tag] = sched_tag->static_rq[internal_tag] ... blk_mq_get_driver_tag __blk_mq_get_driver_tag -> get tag from tags tags->rq[tag] = sched_tag->static_rq[internal_tag] So, both tags->rq[tag] and sched_tags->rq[internal_tag] are pointing to the request: sched_tags->static_rq[internal_tag]. Even if the io is finished. 2) nbd server send a reply with random tag directly: recv_work nbd_handle_reply blk_mq_tag_to_rq(tags, tag) rq = tags->rq[tag] 3) if the sched_tags->static_rq is freed: blk_mq_sched_free_requests blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i) -> step 2) access rq before clearing rq mapping blk_mq_clear_rq_mapping(set, tags, hctx_idx); __free_pages() -> rq is freed here 4) Then, nbd continue to use the freed request in nbd_handle_reply Fix the problem by get 'q_usage_counter' before blk_mq_tag_to_rq(), thus request is ensured not to be freed because 'q_usage_counter' is not zero. 
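The essence of the fix, as a hypothetical standalone helper: take a reference that keeps the request pool alive before translating a tag, and drop it only once the request is no longer touched.

	static struct request *tag_to_rq_guarded(struct request_queue *q,
						 struct blk_mq_tags *tags,
						 unsigned int tag)
	{
		struct request *rq;

		/* non-zero q_usage_counter keeps the request pool alive */
		if (!percpu_ref_tryget(&q->q_usage_counter))
			return NULL;	/* queue frozen: nothing is inflight */

		rq = blk_mq_tag_to_rq(tags, tag);
		if (!rq)
			percpu_ref_put(&q->q_usage_counter);
		/* otherwise the caller puts the ref when done with rq */
		return rq;
	}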
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210916141810.2325276-1-yukuai3@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- drivers/block/nbd.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 622ea52ebbdd..051ba02f996f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -814,6 +814,7 @@ static void recv_work(struct work_struct *work) work); struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; + struct request_queue *q = nbd->disk->queue; struct nbd_sock *nsock; struct nbd_cmd *cmd; struct request *rq; @@ -824,13 +825,28 @@ static void recv_work(struct work_struct *work) if (nbd_read_reply(nbd, args->index, &reply)) break; + /* + * Grab .q_usage_counter so request pool won't go away, then no + * request use-after-free is possible during nbd_handle_reply(). + * If queue is frozen, there won't be any inflight requests, we + * needn't to handle the incoming garbage message. + */ + if (!percpu_ref_tryget(&q->q_usage_counter)) { + dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n", + __func__); + break; + } + cmd = nbd_handle_reply(nbd, args->index, &reply); - if (IS_ERR(cmd)) + if (IS_ERR(cmd)) { + percpu_ref_put(&q->q_usage_counter); break; + } rq = blk_mq_rq_from_pdu(cmd); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); + percpu_ref_put(&q->q_usage_counter); } nsock = config->socks[args->index];
-- Gitee From 4b423d41e21c49e97bb37de9224e85c5053090ad Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Mon, 15 Nov 2021 19:49:02 +0800
Subject: [PATCH 063/134] ext4: avoid recheck extent for EXT4_EX_FORCE_CACHE
maillist inclusion category: bugfix issue: #I4PYGY CVE: NA
Signed-off-by: Yu Changchun
---------------------------
A buffer with the verified flag set has already been checked before, so there is no need to verify it and call set_buffer_verified() again.
Signed-off-by: yangerkun Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- fs/ext4/extents.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 618675a41efb..a77e25ca6867 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -544,13 +544,16 @@ __read_extent_tree_block(const char *function, unsigned int line, if (err < 0) goto errout; } - if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) - return bh; - err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), - depth, pblk, le32_to_cpu(idx->ei_block)); - if (err) - goto errout; - set_buffer_verified(bh); + if (buffer_verified(bh)) { + if (!(flags & EXT4_EX_FORCE_CACHE)) + return bh; + } else { + err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), + depth, pblk, le32_to_cpu(idx->ei_block)); + if (err) + goto errout; + set_buffer_verified(bh); + } /* * If this is a leaf block, cache all of its entries */
-- Gitee From 20fb37766c29df27bf3b80bb39b0d5863c21643d Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Mon, 15 Nov 2021 19:49:03 +0800
Subject: [PATCH 064/134] ext4: check magic even the extent block bh is verified
maillist inclusion category: bugfix issue: #I4PYGY CVE: NA
Signed-off-by: Yu Changchun
---------------------------
Our stress testing with IO error injection can trigger the following OOB with a very low probability.
[59898.282466] BUG: KASAN: slab-out-of-bounds in ext4_find_extent+0x2e4/0x480 ... [59898.287162] Call Trace: [59898.287575] dump_stack+0x8b/0xb9 [59898.288070] print_address_description+0x73/0x280 [59898.289903] ext4_find_extent+0x2e4/0x480 [59898.290553] ext4_ext_map_blocks+0x125/0x1470 [59898.295481] ext4_map_blocks+0x5ee/0x940 [59898.315984] ext4_mpage_readpages+0x63c/0xdb0 [59898.320231] read_pages+0xe6/0x370 [59898.321589] __do_page_cache_readahead+0x233/0x2a0 [59898.321594] ondemand_readahead+0x157/0x450 [59898.321598] generic_file_read_iter+0xcb2/0x1550 [59898.328828] __vfs_read+0x233/0x360 [59898.328840] vfs_read+0xa5/0x190 [59898.330126] ksys_read+0xa5/0x150 [59898.331405] do_syscall_64+0x6d/0x1f0 [59898.331418] entry_SYSCALL_64_after_hwframe+0x44/0xa9
Digging deeper, we found it is actually a xattr block, which can happen with the following steps: 1. an extent update for file1 removes a leaf extent block (block A) 2. the idx extent block needs to be updated too 3. block A gets reallocated as a xattr block and is marked verified 4. an IO error happens for the idx block, and its buffer is released later 5. an extent lookup for file1 reads the idx block and sees block A again 6. since the buffer of block A is already verified, we use it directly, which can lead to the OOB above
Same as __ext4_xattr_check_block(), we can check the magic even when the buffer is verified to fix the problem.
Signed-off-by: yangerkun Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- fs/ext4/extents.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a77e25ca6867..74b69d83c198 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -545,6 +545,14 @@ __read_extent_tree_block(const char *function, unsigned int line, goto errout; } if (buffer_verified(bh)) { + if (unlikely(ext_block_hdr(bh)->eh_magic != EXT4_EXT_MAGIC)) { + err = -EFSCORRUPTED; + ext4_error_inode(inode, function, line, 0, + "invalid magic for verified extent block %llu", + (unsigned long long)bh->b_blocknr); + goto errout; + } + if (!(flags & EXT4_EX_FORCE_CACHE)) return bh; } else {
-- Gitee From 4e2cb35a73aa5033f461926533cd96128253ecaa Mon Sep 17 00:00:00 2001
From: Ye Bin
Date: Mon, 15 Nov 2021 19:49:04 +0800
Subject: [PATCH 065/134] scsi: scsi_debug: Fix out-of-bound read in resp_readcap16()
mainline inclusion from mainline-v5.17 commit 4e3ace0051e7e504b55d239daab8789dd89b863c category: bugfix issue: #I4PYGY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4e3ace0051e7e504b55d239daab8789dd89b863c
Signed-off-by: Yu Changchun
-----------------------------------------------
The following warning was observed running syzkaller: [ 3813.830724] sg_write: data in/out 65466/242 bytes for SCSI command 0x9e-- guessing data in; [ 3813.830724] program syz-executor not setting count and/or reply_len properly [ 3813.836956] ================================================================== [ 3813.839465] BUG: KASAN: stack-out-of-bounds in sg_copy_buffer+0x157/0x1e0 [ 3813.841773] Read of size 4096 at addr ffff8883cf80f540 by task syz-executor/1549 [ 3813.846612] Call Trace: [ 3813.846995] dump_stack+0x108/0x15f [ 3813.847524] print_address_description+0xa5/0x372 [ 3813.848243] kasan_report.cold+0x236/0x2a8 [ 3813.849439] check_memory_region+0x240/0x270 [ 3813.850094] memcpy+0x30/0x80 [ 3813.850553] sg_copy_buffer+0x157/0x1e0 [ 3813.853032] sg_copy_from_buffer+0x13/0x20 [ 3813.853660]
fill_from_dev_buffer+0x135/0x370 [ 3813.854329] resp_readcap16+0x1ac/0x280 [ 3813.856917] schedule_resp+0x41f/0x1630 [ 3813.858203] scsi_debug_queuecommand+0xb32/0x17e0 [ 3813.862699] scsi_dispatch_cmd+0x330/0x950 [ 3813.863329] scsi_request_fn+0xd8e/0x1710 [ 3813.863946] __blk_run_queue+0x10b/0x230 [ 3813.864544] blk_execute_rq_nowait+0x1d8/0x400 [ 3813.865220] sg_common_write.isra.0+0xe61/0x2420 [ 3813.871637] sg_write+0x6c8/0xef0 [ 3813.878853] __vfs_write+0xe4/0x800 [ 3813.883487] vfs_write+0x17b/0x530 [ 3813.884008] ksys_write+0x103/0x270 [ 3813.886268] __x64_sys_write+0x77/0xc0 [ 3813.886841] do_syscall_64+0x106/0x360 [ 3813.887415] entry_SYSCALL_64_after_hwframe+0x44/0xa9 This issue can be reproduced with the following syzkaller log: r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x26e1, 0x0) r1 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='fd/3\x00') open_by_handle_at(r1, &(0x7f00000003c0)=ANY=[@ANYRESHEX], 0x602000) r2 = syz_open_dev$sg(&(0x7f0000000000), 0x0, 0x40782) write$binfmt_aout(r2, &(0x7f0000000340)=ANY=[@ANYBLOB="00000000deff000000000000000000000000000000000000000000000000000047f007af9e107a41ec395f1bded7be24277a1501ff6196a83366f4e6362bc0ff2b247f68a972989b094b2da4fb3607fcf611a22dd04310d28c75039d"], 0x126) In resp_readcap16() we get "int alloc_len" value -1104926854, and then pass the huge arr_len to fill_from_dev_buffer(), but arr is only 32 bytes. This leads to OOB in sg_copy_buffer(). To solve this issue, define alloc_len as u32. Link: https://lore.kernel.org/r/20211013033913.2551004-2-yebin10@huawei.com Acked-by: Douglas Gilbert Signed-off-by: Ye Bin Signed-off-by: Martin K. Petersen Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/scsi/scsi_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index b6540b92f566..63504dc63d87 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1855,7 +1855,7 @@ static int resp_readcap16(struct scsi_cmnd *scp, { unsigned char *cmd = scp->cmnd; unsigned char arr[SDEBUG_READCAP16_ARR_SZ]; - int alloc_len; + u32 alloc_len; alloc_len = get_unaligned_be32(cmd + 10); /* following just in case virtual_gb changed */ @@ -1884,7 +1884,7 @@ static int resp_readcap16(struct scsi_cmnd *scp, } return fill_from_dev_buffer(scp, arr, - min_t(int, alloc_len, SDEBUG_READCAP16_ARR_SZ)); + min_t(u32, alloc_len, SDEBUG_READCAP16_ARR_SZ)); } #define SDEBUG_MAX_TGTPGS_ARR_SZ 1412 -- Gitee From 87bbda91aeb83a6c601ef805a8315098b2fcd8a2 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 15 Nov 2021 19:49:05 +0800 Subject: [PATCH 066/134] scsi: scsi_debug: Fix out-of-bound read in resp_report_tgtpgs() mainline inclusion from mainline-v5.17 commit f347c26836c270199de1599c3cd466bb7747caa9 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f347c26836c270199de1599c3cd466bb7747caa9 Signed-off-by: Yu Changchun ----------------------------------------------- The following issue was observed running syzkaller: BUG: KASAN: slab-out-of-bounds in memcpy include/linux/string.h:377 [inline] BUG: KASAN: slab-out-of-bounds in sg_copy_buffer+0x150/0x1c0 lib/scatterlist.c:831 Read of size 2132 at addr ffff8880aea95dc8 by task syz-executor.0/9815 CPU: 0 PID: 9815 Comm: syz-executor.0 Not tainted 4.19.202-00874-gfc0fe04215a9 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xe4/0x14a lib/dump_stack.c:118 print_address_description+0x73/0x280 mm/kasan/report.c:253 kasan_report_error mm/kasan/report.c:352 [inline] kasan_report+0x272/0x370 mm/kasan/report.c:410 memcpy+0x1f/0x50 mm/kasan/kasan.c:302 memcpy include/linux/string.h:377 [inline] sg_copy_buffer+0x150/0x1c0 lib/scatterlist.c:831 fill_from_dev_buffer+0x14f/0x340 drivers/scsi/scsi_debug.c:1021 resp_report_tgtpgs+0x5aa/0x770 drivers/scsi/scsi_debug.c:1772 schedule_resp+0x464/0x12f0 drivers/scsi/scsi_debug.c:4429 scsi_debug_queuecommand+0x467/0x1390 drivers/scsi/scsi_debug.c:5835 scsi_dispatch_cmd+0x3fc/0x9b0 drivers/scsi/scsi_lib.c:1896 scsi_request_fn+0x1042/0x1810 drivers/scsi/scsi_lib.c:2034 __blk_run_queue_uncond block/blk-core.c:464 [inline] __blk_run_queue+0x1a4/0x380 block/blk-core.c:484 blk_execute_rq_nowait+0x1c2/0x2d0 block/blk-exec.c:78 sg_common_write.isra.19+0xd74/0x1dc0 drivers/scsi/sg.c:847 sg_write.part.23+0x6e0/0xd00 drivers/scsi/sg.c:716 sg_write+0x64/0xa0 drivers/scsi/sg.c:622 __vfs_write+0xed/0x690 fs/read_write.c:485 kill_bdev:block_device:00000000e138492c vfs_write+0x184/0x4c0 fs/read_write.c:549 ksys_write+0x107/0x240 fs/read_write.c:599 do_syscall_64+0xc2/0x560 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe We get 'alen' from command its type is int. If userspace passes a large length we will get a negative 'alen'. Switch n, alen, and rlen to u32. Link: https://lore.kernel.org/r/20211013033913.2551004-3-yebin10@huawei.com Acked-by: Douglas Gilbert Signed-off-by: Ye Bin Signed-off-by: Martin K. Petersen Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/scsi/scsi_debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 63504dc63d87..3fc7c2a31c19 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1895,8 +1895,9 @@ static int resp_report_tgtpgs(struct scsi_cmnd *scp, unsigned char *cmd = scp->cmnd; unsigned char *arr; int host_no = devip->sdbg_host->shost->host_no; - int n, ret, alen, rlen; int port_group_a, port_group_b, port_a, port_b; + u32 alen, n, rlen; + int ret; alen = get_unaligned_be32(cmd + 6); arr = kzalloc(SDEBUG_MAX_TGTPGS_ARR_SZ, GFP_ATOMIC); @@ -1958,9 +1959,9 @@ static int resp_report_tgtpgs(struct scsi_cmnd *scp, * - The constructed command length * - The maximum array size */ - rlen = min_t(int, alen, n); + rlen = min(alen, n); ret = fill_from_dev_buffer(scp, arr, - min_t(int, rlen, SDEBUG_MAX_TGTPGS_ARR_SZ)); + min_t(u32, rlen, SDEBUG_MAX_TGTPGS_ARR_SZ)); kfree(arr); return ret; } -- Gitee From d04b23f6319bca4a54c288cdfdbf0a9c28dc9496 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 15 Nov 2021 19:49:06 +0800 Subject: [PATCH 067/134] PM: hibernate: Get block device exclusively in swsusp_check() mainline inclusion from mainline-v5.17 commit 39fbef4b0f77f9c89c8f014749ca533643a37c9f category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=39fbef4b0f77f9c89c8f014749ca533643a37c9f Signed-off-by: Yu Changchun ----------------------------------------------- The following kernel crash can be triggered: [ 89.266592] ------------[ cut here ]------------ [ 89.267427] kernel BUG at fs/buffer.c:3020! 
[ 89.268264] invalid opcode: 0000 [#1] SMP KASAN PTI [ 89.269116] CPU: 7 PID: 1750 Comm: kmmpd-loop0 Not tainted 5.10.0-862.14.0.6.x86_64-08610-gc932cda3cef4-dirty #20 [ 89.273169] RIP: 0010:submit_bh_wbc.isra.0+0x538/0x6d0 [ 89.277157] RSP: 0018:ffff888105ddfd08 EFLAGS: 00010246 [ 89.278093] RAX: 0000000000000005 RBX: ffff888124231498 RCX: ffffffffb2772612 [ 89.279332] RDX: 1ffff11024846293 RSI: 0000000000000008 RDI: ffff888124231498 [ 89.280591] RBP: ffff8881248cc000 R08: 0000000000000001 R09: ffffed1024846294 [ 89.281851] R10: ffff88812423149f R11: ffffed1024846293 R12: 0000000000003800 [ 89.283095] R13: 0000000000000001 R14: 0000000000000000 R15: ffff8881161f7000 [ 89.284342] FS: 0000000000000000(0000) GS:ffff88839b5c0000(0000) knlGS:0000000000000000 [ 89.285711] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 89.286701] CR2: 00007f166ebc01a0 CR3: 0000000435c0e000 CR4: 00000000000006e0 [ 89.287919] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 89.289138] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 89.290368] Call Trace: [ 89.290842] write_mmp_block+0x2ca/0x510 [ 89.292218] kmmpd+0x433/0x9a0 [ 89.294902] kthread+0x2dd/0x3e0 [ 89.296268] ret_from_fork+0x22/0x30 [ 89.296906] Modules linked in: by running the following commands: 1. mkfs.ext4 -O mmp /dev/sda -b 1024 2. mount /dev/sda /home/test 3. echo "/dev/sda" > /sys/power/resume That happens because swsusp_check() calls set_blocksize() on the target partition which confuses the file system: Thread1 Thread2 mount /dev/sda /home/test get s_mmp_bh --> has mapped flag start kmmpd thread echo "/dev/sda" > /sys/power/resume resume_store software_resume swsusp_check set_blocksize truncate_inode_pages_range truncate_cleanup_page block_invalidatepage discard_buffer --> clean mapped flag write_mmp_block submit_bh submit_bh_wbc BUG_ON(!buffer_mapped(bh)) To address this issue, modify swsusp_check() to open the target block device with exclusive access. Signed-off-by: Ye Bin [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- kernel/power/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 72e33054a2e1..c9126606fa6f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1521,9 +1521,10 @@ int swsusp_read(unsigned int *flags_p) int swsusp_check(void) { int error; + void *holder; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ, NULL); + FMODE_READ | FMODE_EXCL, &holder); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1545,7 +1546,7 @@ int swsusp_check(void) put: if (error) - blkdev_put(hib_resume_bdev, FMODE_READ); + blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL); else pr_debug("Image signature found, resuming\n"); } else {
-- Gitee From ef9d20b69f202dc9003424ebe9fdb10df41ed52f Mon Sep 17 00:00:00 2001
From: Ye Bin
Date: Mon, 15 Nov 2021 19:49:07 +0800
Subject: [PATCH 068/134] nbd: Fix use-after-free in pid_show
mainline inclusion from mainline-v5.16 commit 0c98057be9efa32de78dbc4685fc73da9d71faa1 category: bugfix issue: #I4PYGY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c98057be9efa32de78dbc4685fc73da9d71faa1
Signed-off-by: Yu Changchun
-----------------------------------------------
I got an issue as follows: [ 263.886511] BUG: KASAN: use-after-free in pid_show+0x11f/0x13f [ 263.888359] Read of size 4 at addr ffff8880bf0648c0 by task cat/746 [ 263.890479] CPU: 0 PID: 746 Comm: cat Not tainted 4.19.90-dirty #140 [ 263.893162] Call Trace: [ 263.893509] dump_stack+0x108/0x15f [ 263.893999] print_address_description+0xa5/0x372 [ 263.894641] kasan_report.cold+0x236/0x2a8 [ 263.895696] __asan_report_load4_noabort+0x25/0x30 [ 263.896365] pid_show+0x11f/0x13f [ 263.897422] dev_attr_show+0x48/0x90 [ 263.898361] sysfs_kf_seq_show+0x24d/0x4b0 [ 263.899479] kernfs_seq_show+0x14e/0x1b0 [ 263.900029] seq_read+0x43f/0x1150 [ 263.900499] kernfs_fop_read+0xc7/0x5a0 [ 263.903764] vfs_read+0x113/0x350 [ 263.904231] ksys_read+0x103/0x270 [ 263.905230] __x64_sys_read+0x77/0xc0 [ 263.906284] do_syscall_64+0x106/0x360 [ 263.906797] entry_SYSCALL_64_after_hwframe+0x44/0xa9
Reproduce this issue as follows: 1. nbd-server 8000 /tmp/disk 2. nbd-client localhost 8000 /dev/nbd1 3. cat /sys/block/nbd1/pid This triggers the use-after-free in pid_show(). The reason is that after step '2' the nbd-client process has already exited, so its task_struct has already been freed. To solve this issue, revert part of commit 6521d39a64b3's change and remove the now-useless 'task_recv' member of nbd_device.
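The shape of the fix, sketched from the hunks below: cache the plain pid number when the device is started, and let the sysfs handler print that cached integer instead of dereferencing a task_struct that may already be gone.

	/* nbd_start_device(): a plain integer is safe to keep around */
	nbd->pid = task_pid_nr(current);

	static ssize_t pid_show(struct device *dev,
				struct device_attribute *attr, char *buf)
	{
		struct gendisk *disk = dev_to_disk(dev);
		struct nbd_device *nbd = (struct nbd_device *)disk->private_data;

		/* no task_struct dereference, so no use-after-free */
		return sprintf(buf, "%d\n", nbd->pid);
	}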
Fixes: 6521d39a64b3 ("nbd: Remove variable 'pid'") Signed-off-by: Ye Bin Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20211020073959.2679255-1-yebin10@huawei.com Signed-off-by: Jens Axboe conflicts: drivers/block/nbd.c Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/nbd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 051ba02f996f..f3896f1c27f4 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -114,11 +114,11 @@ struct nbd_device { struct workqueue_struct *recv_workq; struct list_head list; - struct task_struct *task_recv; struct task_struct *task_setup; struct completion *destroy_complete; unsigned long flags; + pid_t pid; /* pid of nbd-client, if attached */ }; #define NBD_CMD_REQUEUED 1 @@ -214,7 +214,7 @@ static ssize_t pid_show(struct device *dev, struct gendisk *disk = dev_to_disk(dev); struct nbd_device *nbd = (struct nbd_device *)disk->private_data; - return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); + return sprintf(buf, "%d\n", nbd->pid); } static const struct device_attribute pid_attr = { @@ -333,7 +333,7 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, struct nbd_config *config = nbd->config; config->blksize = blocksize; config->bytesize = blocksize * nr_blocks; - if (nbd->task_recv != NULL) + if (nbd->pid) nbd_size_update(nbd, false); } @@ -1284,7 +1284,7 @@ static void nbd_config_put(struct nbd_device *nbd) if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags)) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - nbd->task_recv = NULL; + nbd->pid = 0; nbd_clear_sock(nbd); if (config->num_connections) { int i; @@ -1319,7 +1319,7 @@ static int nbd_start_device(struct nbd_device *nbd) int num_connections = config->num_connections; int error = 0, i; - if (nbd->task_recv) + if (nbd->pid) return -EBUSY; if (!config->socks) return -EINVAL; @@ -1338,7 +1338,7 @@ static int nbd_start_device(struct nbd_device *nbd) } blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); - nbd->task_recv = current; + nbd->pid = task_pid_nr(current); nbd_parse_flags(nbd); @@ -1612,8 +1612,8 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) { struct nbd_device *nbd = s->private; - if (nbd->task_recv) - seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv)); + if (nbd->pid) + seq_printf(s, "recv: %d\n", nbd->pid); return 0; } @@ -2188,7 +2188,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) mutex_lock(&nbd->config_lock); config = nbd->config; if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || - !nbd->task_recv) { + !nbd->pid) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); ret = -EINVAL; -- Gitee From f6278454bd9d4d1218614d1ea809a753fc05d607 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Mon, 15 Nov 2021 19:49:08 +0800 Subject: [PATCH 069/134] Fix NULL pointer dereference in handling for passthrough commands maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- In passthrough path. If the command size for ioctl request from userspace is 0. The original process will get cmd_len from cmd->cmnd, but It has not been assigned at this time. So it will trigger a NULL pointer BUG. 
------------[ cut here ]------------ BUG: kernel NULL pointer dereference, address: 0000000000000000 PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page RIP: 0010:scsi_queue_rq+0xcb2/0x12b0 Call Trace: blk_mq_dispatch_rq_list+0x541/0xe90 __blk_mq_sched_dispatch_requests+0x1fe/0x2b0 blk_mq_sched_dispatch_requests+0xbf/0x130 __blk_mq_run_hw_queue+0x15b/0x230 __blk_mq_delay_run_hw_queue+0x18f/0x320 blk_mq_run_hw_queue+0x252/0x280 blk_mq_sched_insert_request+0x228/0x260 blk_execute_rq+0x111/0x160 sg_io+0x51a/0x740 scsi_cmd_ioctl+0x533/0x910 scsi_cmd_blk_ioctl+0xa1/0xb0 cdrom_ioctl+0x3f/0x2510 sr_block_ioctl+0x142/0x180 blkdev_ioctl+0x398/0x450 block_ioctl+0x6d/0x80 __se_sys_ioctl+0xd1/0x140 __x64_sys_ioctl+0x3f/0x50 do_syscall_64+0x37/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9
We can trigger the above BUG with the ioctl sequence below. ------------[ cut here ]------------ sg_io_hdr_t *addr; addr = malloc(sizeof(sg_io_hdr_t)); memset(addr, 0, sizeof(sg_io_hdr_t)); addr->interface_id = 'S'; fd = open("/dev/sr0", O_RDONLY); // open a CD-ROM dev ioctl(fd, SG_IO, addr); // all zero sg_io_hdr_t will trigger this bug
Fixes: 2ceda20f0a99a ("scsi: core: Move command size detection out of the fast path") Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun
--- drivers/scsi/scsi_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d89db29fa829..b26c922bd65d 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1193,9 +1193,9 @@ static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, } cmd->cmd_len = scsi_req(req)->cmd_len; + cmd->cmnd = scsi_req(req)->cmd; if (cmd->cmd_len == 0) cmd->cmd_len = scsi_command_size(cmd->cmnd); - cmd->cmnd = scsi_req(req)->cmd; cmd->transfersize = blk_rq_bytes(req); cmd->allowed = scsi_req(req)->retries; return BLK_STS_OK;
-- Gitee From 4fa4dba9c1cbf355fdfead50c2006e5917f6055f Mon Sep 17 00:00:00 2001
From: Laibin Qiu
Date: Mon, 15 Nov 2021 19:49:09 +0800
Subject: [PATCH 070/134] loop: fix loop_validate_block_size() can't make sense
maillist inclusion category: bugfix issue: #I4PYGY CVE: NA
Signed-off-by: Yu Changchun
---------------------------
We configure the block size of a loop device through the following call chain: lo_ioctl(..., unsigned long arg) | ^^^^ lo_simple_ioctl(..., unsigned long arg) | ^^^^ loop_set_block_size(..., unsigned long arg) | ^^^^ loop_validate_block_size(unsigned short bsize) | ^^^^^ blk_queue_logical_block_size(..., unsigned int size) { ''' limits->logical_block_size = size; ^^^^ int ''' }
loop_validate_block_size() will check the validity of bsize, but the unsigned long argument is truncated to unsigned short (e.g. arg=1049600 becomes 1024 after truncation). The block size must be within the range of 512 ~ PAGE_SIZE, so this truncation can turn an illegal block size into a legal one. The wrong block size will then raise other errors.
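The truncation itself can be demonstrated with a few lines of standalone userspace C (an illustration, not driver code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long arg = 1049600;	/* 0x100400, an illegal block size */
		unsigned short bsize = arg;	/* low 16 bits only: 0x0400 */

		printf("bsize = %hu\n", bsize);	/* prints 1024, now "legal" */
		return 0;
	}

Taking the parameter as unsigned long, as the hunk below does, keeps the full value for the range and power-of-two checks.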
Fixes: 3448914e8cc55 ("loop: Add LOOP_CONFIGURE ioctl") Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f0fa0c8e7ec6..5dd8bd480e29 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -233,7 +233,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) * @bsize: size to validate */ static int -loop_validate_block_size(unsigned short bsize) +loop_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) return -EINVAL; -- Gitee From daecdc92a6e81384346aea6558c1024fadcbfe49 Mon Sep 17 00:00:00 2001 From: Zheng Liang Date: Mon, 15 Nov 2021 19:50:16 +0800 Subject: [PATCH 071/134] block, bfq: fix UAF problem in bfqg_stats_init() mainline inclusion from mainline commit 2fc428f6b7ca80794cb9928c90d4de524366659f category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2fc428f6b7ca80794cb9928c90d4de524366659f Signed-off-by: Yu Changchun ------------------------------------------------- In bfq_pd_alloc(), the function bfqg_stats_init() init bfqg. If blkg_rwstat_init() init bfqg_stats->bytes successful and init bfqg_stats->ios failed, bfqg_stats_init() return failed, bfqg will be freed. But blkg_rwstat->cpu_cnt is not deleted from the list of percpu_counters. If we traverse the list of percpu_counters, It will have UAF problem. we should use blkg_rwstat_exit() to cleanup bfqg_stats bytes in the above scenario. Fixes: commit fd41e60331b ("bfq-iosched: stop using blkg->stat_bytes and ->stat_ios") Signed-off-by: Zheng Liang Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20211018024225.1493938-1-zhengliang6@huawei.com Signed-off-by: Jens Axboe Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/bfq-cgroup.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b791e2041e49..a6bcc779c912 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) - return -ENOMEM; + goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || @@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || - bfq_stat_init(&stats->empty_time, gfp)) { - bfqg_stats_exit(stats); - return -ENOMEM; - } + bfq_stat_init(&stats->empty_time, gfp)) + goto error; #endif return 0; + +error: + bfqg_stats_exit(stats); + return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -- Gitee From 7ec45656ed1dc5ccea18512c208da6e77ddab701 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 15 Nov 2021 19:50:17 +0800 Subject: [PATCH 072/134] ext4: ensure enough credits in ext4_ext_shift_path_extents maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Like ext4_ext_rm_leaf, we can ensure that there are enough credits before every call that will consume credits. 
As part of this fix we fold the functionality of ext4_access_path()
into ext4_ext_shift_path_extents(). This change is needed as a
preparation for the next bugfix patch.

Cc: stable@kernel.org
Link: https://lore.kernel.org/r/20210903062748.4118886-3-yangerkun@huawei.com
Signed-off-by: yangerkun
Reviewed-by: Jan Kara
Signed-off-by: Theodore Ts'o
Reviewed-by: Zhang Yi
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ext4/extents.c | 49 +++++++++++++++--------------------------------
 1 file changed, 15 insertions(+), 34 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74b69d83c198..a16be67a1fe8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5014,36 +5014,6 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo);
 }
 
-/*
- * ext4_access_path:
- * Function to access the path buffer for marking it dirty.
- * It also checks if there are sufficient credits left in the journal handle
- * to update path.
- */
-static int
-ext4_access_path(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path)
-{
-	int credits, err;
-
-	if (!ext4_handle_valid(handle))
-		return 0;
-
-	/*
-	 * Check if need to extend journal credits
-	 * 3 for leaf, sb, and inode plus 2 (bmap and group
-	 * descriptor) for each block group; assume two block
-	 * groups
-	 */
-	credits = ext4_writepage_trans_blocks(inode);
-	err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
-	if (err < 0)
-		return err;
-
-	err = ext4_ext_get_access(handle, inode, path);
-	return err;
-}
-
 /*
  * ext4_ext_shift_path_extents:
  * Shift the extents of a path structure lying between path[depth].p_ext
@@ -5058,6 +5028,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 	int depth, err = 0;
 	struct ext4_extent *ex_start, *ex_last;
 	bool update = false;
+	int credits, restart_credits;
 
 	depth = path->p_depth;
 
 	while (depth >= 0) {
@@ -5067,13 +5038,23 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 			return -EFSCORRUPTED;
 
 		ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+		/* leaf + sb + inode */
+		credits = 3;
+		if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+			update = true;
+			/* extent tree + sb + inode */
+			credits = depth + 2;
+		}
 
-		err = ext4_access_path(handle, inode, path + depth);
+		restart_credits = ext4_writepage_trans_blocks(inode);
+		err = ext4_datasem_ensure_credits(handle, inode, credits,
+				restart_credits, 0);
 		if (err)
 			goto out;
 
-		if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
-			update = true;
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
 
 		while (ex_start <= ex_last) {
 			if (SHIFT == SHIFT_LEFT) {
@@ -5104,7 +5085,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 		}
 
 		/* Update index too */
-		err = ext4_access_path(handle, inode, path + depth);
+		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			goto out;
 
-- 
Gitee

From 1de907a470a804238764822c1dee2a00100e3eab Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Mon, 15 Nov 2021 19:50:18 +0800
Subject: [PATCH 073/134] ext4: refresh the ext4_ext_path struct after
 dropping i_data_sem.

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

After we drop i_data_sem, we need to reload the ext4_ext_path structure
since the extent tree can change once i_data_sem is released.
This addresses the BUG: [52117.465187] ------------[ cut here ]------------ [52117.465686] kernel BUG at fs/ext4/extents.c:1756! ... [52117.478306] Call Trace: [52117.478565] ext4_ext_shift_extents+0x3ee/0x710 [52117.479020] ext4_fallocate+0x139c/0x1b40 [52117.479405] ? __do_sys_newfstat+0x6b/0x80 [52117.479805] vfs_fallocate+0x151/0x4b0 [52117.480177] ksys_fallocate+0x4a/0xa0 [52117.480533] __x64_sys_fallocate+0x22/0x30 [52117.480930] do_syscall_64+0x35/0x80 [52117.481277] entry_SYSCALL_64_after_hwframe+0x44/0xae [52117.481769] RIP: 0033:0x7fa062f855ca Cc: stable@kernel.org Link: https://lore.kernel.org/r/20210903062748.4118886-4-yangerkun@huawei.com Signed-off-by: yangerkun Signed-off-by: Theodore Ts'o Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/extents.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a16be67a1fe8..f1548e52496c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5049,8 +5049,11 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, restart_credits = ext4_writepage_trans_blocks(inode); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); - if (err) + if (err) { + if (err > 0) + err = -EAGAIN; goto out; + } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -5124,6 +5127,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, int ret = 0, depth; struct ext4_extent *extent; ext4_lblk_t stop, *iterator, ex_start, ex_end; + ext4_lblk_t tmp = EXT_MAX_BLOCKS; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, @@ -5177,11 +5181,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * till we reach stop. In case of right shift, iterator points to stop * and it is decreased till we reach start. */ +again: if (SHIFT == SHIFT_LEFT) iterator = &start; else iterator = &stop; + if (tmp != EXT_MAX_BLOCKS) + *iterator = tmp; + /* * Its safe to start updating extents. Start and stop are unsigned, so * in case of right shift if extent with 0 block is reached, iterator @@ -5210,6 +5218,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, } } + tmp = *iterator; if (SHIFT == SHIFT_LEFT) { extent = EXT_LAST_EXTENT(path[depth].p_hdr); *iterator = le32_to_cpu(extent->ee_block) + @@ -5228,6 +5237,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT); + /* iterator can be NULL which means we should break */ + if (ret == -EAGAIN) + goto again; if (ret) break; } -- Gitee From 846ad258d3d81cac41a2b9e5f91449b81d7de615 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 15 Nov 2021 19:50:20 +0800 Subject: [PATCH 074/134] blk-mq: Introduce the BLK_MQ_F_NO_SCHED_BY_DEFAULT flag mainline inclusion from mainline-5.15-rc1 commit 90b7198001f23ea37d3b46dc631bdaa2357a20b1 category: performance issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=90b7198001f23ea37d3b46dc631bdaa2357a20b1 Signed-off-by: Yu Changchun --------------------------- elevator_get_default() uses the following algorithm to select an I/O scheduler from inside add_disk(): - In case of a single hardware queue or if sharing hardware queues across multiple request queues (BLK_MQ_F_TAG_HCTX_SHARED), use mq-deadline. - Otherwise, use 'none'. 
This is a good choice for most but not for all block drivers. Make it
possible to override the selection of mq-deadline with a new flag,
namely BLK_MQ_F_NO_SCHED_BY_DEFAULT.

Cc: Christoph Hellwig
Cc: Ming Lei
Cc: Tetsuo Handa
Cc: Martijn Coenen
Cc: Jaegeuk Kim
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20210805174200.3250718-2-bvanassche@acm.org
Signed-off-by: Jens Axboe

Conflicts:
	block/elevator.c

Signed-off-by: yangerkun
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/elevator.c       | 3 +++
 include/linux/blk-mq.h | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/block/elevator.c b/block/elevator.c
index 2a525863d4e9..4ce6b22813a1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -624,6 +624,9 @@ static inline bool elv_support_iosched(struct request_queue *q)
  */
 static struct elevator_type *elevator_get_default(struct request_queue *q)
 {
+	if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
+		return NULL;
+
 	if (q->nr_hw_queues != 1)
 		return NULL;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 92bbc9a72355..eee2c8a16601 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -399,7 +399,13 @@ enum {
 	BLK_MQ_F_STACKING	= 1 << 2,
 	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
+	/* Do not allow an I/O scheduler to be configured. */
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
+	/*
+	 * Select 'none' during queue registration in case of a single hwq
+	 * or shared hwqs instead of 'mq-deadline'.
+	 */
+	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 7,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
-- 
Gitee

From df8dbd6a2d544b655f74b385a8d18f9a5f78b15d Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Mon, 15 Nov 2021 19:50:21 +0800
Subject: [PATCH 075/134] loop: Select I/O scheduler 'none' from inside
 add_disk()

mainline inclusion
from mainline-5.15-rc1
commit 2112f5c1330a671fa852051d85cb9eadc05d7eb7
category: performance
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2112f5c1330a671fa852051d85cb9eadc05d7eb7

Signed-off-by: Yu Changchun

---------------------------

We noticed that the user interface of Android devices becomes very slow
under memory pressure. This is because Android uses the zram driver on
top of the loop driver for swapping; under memory pressure the swap code
alternates reads and writes quickly, mq-deadline is the default
scheduler for loop devices, and mq-deadline delays writes by five
seconds for such a workload with default settings. Fix this by making
the kernel select I/O scheduler 'none' from inside add_disk() for loop
devices. This default can be overridden at any time from user space,
e.g. via a udev rule. This approach has an advantage compared to
changing the I/O scheduler from userspace from 'mq-deadline' into
'none', namely that synchronize_rcu() does not get called.

This patch changes the default I/O scheduler for loop devices from
'mq-deadline' into 'none'. Additionally, this patch reduces the Android
boot time on my test setup by 0.5 seconds compared to configuring the
loop I/O scheduler from user space.
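Since the message mentions overriding the default from user space, here
is a minimal hypothetical sketch of doing so through sysfs (the device
name loop0 is an assumption):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* re-enable mq-deadline on one loop device from user space */
	int fd = open("/sys/block/loop0/queue/scheduler", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, "mq-deadline", 11);
	close(fd);
	return 0;
}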
Cc: Christoph Hellwig
Cc: Ming Lei
Cc: Tetsuo Handa
Cc: Martijn Coenen
Cc: Jaegeuk Kim
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20210805174200.3250718-3-bvanassche@acm.org
Signed-off-by: Jens Axboe
Signed-off-by: yangerkun
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/block/loop.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5dd8bd480e29..84f3191f4566 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2117,7 +2117,8 @@ static int loop_add(struct loop_device **l, int i)
 	lo->tag_set.queue_depth = 128;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
+	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING |
+		BLK_MQ_F_NO_SCHED_BY_DEFAULT;
 	lo->tag_set.driver_data = lo;
 
 	err = blk_mq_alloc_tag_set(&lo->tag_set);
-- 
Gitee

From 79c8108f41abe1c508986a3aceced9884809cc8e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Mon, 15 Nov 2021 19:50:22 +0800
Subject: [PATCH 076/134] ext4: if zeroout fails fall back to splitting the
 extent node

mainline inclusion
from mainline-5.15-rc1
commit 308c57ccf4318236be75dfa251c84713e694457b
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=308c57ccf4318236be75dfa251c84713e694457b

Signed-off-by: Yu Changchun

---------------------------

If the underlying storage device is using thin-provisioning, it's
possible for a zeroout operation to return ENOSPC. Commit df22291ff0fd
("ext4: Retry block allocation if we have free blocks left") added
logic to retry block allocation since we might get free blocks after we
commit a transaction. But the ENOSPC from thin-provisioning will
confuse ext4, and lead to an infinite loop.

Since using zeroout instead of splitting the extent node is an
optimization, if it fails, we might as well fall back to splitting the
extent node.
Reported-by: yangerkun
Signed-off-by: Theodore Ts'o
Signed-off-by: yangerkun
Reviewed-by: Zhang Yi
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ext4/extents.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f1548e52496c..398df993ccd8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3612,7 +3612,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				split_map.m_len - ee_block);
 			err = ext4_ext_zeroout(inode, &zero_ex1);
 			if (err)
-				goto out;
+				goto fallback;
 			split_map.m_len = allocated;
 		}
 		if (split_map.m_lblk - ee_block + split_map.m_len <
@@ -3626,7 +3626,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 						      ext4_ext_pblock(ex));
 				err = ext4_ext_zeroout(inode, &zero_ex2);
 				if (err)
-					goto out;
+					goto fallback;
 			}
 
 			split_map.m_len += split_map.m_lblk - ee_block;
@@ -3635,6 +3635,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		}
 	}
 
+fallback:
 	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
 				flags);
 	if (err > 0)
-- 
Gitee

From d811d9768edcb4cd1ae013f2bcd27f20b979fef5 Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Mon, 15 Nov 2021 19:50:23 +0800
Subject: [PATCH 077/134] ovl: fix use after free in struct ovl_aio_req

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

Example of triggering a use-after-free in an overlay-on-ext4 setup:

aio_read
 ovl_read_iter
  vfs_iter_read
   ext4_file_read_iter
    ext4_dio_read_iter
     iomap_dio_rw -> -EIOCBQUEUED
     /*
      * Here IO is completed in a separate thread,
      * ovl_aio_cleanup_handler() frees aio_req which has iocb embedded
      */
     file_accessed(iocb->ki_filp); /**BOOM**/

Fix by introducing a refcount in ovl_aio_req, similar to aio_kiocb.
This guarantees that iocb is only freed after vfs_read/write_iter()
returns on the underlying fs.
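In sketch form, the two references and who drops them (mirroring the
read path in the diff below):

	refcount_set(&aio_req->ref, 2);	/* one ref for the submitter, one
					 * for the completion handler */
	ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
	ovl_aio_put(aio_req);		/* drop the submitter's ref; the
					 * request is freed only once the
					 * completion side has also put */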
Fixes: 2406a307ac7d ("ovl: implement async IO routines") Signed-off-by: yangerkun Link: https://lore.kernel.org/r/20210930032228.3199690-3-yangerkun@huawei.com/ Cc: # v5.6 Signed-off-by: Miklos Szeredi Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/overlayfs/file.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index f7135777cb4e..d1ae643a999a 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -17,6 +17,7 @@ struct ovl_aio_req { struct kiocb iocb; + refcount_t ref; struct kiocb *orig_iocb; struct fd fd; }; @@ -257,6 +258,14 @@ static rwf_t ovl_iocb_to_rwf(int ifl) return flags; } +static inline void ovl_aio_put(struct ovl_aio_req *aio_req) +{ + if (refcount_dec_and_test(&aio_req->ref)) { + fdput(aio_req->fd); + kmem_cache_free(ovl_aio_request_cachep, aio_req); + } +} + static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) { struct kiocb *iocb = &aio_req->iocb; @@ -273,8 +282,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) } orig_iocb->ki_pos = iocb->ki_pos; - fdput(aio_req->fd); - kmem_cache_free(ovl_aio_request_cachep, aio_req); + ovl_aio_put(aio_req); } static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) @@ -324,7 +332,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) aio_req->orig_iocb = iocb; kiocb_clone(&aio_req->iocb, iocb, real.file); aio_req->iocb.ki_complete = ovl_aio_rw_complete; + refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); + ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) ovl_aio_cleanup_handler(aio_req); } @@ -395,7 +405,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) kiocb_clone(&aio_req->iocb, iocb, real.file); aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; + refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); + ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) ovl_aio_cleanup_handler(aio_req); } -- Gitee From c3f9ad79e2f317e9a72e7e541d5b3624a7025c05 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Nov 2021 19:50:24 +0800 Subject: [PATCH 078/134] net: make free_netdev() more lenient with unregistering devices mainline inclusion from mainline-v5.11-rc4 commit c269a24ce057abfc31130960e96ab197ef6ab196 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c269a24ce057abfc31130960e96ab197ef6ab196 Signed-off-by: Yu Changchun ------------------------------------------------- There are two flavors of handling netdev registration: - ones called without holding rtnl_lock: register_netdev() and unregister_netdev(); and - those called with rtnl_lock held: register_netdevice() and unregister_netdevice(). While the semantics of the former are pretty clear, the same can't be said about the latter. The netdev_todo mechanism is utilized to perform some of the device unregistering tasks and it hooks into rtnl_unlock() so the locked variants can't actually finish the work. In general free_netdev() does not mix well with locked calls. Most drivers operating under rtnl_lock set dev->needs_free_netdev to true and expect core to make the free_netdev() call some time later. The part where this becomes most problematic is error paths. 
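The error-path shape in question, which this patch makes safe to write,
is roughly the following sketch (rtnl_lock held):

	err = register_netdevice(dev);
	if (err < 0) {
		/* safe even though the unwind may still be running:
		 * free_netdev() defers the free while the device is
		 * in NETREG_UNREGISTERING state */
		free_netdev(dev);
		goto out;
	}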
There is no way to unwind the state cleanly after a call to register_netdevice(), since unreg can't be performed fully without dropping locks. Make free_netdev() more lenient, and defer the freeing if device is being unregistered. This allows error paths to simply call free_netdev() both after register_netdevice() failed, and after a call to unregister_netdevice() but before dropping rtnl_lock. Simplify the error paths which are currently doing gymnastics around free_netdev() handling. Signed-off-by: Jakub Kicinski Signed-off-by: Xu Jia Reviewed-by: Yue Haibing Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- net/8021q/vlan.c | 4 +--- net/core/dev.c | 11 +++++++++++ net/core/rtnetlink.c | 23 ++++++----------------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 15bbfaf943fd..8b644113715e 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -284,9 +284,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) return 0; out_free_newdev: - if (new_dev->reg_state == NETREG_UNINITIALIZED || - new_dev->reg_state == NETREG_UNREGISTERED) - free_netdev(new_dev); + free_netdev(new_dev); return err; } diff --git a/net/core/dev.c b/net/core/dev.c index ae237a498569..b0c3dd268913 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10661,6 +10661,17 @@ void free_netdev(struct net_device *dev) struct napi_struct *p, *n; might_sleep(); + + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. + */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + netif_free_tx_queues(dev); netif_free_rx_queues(dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 27ffa83ffeb3..e08b506163f5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3438,26 +3438,15 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, dev->ifindex = ifm->ifi_index; - if (ops->newlink) { + if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. 
- */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED || - dev->reg_state == NETREG_UNREGISTERED) - free_netdev(dev); - goto out; - } - } else { + else err = register_netdevice(dev); - if (err < 0) { - free_netdev(dev); - goto out; - } + if (err < 0) { + free_netdev(dev); + goto out; } + err = rtnl_configure_link(dev, ifm); if (err < 0) goto out_unregister; -- Gitee From 1bb3e6b185c454190f81d6f3a8f34fcbe49f0f0c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Nov 2021 19:50:25 +0800 Subject: [PATCH 079/134] bpf: Add ambient BPF runtime context stored in current mainline inclusion from mainline-v5.15-rc1 commit c7603cfa04e7c3a435b31d065f7cbdc829428f6e category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c7603cfa04e7c3a435b31d065f7cbdc829428f6e Signed-off-by: Yu Changchun -------------------------------- b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed the problem with cgroup-local storage use in BPF by pre-allocating per-CPU array of 8 cgroup storage pointers to accommodate possible BPF program preemptions and nested executions. While this seems to work good in practice, it introduces new and unnecessary failure mode in which not all BPF programs might be executed if we fail to find an unused slot for cgroup storage, however unlikely it is. It might also not be so unlikely when/if we allow sleepable cgroup BPF programs in the future. Further, the way that cgroup storage is implemented as ambiently-available property during entire BPF program execution is a convenient way to pass extra information to BPF program and helpers without requiring user code to pass around extra arguments explicitly. So it would be good to have a generic solution that can allow implementing this without arbitrary restrictions. Ideally, such solution would work for both preemptable and sleepable BPF programs in exactly the same way. This patch introduces such solution, bpf_run_ctx. It adds one pointer field (bpf_ctx) to task_struct. This field is maintained by BPF_PROG_RUN family of macros in such a way that it always stays valid throughout BPF program execution. BPF program preemption is handled by remembering previous current->bpf_ctx value locally while executing nested BPF program and restoring old value after nested BPF program finishes. This is handled by two helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are supposed to be used before and after BPF program runs, respectively. Restoring old value of the pointer handles preemption, while bpf_run_ctx pointer being a property of current task_struct naturally solves this problem for sleepable BPF programs by "following" BPF program execution as it is scheduled in and out of CPU. It would even allow CPU migration of BPF programs, even though it's not currently allowed by BPF infra. This patch cleans up cgroup local storage handling as a first application. The design itself is generic, though, with bpf_run_ctx being an empty struct that is supposed to be embedded into a specific struct for a given BPF program type (bpf_cg_run_ctx in this case). Follow up patches are planned that will expand this mechanism for other uses within tracing BPF programs. To verify that this change doesn't revert the fix to the original cgroup storage issue, I ran the same repro as in the original report ([0]) and didn't get any problems. 
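The resulting usage pattern around program runs, in sketch form
(mirroring the BPF_PROG_RUN_ARRAY changes below):

	struct bpf_cg_run_ctx run_ctx;
	struct bpf_run_ctx *old_run_ctx;

	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
	run_ctx.prog_item = item;	/* helpers find this via current->bpf_ctx */
	ret = BPF_PROG_RUN(prog, ctx);
	bpf_reset_run_ctx(old_run_ctx);	/* restore the previous context,
					 * which handles nesting/preemption */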
Replacing bpf_reset_run_ctx(old_run_ctx) with bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so repro does work). [0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/ Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org Conflicts: include/linux/bpf.h include/linux/sched.h kernel/fork.c net/bpf/test_run.c Signed-off-by: Pu Lehui Reviewed-by: Kuohai Xu Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/bpf-cgroup.h | 54 -------------------------------------- include/linux/bpf.h | 45 ++++++++++++++++++++++--------- include/linux/sched.h | 5 ++++ kernel/bpf/helpers.c | 16 ++++------- kernel/bpf/local_storage.c | 3 --- kernel/fork.c | 3 +++ net/bpf/test_run.c | 22 ++++++++-------- 7 files changed, 57 insertions(+), 91 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 91b966978541..5120ca319ff4 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -27,19 +27,6 @@ struct task_struct; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) -#define BPF_CGROUP_STORAGE_NEST_MAX 8 - -struct bpf_cgroup_storage_info { - struct task_struct *task; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; -}; - -/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks - * to use bpf cgroup storage simultaneously. - */ -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -167,44 +154,6 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type( return BPF_CGROUP_STORAGE_SHARED; } -static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - enum bpf_cgroup_storage_type stype; - int i, err = 0; - - preempt_disable(); - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, current); - for_each_cgroup_storage_type(stype) - this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], - storage[stype]); - goto out; - } - err = -EBUSY; - WARN_ON_ONCE(1); - -out: - preempt_enable(); - return err; -} - -static inline void bpf_cgroup_storage_unset(void) -{ - int i; - - for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { - if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); - return; - } -} - struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); @@ -450,9 +399,6 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } -static inline int bpf_cgroup_storage_set( - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } -static inline void bpf_cgroup_storage_unset(void) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 474a0d852614..88245386a4b6 100644 --- 
a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1081,11 +1081,20 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); +struct bpf_run_ctx {}; + +struct bpf_cg_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_prog_array_item *prog_item; +}; + #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ migrate_disable(); \ rcu_read_lock(); \ @@ -1093,17 +1102,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, if (unlikely(check_non_null && !_array))\ goto _out; \ _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - if (!set_cg_storage) { \ - _ret &= func(_prog, ctx); \ - } else { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ - _ret &= func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ - } \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ + while ((_prog = READ_ONCE(_item->prog))) { \ + run_ctx.prog_item = _item; \ + _ret &= func(_prog, ctx); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ _out: \ rcu_read_unlock(); \ migrate_enable(); \ @@ -1137,6 +1142,8 @@ _out: \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 ret; \ u32 _ret = 1; \ u32 _cn = 0; \ @@ -1144,15 +1151,15 @@ _out: \ rcu_read_lock(); \ _array = rcu_dereference(array); \ _item = &_array->items[0]; \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ while ((_prog = READ_ONCE(_item->prog))) { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ + run_ctx.prog_item = _item; \ ret = func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ _ret &= (ret & 1); \ _cn |= (ret & 2); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ rcu_read_unlock(); \ migrate_enable(); \ if (_ret) \ @@ -1202,6 +1209,20 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx; + + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ + current->bpf_ctx = old_ctx; +} + extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/sched.h b/include/linux/sched.h index b85b26d9ccef..53198ac3d154 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -40,6 +40,7 @@ struct audit_context; struct backing_dev_info; struct bio_list; struct blk_plug; +struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1340,6 +1341,10 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void *security; #endif +#ifdef CONFIG_BPF_SYSCALL + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; +#endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0efe7c7bfe5e..bb4350de9f11 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -372,8 +372,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(struct 
bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -382,17 +380,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; void *ptr; - int i; - for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { - if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); - break; - } + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index b139247d2dd3..49ca8771d470 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -11,9 +11,6 @@ #ifdef CONFIG_CGROUP_BPF -DEFINE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ diff --git a/kernel/fork.c b/kernel/fork.c index f019852b2e61..39b1783a7613 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2072,6 +2072,9 @@ static __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif +#ifdef CONFIG_BPF_SYSCALL + p->bpf_ctx = NULL; +#endif /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index eb684f31fd69..99712d35e535 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -19,18 +19,20 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_prog_array_item item = {.prog = prog}; + struct bpf_run_ctx *old_ctx; + struct bpf_cg_run_ctx run_ctx; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; int ret = 0; u32 i; for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; + item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(item.cgroup_storage[stype])) { + item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } @@ -41,18 +43,15 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, rcu_read_lock(); migrate_disable(); time_start = ktime_get_ns(); + old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); for (i = 0; i < repeat; i++) { - ret = bpf_cgroup_storage_set(storage); - if (ret) - break; + run_ctx.prog_item = &item; if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx); - bpf_cgroup_storage_unset(); - if (signal_pending(current)) { ret = -EINTR; break; @@ -70,6 +69,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, time_start = ktime_get_ns(); } } + bpf_reset_run_ctx(old_ctx); time_spent += ktime_get_ns() - time_start; migrate_enable(); rcu_read_unlock(); @@ -78,7 +78,7 @@ 
static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 	*time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
 
 	for_each_cgroup_storage_type(stype)
-		bpf_cgroup_storage_free(storage[stype]);
+		bpf_cgroup_storage_free(item.cgroup_storage[stype]);
 
 	return ret;
 }
-- 
Gitee

From 79a5b091735baf1f3580e94fa623b9683b958fe4 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:50:26 +0800
Subject: [PATCH 080/134] nbd: add sanity check for first_minor

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

When a user passes 0x100000 as the index, nbd ends up creating the
sysfs dir "/sys/block/43:0":

nbd_dev_add
 disk->first_minor = index << part_shift
 -> default part_shift is 5, 0x100000 << 5 = 0x2000000
 device_add_disk
  blk_alloc_devt
   MKDEV(disk->major, disk->first_minor + part->partno)
   -> (0x2b << 20) | (0x2000000) = 0x2b00000
  register_disk
   device_add
    device_create_sys_dev_entry
     format_dev_t
      MAJOR(devt) -> 0x2b00000 >> 20 = 0x2b
      MINOR(devt) -> 0x2b00000 & 0xfffff = 0
     sysfs_create_link -> /sys/block/43:0

If nbd has already created a device with index 0, sysfs will then
complain about the duplicated creation. Similarly, a duplicated
creation will happen if "index << part_shift" overflows to a value that
is less than MINORMASK. Fix the problem by adding a sanity check for
first_minor.

Fixes: b0d9111a2d53 ("nbd: use an idr to keep track of nbd devices")
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/block/nbd.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f3896f1c27f4..07b06fc6f70e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1821,7 +1821,18 @@ static int nbd_dev_add(int index)
 	refcount_set(&nbd->refs, 1);
 	INIT_LIST_HEAD(&nbd->list);
 	disk->major = NBD_MAJOR;
+
+	/*
+	 * Too big index can cause duplicate creation of sysfs files/links,
+	 * because MKDEV() expect that the max first minor is MINORMASK, or
+	 * index << part_shift can overflow.
+	 */
 	disk->first_minor = index << part_shift;
+	if (disk->first_minor < index || disk->first_minor > MINORMASK) {
+		err = -EINVAL;
+		goto out_free_tags;
+	}
+
 	disk->fops = &nbd_fops;
 	disk->private_data = nbd;
 	sprintf(disk->disk_name, "nbd%d", index);
-- 
Gitee

From 426e5a4f192cce28968b64011bf7f2c34bf95b58 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 15 Nov 2021 19:50:27 +0800
Subject: [PATCH 081/134] io_uring: deduplicate failing task_work_add

mainline inclusion
from mainline-5.12-rc1
commit eab30c4d20dc761d463445e5130421863ff81505
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=eab30c4d20dc761d463445e5130421863ff81505

Signed-off-by: Yu Changchun

---------------------------

When io_req_task_work_add() fails, the request will be cancelled by
enqueueing via task_works of io-wq. Extract a function for that.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe

Conflicts:
	fs/io_uring.c
[ 355fb9e2b78e78("io_uring: remove 'twa_signal_ok' deadlock work-around")
is not applied.
] Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/io_uring.c | 46 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 104dff9c7131..82b301871224 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2056,6 +2056,16 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) return ret; } +static void io_req_task_work_add_fallback(struct io_kiocb *req, + void (*cb)(struct callback_head *)) +{ + struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq); + + init_task_work(&req->task_work, cb); + task_work_add(tsk, &req->task_work, TWA_NONE); + wake_up_process(tsk); +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; @@ -2113,14 +2123,8 @@ static void io_req_task_queue(struct io_kiocb *req) percpu_ref_get(&req->ctx->refs); ret = io_req_task_work_add(req, true); - if (unlikely(ret)) { - struct task_struct *tsk; - - init_task_work(&req->task_work, io_req_task_cancel); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_req_task_cancel); } static void io_queue_next(struct io_kiocb *req) @@ -2239,13 +2243,8 @@ static void io_free_req_deferred(struct io_kiocb *req) init_task_work(&req->task_work, io_put_req_deferred_cb); ret = io_req_task_work_add(req, true); - if (unlikely(ret)) { - struct task_struct *tsk; - - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_put_req_deferred_cb); } static inline void io_put_req_deferred(struct io_kiocb *req, int refs) @@ -3345,15 +3344,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); ret = io_req_task_work_add(req, true); - if (unlikely(ret)) { - struct task_struct *tsk; - - /* queue just for cancelation */ - init_task_work(&req->task_work, io_req_task_cancel); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); - } + if (unlikely(ret)) + io_req_task_work_add_fallback(req, io_req_task_cancel); return 1; } @@ -4963,12 +4955,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, */ ret = io_req_task_work_add(req, twa_signal_ok); if (unlikely(ret)) { - struct task_struct *tsk; - WRITE_ONCE(poll->canceled, true); - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); + io_req_task_work_add_fallback(req, func); } return 1; } -- Gitee From 99af00b884a4beb84425bed3c28403d63947c325 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Nov 2021 19:50:28 +0800 Subject: [PATCH 082/134] io_uring: don't take uring_lock during iowq cancel mainline inclusion from mainline-5.12-rc1 commit 792bb6eb862333658bf1bd2260133f0507e2da8d category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=792bb6eb862333658bf1bd2260133f0507e2da8d Signed-off-by: Yu Changchun --------------------------- [ 97.866748] a.out/2890 is trying to acquire lock: [ 97.867829] ffff8881046763e8 (&ctx->uring_lock){+.+.}-{3:3}, at: io_wq_submit_work+0x155/0x240 [ 97.869735] [ 97.869735] but 
task is already holding lock:
[ 97.871033] ffff88810dfe0be8 (&ctx->uring_lock){+.+.}-{3:3}, at: __x64_sys_io_uring_enter+0x3f0/0x5b0
[ 97.873074]
[ 97.873074] other info that might help us debug this:
[ 97.874520] Possible unsafe locking scenario:
[ 97.874520]
[ 97.875845]       CPU0
[ 97.876440]       ----
[ 97.877048]  lock(&ctx->uring_lock);
[ 97.877961]  lock(&ctx->uring_lock);
[ 97.878881]
[ 97.878881] *** DEADLOCK ***
[ 97.878881]
[ 97.880341] May be due to missing lock nesting notation
[ 97.880341]
[ 97.881952] 1 lock held by a.out/2890:
[ 97.882873] #0: ffff88810dfe0be8 (&ctx->uring_lock){+.+.}-{3:3}, at: __x64_sys_io_uring_enter+0x3f0/0x5b0
[ 97.885108]
[ 97.885108] stack backtrace:
[ 97.890457] Call Trace:
[ 97.891121]  dump_stack+0xac/0xe3
[ 97.891972]  __lock_acquire+0xab6/0x13a0
[ 97.892940]  lock_acquire+0x2c3/0x390
[ 97.894894]  __mutex_lock+0xae/0x9f0
[ 97.901101]  io_wq_submit_work+0x155/0x240
[ 97.902112]  io_wq_cancel_cb+0x162/0x490
[ 97.904126]  io_async_find_and_cancel+0x3b/0x140
[ 97.905247]  io_issue_sqe+0x86d/0x13e0
[ 97.909122]  __io_queue_sqe+0x10b/0x550
[ 97.913971]  io_queue_sqe+0x235/0x470
[ 97.914894]  io_submit_sqes+0xcce/0xf10
[ 97.917872]  __x64_sys_io_uring_enter+0x3fb/0x5b0
[ 97.921424]  do_syscall_64+0x2d/0x40
[ 97.922329]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

While holding uring_lock, e.g. from inline execution, an async cancel
request may attempt cancellations through io_wq_submit_work, which may
try to grab a lock. Delay it to task_work, so we do it from a clean
context and don't have to worry about locking.

Cc:  # 5.5+
Fixes: c07e6719511e ("io_uring: hold uring_lock while completing failed polled io in io_wq_submit_work()")
Reported-by: Abaci
Reported-by: Hao Xu
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe

Conflicts:
[ 5280f7e530f7("io_uring/io-wq: return 2-step work swap scheme")
is not applied. ]

Signed-off-by: Zhihao Cheng
Reviewed-by: Zhang Yi
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/io_uring.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 82b301871224..f3078708fdb8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6135,7 +6135,11 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 	/* if NO_CANCEL is set, we must still run the work */
 	if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
 	    IO_WQ_WORK_CANCEL) {
-		ret = -ECANCELED;
+		/* io-wq is going to take down one */
+		refcount_inc(&req->refs);
+		percpu_ref_get(&req->ctx->refs);
+		io_req_task_work_add_fallback(req, io_req_task_cancel);
+		return io_steal_work(req);
 	}
 
 	if (!ret) {
-- 
Gitee

From 851d3c8e596387cf48e9d62fdfc153cdd7e45e17 Mon Sep 17 00:00:00 2001
From: Wang Wensheng
Date: Mon, 15 Nov 2021 19:50:29 +0800
Subject: [PATCH 083/134] ALSA: timer: Fix use-after-free problem

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://lore.kernel.org/all/3b02dd76-d952-e38e-bc0c-c8a121919720@huawei.com/T/#u

Signed-off-by: Yu Changchun

-------------------------------------------------

When the timer instance was added to the ack_list but was not currently
being processed, the user could stop it via snd_timer_stop1() without
deleting it from the ack_list. The user could then free the timer
instance, and when it was actually processed a use-after-free occurred.

This issue could be reproduced via the snd_timer01 testcase in LTP -
running several instances of that testcase at the same time.
What I actually hit was that the timer's ack_list was corrupted and the
kernel went into an infinite loop with interrupts disabled. That could
be detected by the hardlockup detector on real hardware; on qemu, gdb
can be used to dump the ack_list once the console stops responding.

To fix this issue, we delete the timer instance from the ack_list and
the active_list unconditionally in snd_timer_stop1().

Signed-off-by: Wang Wensheng
Suggested-by: Takashi Iwai
Reviewed-by: Weilong Chen
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 sound/core/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sound/core/timer.c b/sound/core/timer.c
index c15c8314671b..963ddd9e7715 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -624,13 +624,13 @@ static int snd_timer_stop1(struct snd_timer_instance *timeri, bool stop)
 	if (!timer)
 		return -EINVAL;
 	spin_lock_irqsave(&timer->lock, flags);
+	list_del_init(&timeri->ack_list);
+	list_del_init(&timeri->active_list);
 	if (!(timeri->flags & (SNDRV_TIMER_IFLG_RUNNING |
 			       SNDRV_TIMER_IFLG_START))) {
 		result = -EBUSY;
 		goto unlock;
 	}
-	list_del_init(&timeri->ack_list);
-	list_del_init(&timeri->active_list);
 	if (timer->card && timer->card->shutdown)
 		goto unlock;
 	if (stop) {
-- 
Gitee

From 2ac73854cb405bbb631ab4ad0f3acab050494770 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:52:11 +0800
Subject: [PATCH 084/134] blk-cgroup: synchronize blkg creation against policy
 deactivation

mainline inclusion
from mainline
commit 0c9d338c8443b06da8e8d3bfce824c5ea6d3488f
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c9d338c8443b06da8e8d3bfce824c5ea6d3488f

Signed-off-by: Yu Changchun

---------------------------

Our test reports a null pointer dereference:

[ 168.534653] ==================================================================
[ 168.535614] Disabling lock debugging due to kernel taint
[ 168.536346] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 168.537274] #PF: supervisor read access in kernel mode
[ 168.537964] #PF: error_code(0x0000) - not-present page
[ 168.538667] PGD 0 P4D 0
[ 168.539025] Oops: 0000 [#1] PREEMPT SMP KASAN
[ 168.539656] CPU: 13 PID: 759 Comm: bash Tainted: G B 5.15.0-rc2-next-202100
[ 168.540954] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_0738364
[ 168.542736] RIP: 0010:bfq_pd_init+0x88/0x1e0
[ 168.543318] Code: 98 00 00 00 e8 c9 e4 5b ff 4c 8b 65 00 49 8d 7c 24 08 e8 bb e4 5b ff 4d0
[ 168.545803] RSP: 0018:ffff88817095f9c0 EFLAGS: 00010002
[ 168.546497] RAX: 0000000000000001 RBX: ffff888101a1c000 RCX: 0000000000000000
[ 168.547438] RDX: 0000000000000003 RSI: 0000000000000002 RDI: ffff888106553428
[ 168.548402] RBP: ffff888106553400 R08: ffffffff961bcaf4 R09: 0000000000000001
[ 168.549365] R10: ffffffffa2e16c27 R11: fffffbfff45c2d84 R12: 0000000000000000
[ 168.550291] R13: ffff888101a1c098 R14: ffff88810c7a08c8 R15: ffffffffa55541a0
[ 168.551221] FS: 00007fac75227700(0000) GS:ffff88839ba80000(0000) knlGS:0000000000000000
[ 168.552278] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 168.553040] CR2: 0000000000000008 CR3: 0000000165ce7000 CR4: 00000000000006e0
[ 168.554000] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 168.554929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 168.555888] Call Trace:
[ 168.556221]
[ 168.556510]  blkg_create+0x1c0/0x8c0
[ 168.556989]  blkg_conf_prep+0x574/0x650
[ 168.557502]  ? stack_trace_save+0x99/0xd0
[ 168.558033]  ? blkcg_conf_open_bdev+0x1b0/0x1b0
[ 168.558629]  tg_set_conf.constprop.0+0xb9/0x280
[ 168.559231]  ? kasan_set_track+0x29/0x40
[ 168.559758]  ? kasan_set_free_info+0x30/0x60
[ 168.560344]  ? tg_set_limit+0xae0/0xae0
[ 168.560853]  ? do_sys_openat2+0x33b/0x640
[ 168.561383]  ? do_sys_open+0xa2/0x100
[ 168.561877]  ? __x64_sys_open+0x4e/0x60
[ 168.562383]  ? __kasan_check_write+0x20/0x30
[ 168.562951]  ? copyin+0x48/0x70
[ 168.563390]  ? _copy_from_iter+0x234/0x9e0
[ 168.563948]  tg_set_conf_u64+0x17/0x20
[ 168.564467]  cgroup_file_write+0x1ad/0x380
[ 168.565014]  ? cgroup_file_poll+0x80/0x80
[ 168.565568]  ? __mutex_lock_slowpath+0x30/0x30
[ 168.566165]  ? pgd_free+0x100/0x160
[ 168.566649]  kernfs_fop_write_iter+0x21d/0x340
[ 168.567246]  ? cgroup_file_poll+0x80/0x80
[ 168.567796]  new_sync_write+0x29f/0x3c0
[ 168.568314]  ? new_sync_read+0x410/0x410
[ 168.568840]  ? __handle_mm_fault+0x1c97/0x2d80
[ 168.569425]  ? copy_page_range+0x2b10/0x2b10
[ 168.570007]  ? _raw_read_lock_bh+0xa0/0xa0
[ 168.570622]  vfs_write+0x46e/0x630
[ 168.571091]  ksys_write+0xcd/0x1e0
[ 168.571563]  ? __x64_sys_read+0x60/0x60
[ 168.572081]  ? __kasan_check_write+0x20/0x30
[ 168.572659]  ? do_user_addr_fault+0x446/0xff0
[ 168.573264]  __x64_sys_write+0x46/0x60
[ 168.573774]  do_syscall_64+0x35/0x80
[ 168.574264]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[ 168.574960] RIP: 0033:0x7fac74915130
[ 168.575456] Code: 73 01 c3 48 8b 0d 58 ed 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 444
[ 168.577969] RSP: 002b:00007ffc3080e288 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 168.578986] RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007fac74915130
[ 168.579937] RDX: 0000000000000009 RSI: 000056007669f080 RDI: 0000000000000001
[ 168.580884] RBP: 000056007669f080 R08: 000000000000000a R09: 00007fac75227700
[ 168.581841] R10: 000056007655c8f0 R11: 0000000000000246 R12: 0000000000000009
[ 168.582796] R13: 0000000000000001 R14: 00007fac74be55e0 R15: 00007fac74be08c0
[ 168.583757]
[ 168.584063] Modules linked in:
[ 168.584494] CR2: 0000000000000008
[ 168.584964] ---[ end trace 2475611ad0f77a1a ]---

This is because blkg_alloc() is called from blkg_conf_prep() without
holding 'q->queue_lock', and the elevator is exited before
blkg_create():

thread 1                            thread 2
blkg_conf_prep
 spin_lock_irq(&q->queue_lock);
 blkg_lookup_check -> return NULL
 spin_unlock_irq(&q->queue_lock);
 blkg_alloc
  blkcg_policy_enabled -> true
  pd = ->pd_alloc_fn
  blkg->pd[i] = pd
                                    blk_mq_exit_sched
                                     bfq_exit_queue
                                      blkcg_deactivate_policy
                                       spin_lock_irq(&q->queue_lock);
                                       __clear_bit(pol->plid, q->blkcg_pols);
                                       spin_unlock_irq(&q->queue_lock);
                                     q->elevator = NULL;
 spin_lock_irq(&q->queue_lock);
 blkg_create
  if (blkg->pd[i])
   ->pd_init_fn -> q->elevator is NULL
 spin_unlock_irq(&q->queue_lock);

Because blkcg_deactivate_policy() requires the queue to be frozen, we
can grab q_usage_counter to synchronize blkg_conf_prep() against
blkcg_deactivate_policy().

Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Signed-off-by: Yu Kuai
Acked-by: Tejun Heo
Link: https://lore.kernel.org/r/20211020014036.2141723-1-yukuai3@huawei.com
Signed-off-by: Jens Axboe

Conflict:
	block/blk-cgroup.c
	- commit ed6cddefdfd3 ("block: convert the rest of block to
	  bdev_get_queue") is not backported.
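In sketch form, the synchronization the fix relies on (mirroring the
hunk below): a queue reference keeps q_usage_counter elevated, so
blkcg_deactivate_policy(), which needs the queue frozen, cannot run
concurrently with the prep work.

	ret = blk_queue_enter(q, 0);
	if (ret)
		return ret;
	/* ... look up / create the blkg under q->queue_lock ... */
	blk_queue_exit(q);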
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/blk-cgroup.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5b19665bc486..37a5dbd2c4e4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -620,6 +620,14 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 
 	q = disk->queue;
 
+	/*
+	 * blkcg_deactivate_policy() requires queue to be frozen, we can grab
+	 * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
+	 */
+	ret = blk_queue_enter(q, 0);
+	if (ret)
+		return ret;
+
 	rcu_read_lock();
 	spin_lock_irq(&q->queue_lock);
 
@@ -689,6 +697,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		goto success;
 	}
 success:
+	blk_queue_exit(q);
 	ctx->disk = disk;
 	ctx->blkg = blkg;
 	ctx->body = input;
@@ -701,6 +710,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	rcu_read_unlock();
 fail:
 	put_disk_and_module(disk);
+	blk_queue_exit(q);
 	/*
 	 * If queue was bypassing, we should retry.  Do so after a
 	 * short msleep().  It isn't strictly necessary but queue
-- 
Gitee

From 1249d6447661fab8737fc872a500116c622b349a Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 15 Nov 2021 19:52:12 +0800
Subject: [PATCH 085/134] blk-cgroup: fix missing put device in error path
 from blkg_conf_pref()

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

If blk_queue_enter() fails because the queue is dying,
blkdev_put_no_open() is needed because blkcg_conf_open_bdev()
succeeded.

Fixes: 0c9d338c8443 ("blk-cgroup: synchronize blkg creation against policy deactivation")
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/blk-cgroup.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 37a5dbd2c4e4..40f15807efec 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -626,7 +626,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	 */
 	ret = blk_queue_enter(q, 0);
 	if (ret)
-		return ret;
+		goto fail;
 
 	rcu_read_lock();
 	spin_lock_irq(&q->queue_lock);
@@ -662,13 +662,13 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 			new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
 			if (unlikely(!new_blkg)) {
 				ret = -ENOMEM;
-				goto fail;
+				goto fail_exit_queue;
 			}
 
 			if (radix_tree_preload(GFP_KERNEL)) {
 				blkg_free(new_blkg);
 				ret = -ENOMEM;
-				goto fail;
+				goto fail_exit_queue;
 			}
 
 			rcu_read_lock();
@@ -708,9 +708,10 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 fail_unlock:
 	spin_unlock_irq(&q->queue_lock);
 	rcu_read_unlock();
+fail_exit_queue:
+	blk_queue_exit(q);
 fail:
 	put_disk_and_module(disk);
 	/*
 	 * If queue was bypassing, we should retry.  Do so after a
 	 * short msleep().  It isn't strictly necessary but queue
-- 
Gitee

From 0a75f7dd4ebd75263f33680d7dd6c8451f48757b Mon Sep 17 00:00:00 2001
From: Zhihao Cheng
Date: Mon, 15 Nov 2021 19:52:13 +0800
Subject: [PATCH 086/134] ubifs: rename_whiteout: Fix double free for
 whiteout_ui->data

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

'whiteout_ui->data' will be freed twice if the space budget fails for a
rename whiteout operation, as in the following process:

rename_whiteout
  dev = kmalloc
  whiteout_ui->data = dev
  kfree(whiteout_ui->data)  // Free first time
  iput(whiteout)
    ubifs_free_inode
      kfree(ui->data)       // Double free!

KASAN reports:
==================================================================
BUG: KASAN: double-free or invalid-free in ubifs_free_inode+0x4f/0x70

Call Trace:
  kfree+0x117/0x490
  ubifs_free_inode+0x4f/0x70 [ubifs]
  i_callback+0x30/0x60
  rcu_do_batch+0x366/0xac0
  __do_softirq+0x133/0x57f

Allocated by task 1506:
  kmem_cache_alloc_trace+0x3c2/0x7a0
  do_rename+0x9b7/0x1150 [ubifs]
  ubifs_rename+0x106/0x1f0 [ubifs]
  do_syscall_64+0x35/0x80

Freed by task 1506:
  kfree+0x117/0x490
  do_rename.cold+0x53/0x8a [ubifs]
  ubifs_rename+0x106/0x1f0 [ubifs]
  do_syscall_64+0x35/0x80

The buggy address belongs to the object at ffff88810238bed8 which
belongs to the cache kmalloc-8 of size 8
==================================================================

Let ubifs_free_inode() free 'whiteout_ui->data'. BTW, delete the unused
assignment 'whiteout_ui->data_len = 0'; the process 'ubifs_evict_inode()
-> ubifs_jnl_delete_inode() -> ubifs_jnl_write_inode()' doesn't need it
(because 'inc_nlink(whiteout)' won't be executed by 'goto out_release',
and the nlink of the whiteout inode is 0).

Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT")
Signed-off-by: Zhihao Cheng
Reviewed-by: Zhang Yi
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 fs/ubifs/dir.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ad90a3a64293..48dbd91d8b0e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1423,8 +1423,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 		err = ubifs_budget_space(c, &wht_req);
 		if (err) {
-			kfree(whiteout_ui->data);
-			whiteout_ui->data_len = 0;
 			iput(whiteout);
 			goto out_release;
 		}
-- 
Gitee

From 5c176c8ffe591fd09fed83d7cb9a42e8cd14d72d Mon Sep 17 00:00:00 2001
From: Zhihao Cheng
Date: Mon, 15 Nov 2021 19:52:14 +0800
Subject: [PATCH 087/134] ubifs: Fix deadlock in concurrent rename whiteout
 and inode writeback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

The following hung tasks:

[ 77.028764] task:kworker/u8:4 state:D stack: 0 pid: 132
[ 77.028820] Call Trace:
[ 77.029027] schedule+0x8c/0x1b0
[ 77.029067] mutex_lock+0x50/0x60
[ 77.029074] ubifs_write_inode+0x68/0x1f0 [ubifs]
[ 77.029117] __writeback_single_inode+0x43c/0x570
[ 77.029128] writeback_sb_inodes+0x259/0x740
[ 77.029148] wb_writeback+0x107/0x4d0
[ 77.029163] wb_workfn+0x162/0x7b0

[ 92.390442] task:aa state:D stack: 0 pid: 1506
[ 92.390448] Call Trace:
[ 92.390458] schedule+0x8c/0x1b0
[ 92.390461] wb_wait_for_completion+0x82/0xd0
[ 92.390469] __writeback_inodes_sb_nr+0xb2/0x110
[ 92.390472] writeback_inodes_sb_nr+0x14/0x20
[ 92.390476] ubifs_budget_space+0x705/0xdd0 [ubifs]
[ 92.390503] do_rename.cold+0x7f/0x187 [ubifs]
[ 92.390549] ubifs_rename+0x8b/0x180 [ubifs]
[ 92.390571] vfs_rename+0xdb2/0x1170
vfs_rename+0xdb2/0x1170 [ 92.390580] do_renameat2+0x554/0x770, are caused by concurrent rename whiteout and inode writeback processes: rename_whiteout(Thread 1) wb_workfn(Thread2) ubifs_rename do_rename lock_4_inodes (Hold ui_mutex) ubifs_budget_space make_free_space shrink_liability __writeback_inodes_sb_nr bdi_split_work_to_wbs (Queue new wb work) wb_do_writeback(wb work) __writeback_single_inode ubifs_write_inode LOCK(ui_mutex) ↑ wb_wait_for_completion (Wait wb work) <-- deadlock! Reproducer (detailed program in [Link]): 1. SYS_renameat2("/mp/dir/file", "/mp/dir/whiteout", RENAME_WHITEOUT) 2. Consume all free space before the kernel (mdelay) does the budget for the whiteout Fix it by doing the whiteout space budget before locking the ubifs inodes. BTW, it also fixes the wrong goto tag 'out_release' in the whiteout budget error handling path (it should at least recover the dir i_size and unlock the 4 ubifs inodes). Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214733 Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 48dbd91d8b0e..e0b2dbc36517 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1322,6 +1322,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_WHITEOUT) { union ubifs_dev_desc *dev = NULL; + struct ubifs_budget_req wht_req; dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) { @@ -1343,6 +1344,20 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); + + memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); + wht_req.dirtied_ino = 1; + wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8); + /* + * To avoid deadlock between space budget (holds ui_mutex and + * waits wb work) and writeback work(waits ui_mutex), do space + * budget before ubifs inodes locked. + */ + err = ubifs_budget_space(c, &wht_req); + if (err) { + iput(whiteout); + goto out_release; + } } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); @@ -1417,16 +1432,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); if (unlink && IS_SYNC(new_inode)) sync = 1; - } - - if (whiteout) { - struct ubifs_budget_req wht_req = { .dirtied_ino = 1, .dirtied_ino_d = \ ALIGN(ubifs_inode(whiteout)->data_len, 8) }; - - err = ubifs_budget_space(c, &wht_req); - if (err) { - iput(whiteout); - goto out_release; - } - inc_nlink(whiteout); mark_inode_dirty(whiteout); -- Gitee From a7932d34e546c17056949d064c43d1aba34f2477 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:15 +0800 Subject: [PATCH 088/134] ubifs: Fix wrong number of inodes locked by ui_mutex in ubifs_inode comment maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Since 9ec64962afb1702f75b("ubifs: Implement RENAME_EXCHANGE") and 9e0a1fff8db56eaaebb("ubifs: Implement RENAME_WHITEOUT") were applied, ubifs_rename locks and changes 4 ubifs inodes; correct the comment for ui_mutex in ubifs_inode.
Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/ubifs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index daad22263cb2..9a4a3191ed07 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -372,7 +372,7 @@ struct ubifs_gced_idx_leb { * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS * operation. This way UBIFS makes sure the inode fields are consistent. For - * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * example, in 'ubifs_rename()' we change 4 inodes simultaneously, and * write-back must not write any of them before we have finished. * * The second reason is budgeting - UBIFS has to budget all operations. If an -- Gitee From 7b9e731eedbefc98a85c752f411026cfb400ca75 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:16 +0800 Subject: [PATCH 089/134] ubifs: Add missing iput if do_tmpfile() failed in rename whiteout maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- The whiteout inode should be put when do_tmpfile() fails after the inode has been initialized. Otherwise we will get the following warning during umount: UBIFS error (ubi0:0 pid 1494): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth == 0, in fs/ubifs/super.c:1930 VFS: Busy inodes after unmount of ubifs. Self-destruct in 5 seconds. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e0b2dbc36517..9888858bb9e5 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1332,6 +1332,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); if (err) { + if (whiteout) + iput(whiteout); kfree(dev); goto out_release; } -- Gitee From 2c7f9ab9ec94878c05d9ee9a9d37068639b16d39 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:17 +0800 Subject: [PATCH 090/134] ubifs: Rename whiteout atomically maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Currently, rename whiteout has 3 steps: 1. create tmpfile (which associates the old dentry to the tmpfile inode) for whiteout, and store the tmpfile to disk 2. link whiteout, associate the whiteout inode to the old dentry again and store the old dentry, old inode, new dentry on disk 3. writeback the dirty whiteout inode to disk A sudden power-cut or an error (e.g. ENOSPC returned by budget, memory allocation failure) during the above steps may cause various problems: Problem 1: ENOSPC returned by the whiteout space budget (before step 2), the old dentry will disappear after the rename syscall, and the whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, RENAME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, the whiteout inode with 'nlink=1' is not stored on disk, the whiteout dentry (old dentry) is written on disk, and the whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on the whiteout file).
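For reference, the whiteout rename in the sketches above can be driven from user space with a minimal program like the one below (a sketch only; the mount point path and the fallback RENAME_WHITEOUT definition are assumptions, not taken from the original reproducer):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef RENAME_WHITEOUT
	#define RENAME_WHITEOUT (1 << 2)	/* value from include/uapi/linux/fs.h */
	#endif

	int main(void)
	{
		/* assumes a UBIFS mount at /mp with an existing dir/file */
		long ret = syscall(SYS_renameat2, AT_FDCWD, "/mp/dir/file",
				   AT_FDCWD, "/mp/dir/whiteout", RENAME_WHITEOUT);

		if (ret)
			perror("renameat2");
		return ret ? 1 : 0;
	}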
Now, we use the following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Relying on ubifs_jnl_rename() to write the in-mem inode on disk finishes the rename whiteout atomically, which avoids intermediate disk states caused by sudden power-cuts or errors. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 144 +++++++++++++++++++++++++++++---------------- fs/ubifs/journal.c | 52 +++++++++++++--- 2 files changed, 136 insertions(+), 60 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 9888858bb9e5..5d64d829eda1 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -348,8 +348,56 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -static int do_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err; + umode_t mode = S_IFCHR | WHITEOUT_MODE; + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm; + + /* + * Create an inode('nlink = 1') for whiteout without updating journal, + * let ubifs_jnl_rename() store it on flash to complete rename whiteout + * atomically. + */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + return ERR_PTR(err); + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + /* The dir size is updated by do_rename. */ + insert_inode_hash(inode); + + return inode; + +out_inode: + make_bad_inode(inode); + iput(inode); +out_free: + fscrypt_free_filename(&nm); + ubifs_err(c, "cannot create whiteout file, error %d", err); + return ERR_PTR(err); +} + +static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, + umode_t mode) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; @@ -391,25 +439,13 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, } ui = ubifs_inode(inode); - if (whiteout) { - init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); - ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); - } - err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) goto out_inode; mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - - if (whiteout) { - mark_inode_dirty(inode); - drop_nlink(inode); - *whiteout = inode; - } else { - d_tmpfile(dentry, inode); - } + d_tmpfile(dentry, inode); ubifs_assert(c, ui->dirty); instantiated = 1; @@ -440,12 +476,6 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, return err; } -static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode) -{ - return do_tmpfile(dir, dentry, mode, NULL); -} - /** * vfs_dent_type - get VFS directory entry type.
* @type: UBIFS directory entry type @@ -1262,17 +1292,19 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct ubifs_budget_req wht_req; struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; /* - * Budget request settings: deletion direntry, new direntry, removing - * the old inode, and changing old and new parent directory inodes. + * Budget request settings: + * req: deletion direntry, new direntry, removing the old inode, + * and changing old and new parent directory inodes. * - * However, this operation also marks the target inode as dirty and - * does not write it, so we allocate budget for the target inode - * separately. + * wht_req: new whiteout inode for RENAME_WHITEOUT. + * + * ino_req: marks the target inode as dirty and does not write it. */ dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", @@ -1322,7 +1354,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_WHITEOUT) { union ubifs_dev_desc *dev = NULL; - struct ubifs_budget_req wht_req; dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) { @@ -1330,26 +1361,26 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_release; } - err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); - if (err) { - if (whiteout) - iput(whiteout); + /* + * The whiteout inode without dentry is pinned in memory, + * umount won't happen during rename process because we + * got parent dentry. + */ + whiteout = create_whiteout(old_dir, old_dentry); + if (IS_ERR(whiteout)) { + err = PTR_ERR(whiteout); kfree(dev); goto out_release; } - spin_lock(&whiteout->i_lock); - whiteout->i_state |= I_LINKABLE; - spin_unlock(&whiteout->i_lock); - whiteout_ui = ubifs_inode(whiteout); whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); - wht_req.dirtied_ino = 1; - wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8); + wht_req.new_ino = 1; + wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8); /* * To avoid deadlock between space budget (holds ui_mutex and * waits wb work) and writeback work(waits ui_mutex), do space @@ -1357,6 +1388,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, */ err = ubifs_budget_space(c, &wht_req); if (err) { + /* + * Whiteout inode can not be written on flash by + * ubifs_jnl_write_inode(), because it's neither + * dirty nor zero-nlink. + */ iput(whiteout); goto out_release; } @@ -1431,17 +1467,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); if (unlink && IS_SYNC(new_inode)) sync = 1; - } - - if (whiteout) { - inc_nlink(whiteout); - mark_inode_dirty(whiteout); - - spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; - spin_unlock(&whiteout->i_lock); - - iput(whiteout); + /* + * S_SYNC flag of whiteout inherits from the old_dir, and we + * have already checked the old dir inode. So there is no need + * to check whiteout. 
+ */ } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, @@ -1452,6 +1482,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &req); + if (whiteout) { + ubifs_release_budget(c, &wht_req); + iput(whiteout); + } + mutex_lock(&old_inode_ui->ui_mutex); release = old_inode_ui->dirty; mark_inode_dirty_sync(old_inode); @@ -1460,11 +1495,16 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + /* + * Rename finished here. Although old inode cannot be updated + * on flash, old ctime is not a big problem, don't return err + * code to userspace. + */ + old_inode->i_sb->s_op->write_inode(old_inode, NULL); fscrypt_free_filename(&old_nm); fscrypt_free_filename(&new_nm); - return err; + return 0; out_cancel: if (unlink) { @@ -1485,11 +1525,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, inc_nlink(old_dir); } } + unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { - drop_nlink(whiteout); + ubifs_release_budget(c, &wht_req); iput(whiteout); } - unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); out_release: ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 230717384a38..72586512e51f 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1207,9 +1207,9 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the re-name operation which may involve writing up - * to 4 inodes and 2 directory entries. It marks the written inodes as clean - * and returns zero on success. In case of failure, a negative error code is - * returned. + * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes) + * and 2 directory entries. It marks the written inodes as clean and returns + * zero on success. In case of failure, a negative error code is returned. 
*/ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *old_inode, @@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; - int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0; + int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0; int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); - struct ubifs_inode *new_ui; + struct ubifs_inode *new_ui, *whiteout_ui; u8 hash_old_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_inode[UBIFS_HASH_ARR_SZ]; + u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ]; u8 hash_dent1[UBIFS_HASH_ARR_SZ]; u8 hash_dent2[UBIFS_HASH_ARR_SZ]; @@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } else ilen = 0; + if (whiteout) { + whiteout_ui = ubifs_inode(whiteout); + ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex)); + ubifs_assert(c, whiteout->i_nlink == 1); + ubifs_assert(c, !whiteout_ui->dirty); + wlen = UBIFS_INO_NODE_SZ; + wlen += whiteout_ui->data_len; + } else + wlen = 0; + aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); - len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + + ALIGN(wlen, 8) + ALIGN(plen, 8); if (move) len += plen; @@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, p += ALIGN(ilen, 8); } + if (whiteout) { + pack_inode(c, p, whiteout, 0); + err = ubifs_node_calc_hash(c, p, hash_whiteout_inode); + if (err) + goto out_release; + + p += ALIGN(wlen, 8); + } + if (!move) { pack_inode(c, p, old_dir, 1); err = ubifs_node_calc_hash(c, p, hash_old_dir); @@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (new_inode) ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, new_inode->i_ino); + if (whiteout) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + whiteout->i_ino); } release_head(c, BASEHD); @@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm); if (err) goto out_ro; - - ubifs_delete_orphan(c, whiteout->i_ino); } else { err = ubifs_add_dirt(c, lnum, dlen2); if (err) @@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, offs += ALIGN(ilen, 8); } + if (whiteout) { + ino_key_init(c, &key, whiteout->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, wlen, + hash_whiteout_inode); + if (err) + goto out_ro; + offs += ALIGN(wlen, 8); + } + ino_key_init(c, &key, old_dir->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir); if (err) @@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, new_ui->synced_i_size = new_ui->ui_size; spin_unlock(&new_ui->ui_lock); } + /* + * No need to mark whiteout inode clean. + * Whiteout don't have non-zero size, no need to update + * synced_i_size for whiteout_ui. 
+ */ mark_inode_clean(c, ubifs_inode(old_dir)); if (move) mark_inode_clean(c, ubifs_inode(new_dir)); -- Gitee From 50dc54cd74c25120c2011ae0974d7cf388f8df67 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:18 +0800 Subject: [PATCH 091/134] ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- 'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile(), which may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty'; finally, the dirty space is released twice. open(O_TMPFILE) wb_workfn do_tmpfile ubifs_budget_space(ino_req = { .dirtied_ino = 1}) d_tmpfile // mark inode(tmpfile) dirty ubifs_jnl_update // without holding tmpfile's ui_mutex mark_inode_clean(ui) if (ui->dirty) ubifs_release_dirty_inode_budget(ui) // release first time ubifs_write_inode mutex_lock(&ui->ui_mutex) ubifs_release_dirty_inode_budget(ui) // release second time mutex_unlock(&ui->ui_mutex) ui->dirty = 0 Running generic/476 can easily reproduce the following message (see reproducer in [Link]): UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554 UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to read-only mode, error -22 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: ubifs_ro_mode+0x54/0x60 [ubifs] ubifs_assert_failed+0x4b/0x80 [ubifs] ubifs_release_budget+0x468/0x5a0 [ubifs] ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs] ubifs_write_inode+0x121/0x1f0 [ubifs] ... wb_workfn+0x283/0x7b0 Fix it by holding the tmpfile ubifs inode lock during ubifs_jnl_update(). A similar problem exists in whiteout renaming, but the previous fix ("ubifs: Rename whiteout atomically") has solved that one. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765 Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 60 +++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 5d64d829eda1..270cc23cda47 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -396,6 +396,32 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) return ERR_PTR(err); } +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
+ * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -403,7 +429,7 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, struct ubifs_info *c = dir->i_sb->s_fs_info; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; - struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); + struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; @@ -451,18 +477,18 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, instantiated = 1; mutex_unlock(&ui->ui_mutex); - mutex_lock(&dir_ui->ui_mutex); + lock_2_inodes(dir, inode); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); return 0; out_cancel: - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); out_inode: make_bad_inode(inode); if (!instantiated) @@ -689,32 +715,6 @@ static int ubifs_dir_release(struct inode *dir, struct file *file) return 0; } -/** - * lock_2_inodes - a wrapper for locking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - * - * We do not implement any tricks to guarantee strict lock ordering, because - * VFS has already done it for us on the @i_mutex. So this is just a simple - * wrapper function. - */ -static void lock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); -} - -/** - * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - */ -static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_unlock(&ubifs_inode(inode2)->ui_mutex); - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); -} - static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { -- Gitee From c512b1dd7c983d105b8ef3ad3bcab1ecba314bf1 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:19 +0800 Subject: [PATCH 092/134] ubifs: Rectify space amount budget for mkdir/tmpfile operations maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- UBIFS should make sure the flash has enough space to store dirty (data that is newer than disk) data (in memory); the space budget is designed exactly to do that. If the space budget calculates less data than we need, 'make_reservation()' will do more work (returning -ENOSPC if no free space is left; sometimes we can see "cannot reserve xxx bytes in jhead xxx, error -28" in ubifs error messages) with ubifs inodes locked, which may affect other syscalls. A simple way to decide how much space we need when making a budget: see how much space is needed by 'make_reservation()' in the corresponding ubifs_jnl_xxx() function according to the operation. It's better to report ENOSPC in ubifs_budget_space(), as early as we can.
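For mkdir/tmpfile, reading the rule off ubifs_jnl_update() — it writes the new inode, the new direntry and the dirtied parent directory inode — yields the request below (a sketch of the initializer this patch introduces; the fields come from struct ubifs_budget_req):

	struct ubifs_budget_req req = {
		.new_ino = 1,		/* the new inode itself */
		.new_dent = 1,		/* the direntry pointing at it */
		.dirtied_ino = 1,	/* the changed parent directory inode */
	};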
Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Fixes: 1e51764a3c2ac05 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/dir.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 270cc23cda47..3db1c75fda5b 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -427,15 +427,18 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; /* - * Budget request settings: new dirty inode, new direntry, - * budget for dirtied inode will be released via writeback. + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + * Allocate budget separately for new dirtied inode, the budget will + * be released via writeback. */ dbg_gen("dent '%pd', mode %#hx in dir ino %lu", @@ -977,7 +980,8 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct fscrypt_name nm; /* -- Gitee From d71e5f39a94f557e998f9870294e6c0070884633 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:20 +0800 Subject: [PATCH 093/134] ubifs: setflags: Make dirtied_ino_d 8 bytes aligned maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Make 'ui->data_len' aligned with 8 bytes before it is assigned to dirtied_ino_d. Since 8871d84c8f8b0c6b ("ubifs: convert to fileattr") was applied, 'setflags()' only affects regular files and directories; only the xattr inode, symlink inode and special inode (pipe/char_dev/block_dev) have a non-zero 'ui->data_len' field, so the assertion '!(req->dirtied_ino_d & 7)' cannot fail in ubifs_budget_space(). To avoid assertion failures in future evolution (e.g. if setflags can operate on special inodes), it's better to make dirtied_ino_d 8 bytes aligned; after all, the aligned size is still zero for regular files.
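The effect of the alignment can be checked with a small user-space sketch (the kernel's ALIGN() reduces to the usual power-of-two round-up assumed here):

	#include <assert.h>
	#include <stdio.h>

	#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))	/* power-of-two align */

	int main(void)
	{
		unsigned int lens[] = { 0, 1, 7, 8, 9, 25 };

		for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
			unsigned int d = ALIGN(lens[i], 8U);

			assert(!(d & 7));	/* mirrors '!(req->dirtied_ino_d & 7)' */
			printf("data_len=%2u -> dirtied_ino_d=%2u\n", lens[i], d);
		}
		return 0;
	}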
Fixes: 1e51764a3c2ac05a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 4363d85a3fd4..fb0fc0e6be99 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -107,7 +107,7 @@ static int setflags(struct inode *inode, int flags) struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) -- Gitee From 99cc5a764d3fad6230d7bc3c909adf1afa600e69 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:21 +0800 Subject: [PATCH 094/134] ubifs: Fix read out-of-bounds in ubifs_wbuf_write_nolock() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Function ubifs_wbuf_write_nolock() may access buf out of bounds in the following process: ubifs_wbuf_write_nolock(): aligned_len = ALIGN(len, 8); // Assume len = 4089, aligned_len = 4096 if (aligned_len <= wbuf->avail) ... // Not satisfied if (wbuf->used) { ubifs_leb_write() // Fill some data in avail wbuf len -= wbuf->avail; // len is still not 8-bytes aligned aligned_len -= wbuf->avail; } n = aligned_len >> c->max_write_shift; if (n) { n <<= c->max_write_shift; err = ubifs_leb_write(c, wbuf->lnum, buf + written, wbuf->offs, n); // n > len, read out of bounds less than 8 (n-len) bytes } , which can be caught by KASAN: ========================================================= BUG: KASAN: slab-out-of-bounds in ecc_sw_hamming_calculate+0x1dc/0x7d0 Read of size 4 at addr ffff888105594ff8 by task kworker/u8:4/128 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: kasan_report.cold+0x81/0x165 nand_write_page_swecc+0xa9/0x160 ubifs_leb_write+0xf2/0x1b0 [ubifs] ubifs_wbuf_write_nolock+0x421/0x12c0 [ubifs] write_head+0xdc/0x1c0 [ubifs] ubifs_jnl_write_inode+0x627/0x960 [ubifs] wb_workfn+0x8af/0xb80 Function ubifs_wbuf_write_nolock() accepts that parameter 'len' is not 8 bytes aligned; the 'len' represents the true length of buf (which is allocated in 'ubifs_jnl_xxx', eg. ubifs_jnl_write_inode), so ubifs_wbuf_write_nolock() must handle the length read from 'buf' carefully to write leb safely. Fetch a reproducer in [Link]. Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214785 Reported-by: Chengsong Ke Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/io.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 937bd3fdad40..997450410408 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -833,16 +833,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->max_write_shift; + int m = n - 1; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); + + if (m) { + /* '(n-1) << c->max_write_shift < len' is always true.
*/ + m <<= c->max_write_shift; + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, m); + if (err) + goto out; + wbuf->offs += m; + aligned_len -= m; + len -= m; + written += m; + } + + /* + * The non-written len of buf may be less than 'n' because + * parameter 'len' is not 8 bytes aligned, so here we read + * min(len, n) bytes from buf. + */ + n = 1 << c->max_write_shift; + memcpy(wbuf->buf, buf + written, min(len, n)); + if (n > len) { + ubifs_assert(c, n - len < 8); + ubifs_pad(c, wbuf->buf + len, n - len); + } + + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n); if (err) goto out; wbuf->offs += n; aligned_len -= n; - len -= n; + len -= min(len, n); written += n; } -- Gitee From 4cc4bc1aa03a521e5dc08480586942de0b52c655 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:22 +0800 Subject: [PATCH 095/134] ubifs: Fix to add refcount once page is set private maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- MM defined the rule [1] very clearly that once page was set with PG_private flag, we should increment the refcount in that page, also main flows like pageout(), migrate_page() will assume there is one additional page reference count if page_has_private() returns true. Otherwise, we may get a BUG in page migration: page:0000000080d05b9d refcount:-1 mapcount:0 mapping:000000005f4d82a8 index:0xe2 pfn:0x14c12 aops:ubifs_file_address_operations [ubifs] ino:8f1 dentry name:"f30e" flags: 0x1fffff80002405(locked|uptodate|owner_priv_1|private|node=0| zone=1|lastcpupid=0x1fffff) page dumped because: VM_BUG_ON_PAGE(page_count(page) != 0) ------------[ cut here ]------------ kernel BUG at include/linux/page_ref.h:184! invalid opcode: 0000 [#1] SMP CPU: 3 PID: 38 Comm: kcompactd0 Not tainted 5.15.0-rc5 RIP: 0010:migrate_page_move_mapping+0xac3/0xe70 Call Trace: ubifs_migrate_page+0x22/0xc0 [ubifs] move_to_new_page+0xb4/0x600 migrate_pages+0x1523/0x1cc0 compact_zone+0x8c5/0x14b0 kcompactd+0x2bc/0x560 kthread+0x18c/0x1e0 ret_from_fork+0x1f/0x30 The BUG is caused by following process: PA(cpu 1) PB(cpu 2) ubifs_write_begin page = grab_cache_page_write_begin (refcnf = 3, for page creation process) ubifs_write_end SetPagePrivate(page) unlock_page(page) // refcnt=3 put_page(page) page_ref_dec_and_test lock(page) ... ubifs_migrate_page migrate_page_move_mapping expected_page_refs get 3 (1 + mapping[1] + private[1]) page_ref_freeze // refcnt = 0 atomic_dec_and_test(0 - 1 = -1) page_ref_unfreeze VM_BUG_ON_PAGE(-1 != 0, page) Actually zhangjun has tried to fix this problem [2] by recalculating page refcnt in ubifs_migrate_page(). It's better to follow MM rules [1], because just like Kirill suggested in [2], we need to check all users of page_has_private() helper. Like f2fs does in [3], fix it by adding/deleting refcount when setting/clearing private for a page. BTW, according to [4], we set 'page->private' as 1 because ubifs just simply SetPagePrivate(). And, [5] provided a common helper to set/clear page private, ubifs can use this helper following the example of iomap, afs, btrfs, etc. Jump [6] to find a reproducer. 
[1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com [2] https://www.spinics.net/lists/linux-mtd/msg04018.html [3] http://lkml.iu.edu/hypermail/linux/kernel/1903.0/03313.html [4] https://lore.kernel.org/linux-f2fs-devel/20210422154705.GO3596236@casper.infradead.org [5] https://lore.kernel.org/all/20200517214718.468-1-guoqing.jiang@cloud.ionos.com [6] https://bugzilla.kernel.org/show_bug.cgi?id=214961 Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index d44c8e14810c..df46f2d3ff8b 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } if (!PagePrivate(page)) { - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } @@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len) release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); kunmap(page); @@ -1303,7 +1303,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); } @@ -1470,8 +1470,8 @@ static int ubifs_migrate_page(struct address_space *mapping, return rc; if (PagePrivate(page)) { - ClearPagePrivate(page); - SetPagePrivate(newpage); + detach_page_private(page); + attach_page_private(newpage, (void *)1); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -1495,7 +1495,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) return 0; ubifs_assert(c, PagePrivate(page)); ubifs_assert(c, 0); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); return 1; } @@ -1566,7 +1566,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) else { if (!PageChecked(page)) ubifs_convert_page_budget(c); - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } -- Gitee From 71cb569c7a438305161e0ac6b6967d410bbb7aad Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:23 +0800 Subject: [PATCH 096/134] ubi: fastmap: Return error code if memory allocation fails in add_aeb() maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Abort fastmap scanning and return error code if memory allocation fails in add_aeb(). Otherwise ubi will get wrong peb statistics information after scanning. 
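The shape of the fix is the usual propagate-don't-ignore pattern; a stand-alone sketch with toy types (not UBI's real structures) shows why swallowing the failure skews the statistics:

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	static int add_entry(int **slot, int value)
	{
		*slot = malloc(sizeof(**slot));	/* stands in for add_aeb() */
		if (!*slot)
			return -ENOMEM;
		**slot = value;
		return 0;
	}

	int main(void)
	{
		int *entries[3] = { NULL };
		int found = 0;

		for (int i = 0; i < 3; i++) {
			int err = add_entry(&entries[i], i);
			if (err) {
				/* was: silently continue, leaving 'found' wrong */
				fprintf(stderr, "scan aborted: %d\n", err);
				return 1;
			}
			found++;
		}
		printf("found %d entries\n", found);
		for (int i = 0; i < 3; i++)
			free(entries[i]);
		return 0;
	}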
Fixes: dbb7d2a88d2a7b ("UBI: Add fastmap core") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/mtd/ubi/fastmap.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index 022af59906aa..6b5f1ffd961b 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -468,7 +468,9 @@ static int scan_pool(struct ubi_device *ubi, struct ubi_attach_info *ai, if (err == UBI_IO_FF_BITFLIPS) scrub = 1; - add_aeb(ai, free, pnum, ec, scrub); + ret = add_aeb(ai, free, pnum, ec, scrub); + if (ret) + goto out; continue; } else if (err == 0 || err == UBI_IO_BITFLIPS) { dbg_bld("Found non empty PEB:%i in pool", pnum); @@ -638,8 +640,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &ai->free, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 0); + ret = add_aeb(ai, &ai->free, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 0); + if (ret) + goto fail; } /* read EC values from used list */ @@ -649,8 +653,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &used, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 0); + ret = add_aeb(ai, &used, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 0); + if (ret) + goto fail; } /* read EC values from scrub list */ @@ -660,8 +666,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &used, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 1); + ret = add_aeb(ai, &used, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 1); + if (ret) + goto fail; } /* read EC values from erase list */ @@ -671,8 +679,10 @@ static int ubi_attach_fastmap(struct ubi_device *ubi, if (fm_pos >= fm_size) goto fail_bad; - add_aeb(ai, &ai->erase, be32_to_cpu(fmec->pnum), - be32_to_cpu(fmec->ec), 1); + ret = add_aeb(ai, &ai->erase, be32_to_cpu(fmec->pnum), + be32_to_cpu(fmec->ec), 1); + if (ret) + goto fail; } ai->mean_ec = div_u64(ai->ec_sum, ai->ec_count); -- Gitee From 83919316f903d4c261169abe19dd5e1cce0e0b3a Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 15 Nov 2021 19:52:24 +0800 Subject: [PATCH 097/134] ubi: fastmap: Add all fastmap pebs into 'ai->fastmap' when fm->used_blocks>=2 maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Fastmap pebs(pnum >= UBI_FM_MAX_START) won't be added into 'ai->fastmap' while attaching ubi device if 'fm->used_blocks' is greater than 2, which may cause warning from 'ubi_assert(ubi->good_peb_count == found_pebs)': UBI assert failed in ubi_wl_init at 1878 (pid 2409) Call Trace: ubi_wl_init.cold+0xae/0x2af [ubi] ubi_attach+0x1b0/0x780 [ubi] ubi_init+0x23a/0x3ad [ubi] load_module+0x22d2/0x2430 Reproduce: ID="0x20,0x33,0x00,0x00" # 16M 16KB PEB, 512 page modprobe nandsim id_bytes=$ID modprobe ubi mtd="0,0" fm_autoconvert # Fastmap takes 2 pebs rmmod ubi modprobe ubi mtd="0,0" fm_autoconvert # Attach by fastmap Add all fastmap pebs into list 'ai->fastmap' to make sure they can be counted into 'found_pebs'. 
Fixes: fdf10ed710c0aa ("ubi: Rework Fastmap attach base code") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/mtd/ubi/fastmap.c | 41 ++++++++++++++------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index 6b5f1ffd961b..88fdf8f5709f 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -828,24 +828,6 @@ static int find_fm_anchor(struct ubi_attach_info *ai) return ret; } -static struct ubi_ainf_peb *clone_aeb(struct ubi_attach_info *ai, - struct ubi_ainf_peb *old) -{ - struct ubi_ainf_peb *new; - - new = ubi_alloc_aeb(ai, old->pnum, old->ec); - if (!new) - return NULL; - - new->vol_id = old->vol_id; - new->sqnum = old->sqnum; - new->lnum = old->lnum; - new->scrub = old->scrub; - new->copy_flag = old->copy_flag; - - return new; -} - /** * ubi_scan_fastmap - scan the fastmap. * @ubi: UBI device object @@ -875,15 +857,11 @@ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, if (fm_anchor < 0) return UBI_NO_FASTMAP; - /* Copy all (possible) fastmap blocks into our new attach structure. */ + /* Add fastmap blocks(pnum < UBI_FM_MAX_START) into attach structure. */ list_for_each_entry(aeb, &scan_ai->fastmap, u.list) { - struct ubi_ainf_peb *new; - - new = clone_aeb(ai, aeb); - if (!new) - return -ENOMEM; - - list_add(&new->u.list, &ai->fastmap); + ret = add_aeb(ai, &ai->fastmap, aeb->pnum, aeb->ec, 0); + if (ret) + return ret; } down_write(&ubi->fm_protect); @@ -1029,6 +1007,17 @@ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, "err: %i)", i, pnum, ret); goto free_hdr; } + + /* + * Add left fastmap blocks (pnum >= UBI_FM_MAX_START) into + * attach structure. + */ + if (pnum >= UBI_FM_MAX_START) { + ret = add_aeb(ai, &ai->fastmap, pnum, + be64_to_cpu(ech->ec), 0); + if (ret) + goto free_hdr; + } } kfree(fmsb); -- Gitee From 1423dc67e94e9ba71a3120738c171baaa720d7fd Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 15 Nov 2021 19:52:26 +0800 Subject: [PATCH 098/134] ubi: fix race condition between ctrl_cdev_ioctl and ubi_cdev_ioctl maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Hulk Robot reported a KASAN report about use-after-free: ================================================================== BUG: KASAN: use-after-free in __list_del_entry_valid+0x13d/0x160 Read of size 8 at addr ffff888035e37d98 by task ubiattach/1385 [...] Call Trace: klist_dec_and_del+0xa7/0x4a0 klist_put+0xc7/0x1a0 device_del+0x4d4/0xed0 cdev_device_del+0x1a/0x80 ubi_attach_mtd_dev+0x2951/0x34b0 [ubi] ctrl_cdev_ioctl+0x286/0x2f0 [ubi] Allocated by task 1414: device_add+0x60a/0x18b0 cdev_device_add+0x103/0x170 ubi_create_volume+0x1118/0x1a10 [ubi] ubi_cdev_ioctl+0xb7f/0x1ba0 [ubi] Freed by task 1385: cdev_device_del+0x1a/0x80 ubi_remove_volume+0x438/0x6c0 [ubi] ubi_cdev_ioctl+0xbf4/0x1ba0 [ubi] [...] ================================================================== The lock held by ctrl_cdev_ioctl is ubi_devices_mutex, but the lock held by ubi_cdev_ioctl is ubi->device_mutex. Therefore, the two locks can be concurrent. ctrl_cdev_ioctl contains two operations: ubi_attach and ubi_detach. ubi_detach is bug-free because it uses reference counting to prevent concurrency. However, uif_init and uif_close in ubi_attach may race with ubi_cdev_ioctl. uif_init will race with ubi_cdev_ioctl as in the following stack. 
cpu1 cpu2 cpu3 _______________________|________________________|______________________ ctrl_cdev_ioctl ubi_attach_mtd_dev uif_init ubi_cdev_ioctl ubi_create_volume cdev_device_add ubi_add_volume // sysfs exist kill_volumes ubi_cdev_ioctl ubi_remove_volume cdev_device_del // first free ubi_free_volume cdev_del // double free cdev_device_del And uif_close will race with ubi_cdev_ioctl as in the following stack. cpu1 cpu2 cpu3 _______________________|________________________|______________________ ctrl_cdev_ioctl ubi_attach_mtd_dev uif_init ubi_cdev_ioctl ubi_create_volume cdev_device_add ubi_debugfs_init_dev //error goto out_uif; uif_close kill_volumes ubi_cdev_ioctl ubi_remove_volume cdev_device_del // first free ubi_free_volume // double free The cause of this problem is that commit 714fb87e8bc0 makes the device "available" before it becomes accessible via sysfs. Therefore, we roll back that modification. We will fix the race condition between ubi device creation and udev by removing ubi_get_device in vol_attribute_show and dev_attribute_show. This avoids accessing uninitialized ubi_devices[ubi_num]. ubi_get_device is used to prevent devices from being deleted during sysfs execution. However, kernfs now ensures that devices will not be deleted before all reference counts are released. The key process is shown in the following stack. device_del device_remove_attrs device_remove_groups sysfs_remove_groups sysfs_remove_group remove_files kernfs_remove_by_name kernfs_remove_by_name_ns __kernfs_remove kernfs_drain Fixes: 714fb87e8bc0 ("ubi: Fix race condition between ubi device creation and udev") Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Hou Tao Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/mtd/ubi/build.c | 9 +-------- drivers/mtd/ubi/vmt.c | 8 +------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index e85b04e9716b..4153e0d15c5f 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -350,9 +350,6 @@ static ssize_t dev_attribute_show(struct device *dev, * we still can use 'ubi->ubi_num'.
*/ ubi = container_of(dev, struct ubi_device, dev); - ubi = ubi_get_device(ubi->ubi_num); - if (!ubi) - return -ENODEV; if (attr == &dev_eraseblock_size) ret = sprintf(buf, "%d\n", ubi->leb_size); @@ -381,7 +378,6 @@ static ssize_t dev_attribute_show(struct device *dev, else ret = -EINVAL; - ubi_put_device(ubi); return ret; } @@ -980,9 +976,6 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, goto out_detach; } - /* Make device "available" before it becomes accessible via sysfs */ - ubi_devices[ubi_num] = ubi; - err = uif_init(ubi); if (err) goto out_detach; @@ -1027,6 +1020,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); + ubi_devices[ubi_num] = ubi; ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; @@ -1035,7 +1029,6 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, out_uif: uif_close(ubi); out_detach: - ubi_devices[ubi_num] = NULL; ubi_wl_close(ubi); ubi_free_all_volumes(ubi); vfree(ubi->vtbl); diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 139ee132bfbc..1bc7b3a05604 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -56,16 +56,11 @@ static ssize_t vol_attribute_show(struct device *dev, { int ret; struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev); - struct ubi_device *ubi; - - ubi = ubi_get_device(vol->ubi->ubi_num); - if (!ubi) - return -ENODEV; + struct ubi_device *ubi = vol->ubi; spin_lock(&ubi->volumes_lock); if (!ubi->volumes[vol->vol_id]) { spin_unlock(&ubi->volumes_lock); - ubi_put_device(ubi); return -ENODEV; } /* Take a reference to prevent volume removal */ @@ -103,7 +98,6 @@ static ssize_t vol_attribute_show(struct device *dev, vol->ref_count -= 1; ubi_assert(vol->ref_count >= 0); spin_unlock(&ubi->volumes_lock); - ubi_put_device(ubi); return ret; } -- Gitee From b41bd3d6257214d31b5cb2defed92679b58f29ec Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 15 Nov 2021 19:52:27 +0800 Subject: [PATCH 099/134] regulator: core: do not continue if selector match mainline inclusion from mainline-v5.11-rc1 commit ab97800e088acf34d0014845ed93605dd5c1ea2a category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- Do not continue if selector has already been located. 
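Besides skipping useless iterations, the early break also keeps an unreadable voltage at a higher index from turning an already-successful match into -EINVAL. A stand-alone sketch of the fixed loop shape (toy voltage table, not the regulator API):

	#include <errno.h>
	#include <stdio.h>

	/* returns 0 and fills *old_sel/*new_sel, or -EINVAL on a bad entry */
	static int find_selectors(const int *uV, int n, int old_uV, int new_uV,
				  int *old_sel, int *new_sel)
	{
		*old_sel = -1;
		*new_sel = -1;

		for (int i = 0; i < n; i++) {
			if (*old_sel >= 0 && *new_sel >= 0)
				break;		/* both located, stop scanning */
			if (uV[i] < 0)
				return -EINVAL;	/* unreadable voltage */
			if (uV[i] == old_uV)
				*old_sel = i;
			if (uV[i] == new_uV)
				*new_sel = i;
		}
		return 0;
	}

	int main(void)
	{
		const int table[] = { 1800000, 3300000, -1 };	/* last entry bad */
		int o, n;
		int err = find_selectors(table, 3, 1800000, 3300000, &o, &n);

		/* with the break: err=0; without it, the bad entry wins */
		printf("err=%d old_sel=%d new_sel=%d\n", err, o, n);
		return 0;
	}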
Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/1605290164-11556-1-git-send-email-claudiu.beznea@microchip.com Signed-off-by: Mark Brown Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/regulator/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 043b5f63b94a..eead157609e5 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -4058,6 +4058,10 @@ int regulator_set_voltage_time(struct regulator *regulator, for (i = 0; i < rdev->desc->n_voltages; i++) { /* We only look for exact voltage matches here */ + + if (old_sel >= 0 && new_sel >= 0) + break; + voltage = regulator_list_voltage(regulator, i); if (voltage < 0) return -EINVAL; -- Gitee From 0a61e109955311a3f6cd6da224cbfa993a5b842c Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 15 Nov 2021 19:52:28 +0800 Subject: [PATCH 100/134] regulator: core: Respect off_on_delay at startup mainline inclusion from mainline-v5.13-rc1 commit a5ccccb3ec0b052804d03df90c0d08689be54170 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- We currently do not respect off_on_delay the first time we turn on a regulator. This is problematic since the regulator could have been turned off by the bootloader, or it could it have been turned off during the probe of the regulator driver (such as when regulator-fixed requests the enable GPIO), either of which could potentially have happened less than off_on_delay microseconds ago before the first time a client requests for the regulator to be turned on. We can't know exactly when the regulator was turned off, but initialise off_on_delay to the current time when registering the regulator, so that we guarantee that we respect the off_on_delay in all cases. Signed-off-by: Vincent Whitchurch Link: https://lore.kernel.org/r/20210422083044.11479-1-vincent.whitchurch@axis.com Signed-off-by: Mark Brown Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/regulator/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index eead157609e5..a6d27334a71d 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1445,6 +1445,8 @@ static int set_machine_constraints(struct regulator_dev *rdev) if (rdev->constraints->always_on) rdev->use_count++; + } else if (rdev->desc->off_on_delay) { + rdev->last_off_jiffy = jiffies; } print_constraints(rdev); -- Gitee From e05e08ee3100ba69d005446779f2cef261f6b0d4 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 15 Nov 2021 19:52:29 +0800 Subject: [PATCH 101/134] iio: core: Allow drivers to specify a label without it coming from of mainline inclusion from mainline-v5.13-rc1 commit 6f71bf1991b6f04dc87a4f5b9d6823535f51a50d category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- Only set indio_dev->label from of/dt if there actually is a label specified in of. This allows drivers to set a label without this being overwritten with NULL when there is no label specified in of. This is esp. useful on devices where of is not used at all, such as your typical x86/ACPI device. 
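With this change, a driver-provided default label survives registration whenever the firmware node carries no "label" property; a hypothetical driver fragment (device and label names are made up, not from this patch):

	indio_dev->name = "mysensor";
	indio_dev->label = "proximity-front";	/* driver default */
	/* a "label" property, if present, still overrides this */
	ret = devm_iio_device_register(dev, indio_dev);
	if (ret)
		return ret;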
Signed-off-by: Hans de Goede Reviewed-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20210207160901.110643-2-hdegoede@redhat.com Signed-off-by: Jonathan Cameron Signed-off-by: tanghui Reviewed-by: Zhang Qiao Reviewed-by: Chen Hui Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/iio/industrialio-core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 261d3b17edc9..ea98aad9fb81 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1720,6 +1720,7 @@ static const struct iio_buffer_setup_ops noop_ring_setup_ops; int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod) { + const char *label; int ret; if (!indio_dev->info) @@ -1730,8 +1731,9 @@ int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod) if (!indio_dev->dev.of_node && indio_dev->dev.parent) indio_dev->dev.of_node = indio_dev->dev.parent->of_node; - indio_dev->label = of_get_property(indio_dev->dev.of_node, "label", - NULL); + label = of_get_property(indio_dev->dev.of_node, "label", NULL); + if (label) + indio_dev->label = label; ret = iio_check_unique_scan_index(indio_dev); if (ret < 0) -- Gitee From d5f3427bbd432b2a985c0e03bc103a7edd282aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Mon, 15 Nov 2021 19:52:30 +0800 Subject: [PATCH 102/134] iio: buffer: Return error if no callback is given MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.11-rc1 commit 6d74a3ee1ee1c7b62de656c26d370448ed5885c3 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- Return error in case no callback is provided to `iio_channel_get_all_cb()`. There's no point in setting up a buffer-cb if no callback is provided. 
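Callers now get a fast, explicit failure instead of a half-initialized buffer; a hypothetical caller fragment (variable names are made up):

	struct iio_cb_buffer *cb_buff;

	cb_buff = iio_channel_get_all_cb(dev, NULL, priv);	/* no callback */
	if (IS_ERR(cb_buff))
		return PTR_ERR(cb_buff);	/* now -EINVAL, caught early */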
Signed-off-by: Nuno Sá Reviewed-by: Olivier Moysan Link: https://lore.kernel.org/r/20201121161457.957-3-nuno.sa@analog.com Signed-off-by: Jonathan Cameron Signed-off-by: tanghui Reviewed-by: Zhang Qiao Reviewed-by: Chen Hui Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/iio/buffer/industrialio-buffer-cb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iio/buffer/industrialio-buffer-cb.c b/drivers/iio/buffer/industrialio-buffer-cb.c index 47c96f7f4976..4c12b7a94af5 100644 --- a/drivers/iio/buffer/industrialio-buffer-cb.c +++ b/drivers/iio/buffer/industrialio-buffer-cb.c @@ -54,6 +54,11 @@ struct iio_cb_buffer *iio_channel_get_all_cb(struct device *dev, struct iio_cb_buffer *cb_buff; struct iio_channel *chan; + if (!cb) { + dev_err(dev, "Invalid arguments: A callback must be provided!\n"); + return ERR_PTR(-EINVAL); + } + cb_buff = kzalloc(sizeof(*cb_buff), GFP_KERNEL); if (cb_buff == NULL) return ERR_PTR(-ENOMEM); -- Gitee From b4ee88f06395e3cb1d6c14593a162fcfdb9c7929 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 15 Nov 2021 19:53:30 +0800 Subject: [PATCH 103/134] blk-mq: don't free tags if the tag_set is used by other device in queue initialztion mainline inclusion from mainline-v5.16 commit a846a8e6c9a5949582c5a6a8bbc83a7d27fd891e category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a846a8e6c9a5949582c5a6a8bbc83a7d27fd891e Signed-off-by: Yu Changchun ----------------------------------------------- We got UAF report on v5.10 as follows: [ 1446.674930] ================================================================== [ 1446.675970] BUG: KASAN: use-after-free in blk_mq_get_driver_tag+0x9a4/0xa90 [ 1446.676902] Read of size 8 at addr ffff8880185afd10 by task kworker/1:2/12348 [ 1446.677851] [ 1446.678073] CPU: 1 PID: 12348 Comm: kworker/1:2 Not tainted 5.10.0-10177-gc9c81b1e346a #2 [ 1446.679168] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1446.680692] Workqueue: kthrotld blk_throtl_dispatch_work_fn [ 1446.681448] Call Trace: [ 1446.681800] dump_stack+0x9b/0xce [ 1446.682916] print_address_description.constprop.6+0x3e/0x60 [ 1446.685999] kasan_report.cold.9+0x22/0x3a [ 1446.687186] blk_mq_get_driver_tag+0x9a4/0xa90 [ 1446.687785] blk_mq_dispatch_rq_list+0x21a/0x1d40 [ 1446.692576] __blk_mq_do_dispatch_sched+0x394/0x830 [ 1446.695758] __blk_mq_sched_dispatch_requests+0x398/0x4f0 [ 1446.698279] blk_mq_sched_dispatch_requests+0xdf/0x140 [ 1446.698967] __blk_mq_run_hw_queue+0xc0/0x270 [ 1446.699561] __blk_mq_delay_run_hw_queue+0x4cc/0x550 [ 1446.701407] blk_mq_run_hw_queue+0x13b/0x2b0 [ 1446.702593] blk_mq_sched_insert_requests+0x1de/0x390 [ 1446.703309] blk_mq_flush_plug_list+0x4b4/0x760 [ 1446.705408] blk_flush_plug_list+0x2c5/0x480 [ 1446.708471] blk_finish_plug+0x55/0xa0 [ 1446.708980] blk_throtl_dispatch_work_fn+0x23b/0x2e0 [ 1446.711236] process_one_work+0x6d4/0xfe0 [ 1446.711778] worker_thread+0x91/0xc80 [ 1446.713400] kthread+0x32d/0x3f0 [ 1446.714362] ret_from_fork+0x1f/0x30 [ 1446.714846] [ 1446.715062] Allocated by task 1: [ 1446.715509] kasan_save_stack+0x19/0x40 [ 1446.716026] __kasan_kmalloc.constprop.1+0xc1/0xd0 [ 1446.716673] blk_mq_init_tags+0x6d/0x330 [ 1446.717207] blk_mq_alloc_rq_map+0x50/0x1c0 [ 1446.717769] __blk_mq_alloc_map_and_request+0xe5/0x320 [ 1446.718459] blk_mq_alloc_tag_set+0x679/0xdc0 [ 1446.719050] scsi_add_host_with_dma.cold.3+0xa0/0x5db [ 
1446.719736] virtscsi_probe+0x7bf/0xbd0 [ 1446.720265] virtio_dev_probe+0x402/0x6c0 [ 1446.720808] really_probe+0x276/0xde0 [ 1446.721320] driver_probe_device+0x267/0x3d0 [ 1446.721892] device_driver_attach+0xfe/0x140 [ 1446.722491] __driver_attach+0x13a/0x2c0 [ 1446.723037] bus_for_each_dev+0x146/0x1c0 [ 1446.723603] bus_add_driver+0x3fc/0x680 [ 1446.724145] driver_register+0x1c0/0x400 [ 1446.724693] init+0xa2/0xe8 [ 1446.725091] do_one_initcall+0x9e/0x310 [ 1446.725626] kernel_init_freeable+0xc56/0xcb9 [ 1446.726231] kernel_init+0x11/0x198 [ 1446.726714] ret_from_fork+0x1f/0x30 [ 1446.727212] [ 1446.727433] Freed by task 26992: [ 1446.727882] kasan_save_stack+0x19/0x40 [ 1446.728420] kasan_set_track+0x1c/0x30 [ 1446.728943] kasan_set_free_info+0x1b/0x30 [ 1446.729517] __kasan_slab_free+0x111/0x160 [ 1446.730084] kfree+0xb8/0x520 [ 1446.730507] blk_mq_free_map_and_requests+0x10b/0x1b0 [ 1446.731206] blk_mq_realloc_hw_ctxs+0x8cb/0x15b0 [ 1446.731844] blk_mq_init_allocated_queue+0x374/0x1380 [ 1446.732540] blk_mq_init_queue_data+0x7f/0xd0 [ 1446.733155] scsi_mq_alloc_queue+0x45/0x170 [ 1446.733730] scsi_alloc_sdev+0x73c/0xb20 [ 1446.734281] scsi_probe_and_add_lun+0x9a6/0x2d90 [ 1446.734916] __scsi_scan_target+0x208/0xc50 [ 1446.735500] scsi_scan_channel.part.3+0x113/0x170 [ 1446.736149] scsi_scan_host_selected+0x25a/0x360 [ 1446.736783] store_scan+0x290/0x2d0 [ 1446.737275] dev_attr_store+0x55/0x80 [ 1446.737782] sysfs_kf_write+0x132/0x190 [ 1446.738313] kernfs_fop_write_iter+0x319/0x4b0 [ 1446.738921] new_sync_write+0x40e/0x5c0 [ 1446.739429] vfs_write+0x519/0x720 [ 1446.739877] ksys_write+0xf8/0x1f0 [ 1446.740332] do_syscall_64+0x2d/0x40 [ 1446.740802] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1446.741462] [ 1446.741670] The buggy address belongs to the object at ffff8880185afd00 [ 1446.741670] which belongs to the cache kmalloc-256 of size 256 [ 1446.743276] The buggy address is located 16 bytes inside of [ 1446.743276] 256-byte region [ffff8880185afd00, ffff8880185afe00) [ 1446.744765] The buggy address belongs to the page: [ 1446.745416] page:ffffea0000616b00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x185ac [ 1446.746694] head:ffffea0000616b00 order:2 compound_mapcount:0 compound_pincount:0 [ 1446.747719] flags: 0x1fffff80010200(slab|head) [ 1446.748337] raw: 001fffff80010200 ffffea00006a3208 ffffea000061bf08 ffff88801004f240 [ 1446.749404] raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 [ 1446.750455] page dumped because: kasan: bad access detected [ 1446.751227] [ 1446.751445] Memory state around the buggy address: [ 1446.752102] ffff8880185afc00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.753090] ffff8880185afc80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.754079] >ffff8880185afd00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1446.755065] ^ [ 1446.755589] ffff8880185afd80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1446.756574] ffff8880185afe00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.757566] ================================================================== Flag 'BLK_MQ_F_TAG_QUEUE_SHARED' will be set if the second device on the same host initializes it's queue successfully. 
However, if the second device fails to allocate memory in blk_mq_alloc_and_init_hctx(), called from blk_mq_realloc_hw_ctxs() from blk_mq_init_allocated_queue(), then __blk_mq_free_map_and_rqs() will be called on the error path, and if 'BLK_MQ_F_TAG_HCTX_SHARED' is not set, 'tag_set->tags' will be freed while it's still used by the first device.

To fix this issue, move the release of newly allocated hardware contexts from blk_mq_realloc_hw_ctxs() to __blk_mq_update_nr_hw_queues(), since there is no need to release hardware contexts in blk_mq_init_allocated_queue().

Fixes: 868f2f0b7206 ("blk-mq: dynamic h/w context count")
Signed-off-by: Ye Bin
Signed-off-by: Yu Kuai
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20211108074019.1058843-1-yebin10@huawei.com
Signed-off-by: Jens Axboe

conflicts: block/blk-mq.c

Signed-off-by: Ye Bin
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/blk-mq.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b0e7ec85a41c..70301cc6b18d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3239,8 +3239,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx = hctxs[j];
 
 		if (hctx) {
-			if (hctx->tags)
-				blk_mq_free_map_and_requests(set, j);
 			blk_mq_exit_hctx(q, set, hctx, j);
 			hctxs[j] = NULL;
 		}
@@ -3726,8 +3724,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_realloc_hw_ctxs(set, q);
 		if (q->nr_hw_queues != set->nr_hw_queues) {
+			int i = prev_nr_hw_queues;
+
 			pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
 					nr_hw_queues, prev_nr_hw_queues);
+			for (; i < set->nr_hw_queues; i++)
+				blk_mq_free_map_and_requests(set, i);
+
 			set->nr_hw_queues = prev_nr_hw_queues;
 			blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 			goto fallback;
-- 
Gitee

From 9b4f9af2848894f11d76a5f7d2b7b9378a247c33 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Mon, 15 Nov 2021 19:53:31 +0800
Subject: [PATCH 104/134] ARM: 9057/1: cache-v7: add missing ISB after cache level selection

mainline inclusion
from mainline-v5.15-rc2
commit c0e50736e826b51ddc437e6cf0dc68f07e4ad16b
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0e50736e826b51ddc437e6cf0dc68f07e4ad16b

Signed-off-by: Yu Changchun

-----------------------------

A write to CSSELR needs to complete before its results can be observed via CCSIDR. So add an ISB to ensure that this is the case.

Acked-by: Nicolas Pitre
Signed-off-by: Ard Biesheuvel
Signed-off-by: Russell King
Signed-off-by: He Ying
Reviewed-by: Liao Chang
Reviewed-by: Li Wei
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 arch/arm/mm/cache-v7.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index dc8f152f3556..307f381eee71 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -38,9 +38,10 @@ icache_size:
 * procedures.
*/ ENTRY(v7_invalidate_l1) - mov r0, #0 - mcr p15, 2, r0, c0, c0, 0 - mrc p15, 1, r0, c0, c0, 0 + mov r0, #0 + mcr p15, 2, r0, c0, c0, 0 @ select L1 data cache in CSSELR + isb + mrc p15, 1, r0, c0, c0, 0 @ read cache geometry from CCSIDR movw r1, #0x7fff and r2, r1, r0, lsr #13 -- Gitee From c7cf208e5ef89118d70f3f2ea0cdbb84eef8ee99 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 15 Nov 2021 19:53:32 +0800 Subject: [PATCH 105/134] nvme: add APIs for stopping/starting admin queue mainline inclusion from mainline-v5.16 commit a277654bafb51fb8b4cf23550f15926bb02536f4 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a277654bafb51fb8b4cf23550f15926bb02536f4 Signed-off-by: Yu Changchun --------------------------- Add two APIs for stopping and starting admin queue. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211014081710.1871747-2-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/nvme/host/core.c | 12 ++++++++++++ drivers/nvme/host/nvme.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 99b5152482fe..509a44ff0ce5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4634,6 +4634,18 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) +{ + blk_mq_quiesce_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); + +void nvme_start_admin_queue(struct nvme_ctrl *ctrl) +{ + blk_mq_unquiesce_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_start_admin_queue); + void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5dd1dd8021ba..0cad04335e34 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -647,6 +647,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); +void nvme_start_admin_queue(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); -- Gitee From 3f26ab0b99bc2d569db4e4fbc33a7bedafb768f4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 15 Nov 2021 19:53:33 +0800 Subject: [PATCH 106/134] nvme: apply nvme API to quiesce/unquiesce admin queue mainline inclusion from mainline-v5.16 commit 6ca1d9027e0d9ce5604a3e28de89456a76138034 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6ca1d9027e0d9ce5604a3e28de89456a76138034 Signed-off-by: Yu Changchun --------------------------- Apply the added two APIs to quiesce/unquiesce admin queue. 
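The conversion done by this patch is mechanical; a representative before/after sketch of one call site follows (an editor's illustration, the real call sites are in the diffs below):

	/* before: each driver open-codes the block layer calls */
	blk_mq_quiesce_queue(ctrl->admin_q);
	/* ... */
	blk_mq_unquiesce_queue(ctrl->admin_q);

	/* after: one choke point, which a later patch in this series
	 * extends with the NVME_CTRL_ADMIN_Q_STOPPED flag so that the
	 * two calls always pair up */
	nvme_stop_admin_queue(ctrl);
	/* ... */
	nvme_start_admin_queue(ctrl);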
Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211014081710.1871747-3-ming.lei@redhat.com Signed-off-by: Jens Axboe Conflict: commit f21c4769d0de ("nvme: rename nvme_init_identify()") is not backported - drivers/nvme/host/rdma.c - drivers/nvme/host/tcp.c - drivers/nvme/target/loop.c - drivers/nvme/host/fc.c Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 8 ++++---- drivers/nvme/host/pci.c | 8 ++++---- drivers/nvme/host/rdma.c | 14 +++++++------- drivers/nvme/host/tcp.c | 16 ++++++++-------- drivers/nvme/target/loop.c | 4 ++-- 6 files changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 509a44ff0ce5..dd425ae68c07 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4555,7 +4555,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) /* Forcibly unquiesce queues to avoid blocking dispatch */ if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); list_for_each_entry(ns, &ctrl->namespaces, list) nvme_set_queue_dying(ns); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 906cab35afe7..b534a85e2bf1 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2381,7 +2381,7 @@ nvme_fc_ctrl_free(struct kref *ref) list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); blk_cleanup_queue(ctrl->ctrl.admin_q); blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); @@ -2509,7 +2509,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) /* * clean up the admin queue. Same thing as above. */ - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -3098,7 +3098,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << (ilog2(SZ_4K) - 9); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); ret = nvme_init_identify(&ctrl->ctrl); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) @@ -3245,7 +3245,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_fc_free_queue(&ctrl->queues[0]); /* re-enable the admin_q so anything new can fast fail */ - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); /* resume the io queues so that things will fast fail */ nvme_start_queues(&ctrl->ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1b85349f57af..224907d8d5dc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1408,7 +1408,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); + nvme_stop_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); return 0; @@ -1640,7 +1640,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. 
*/ - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1674,7 +1674,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) return -ENODEV; } } else - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); return 0; } @@ -2516,7 +2516,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) if (shutdown) { nvme_start_queues(&dev->ctrl); if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); } mutex_unlock(&dev->shutdown_lock); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 51f4647ea214..5e2c86adeb27 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -918,7 +918,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, else ctrl->ctrl.max_integrity_segments = 0; - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); error = nvme_init_identify(&ctrl->ctrl); if (error) @@ -927,7 +927,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return 0; out_quiesce_queue: - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); @@ -1025,7 +1025,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); if (ctrl->ctrl.admin_tagset) { @@ -1034,7 +1034,7 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); } if (remove) - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl, remove); } @@ -1161,7 +1161,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) nvme_rdma_destroy_io_queues(ctrl, new); } destroy_admin: - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); @@ -1201,7 +1201,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2237,7 +2237,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&ctrl->reconnect_work); nvme_rdma_teardown_io_queues(ctrl, shutdown); - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e99d43989418..c014c5adbac5 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1889,7 +1889,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_stop_queue; - blk_mq_unquiesce_queue(ctrl->admin_q); + 
nvme_start_admin_queue(ctrl); error = nvme_init_identify(ctrl); if (error) @@ -1898,7 +1898,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) return 0; out_quiesce_queue: - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); @@ -1920,7 +1920,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); if (ctrl->admin_tagset) { @@ -1929,7 +1929,7 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); } if (remove) - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1938,7 +1938,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_sync_io_queues(ctrl); @@ -2030,7 +2030,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); @@ -2073,7 +2073,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) /* unquiesce to fail fast pending requests */ nvme_start_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2091,7 +2091,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); nvme_tcp_teardown_io_queues(ctrl, shutdown); - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); if (shutdown) nvme_shutdown_ctrl(ctrl); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index e17710e405a3..5b8a38c9475e 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -396,7 +396,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); error = nvme_init_identify(&ctrl->ctrl); if (error) @@ -426,7 +426,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_loop_destroy_io_queues(ctrl); } - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); -- Gitee From 99f0a498b96a7262d259450e0b54f5318453950b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 15 Nov 2021 19:53:34 +0800 Subject: [PATCH 107/134] nvme: prepare for pairing quiescing and unquiescing mainline inclusion from mainline-v5.16 commit ebc9b95260151d966728cf0063b3b4e465f934d9 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ebc9b95260151d966728cf0063b3b4e465f934d9 Signed-off-by: Yu Changchun --------------------------- Add two helpers so 
that we can prepare for pairing quiescing and unquiescing which will be done in next patch. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211014081710.1871747-4-ming.lei@redhat.com Signed-off-by: Jens Axboe conflict: commit d17e66aadbe5 ("nvme: use set_capacity_and_notify in nvme_set_queue_dying") is not backported. - drivers/nvme/host/core.c Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/nvme/host/core.c | 54 ++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dd425ae68c07..cdea98169131 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,26 +102,6 @@ static void nvme_update_bdev_size(struct gendisk *disk) } } -/* - * Prepare a queue for teardown. - * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. - */ -static void nvme_set_queue_dying(struct nvme_ns *ns) -{ - if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - return; - - blk_set_queue_dying(ns->queue); - blk_mq_unquiesce_queue(ns->queue); - - set_capacity(ns->disk, 0); - nvme_update_bdev_size(ns->disk); -} - static void nvme_queue_scan(struct nvme_ctrl *ctrl) { /* @@ -4540,6 +4520,36 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, } EXPORT_SYMBOL_GPL(nvme_init_ctrl); +static void nvme_start_ns_queue(struct nvme_ns *ns) +{ + blk_mq_unquiesce_queue(ns->queue); +} + +static void nvme_stop_ns_queue(struct nvme_ns *ns) +{ + blk_mq_quiesce_queue(ns->queue); +} + +/* + * Prepare a queue for teardown. + * + * This must forcibly unquiesce queues to avoid blocking dispatch, and only set + * the capacity to 0 after that to avoid blocking dispatchers that may be + * holding bd_butex. This will end buffered writers dirtying pages that can't + * be synced. 
+ */
+static void nvme_set_queue_dying(struct nvme_ns *ns)
+{
+	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+		return;
+
+	blk_set_queue_dying(ns->queue);
+	nvme_start_ns_queue(ns);
+
+	set_capacity(ns->disk, 0);
+	nvme_update_bdev_size(ns->disk);
+}
+
 /**
  * nvme_kill_queues(): Ends all namespace queues
  * @ctrl: the dead controller that needs to end
@@ -4618,7 +4628,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_quiesce_queue(ns->queue);
+		nvme_stop_ns_queue(ns);
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
@@ -4629,7 +4639,7 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_unquiesce_queue(ns->queue);
+		nvme_start_ns_queue(ns);
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
-- 
Gitee

From 50ee909a7dbb10778919b27a91bb05e8d7ad0f8d Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:35 +0800
Subject: [PATCH 108/134] nvme: pairing quiesce/unquiesce

mainline inclusion
from mainline-v5.16
commit 9e6a6b1212100148c109675e003369e3e219dbd9
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9e6a6b1212100148c109675e003369e3e219dbd9

Signed-off-by: Yu Changchun

---------------------------

The current blk_mq_quiesce_queue() and blk_mq_unquiesce_queue() always stop and start the queue unconditionally. And there can be concurrent quiesce/unquiesce coming from different unrelated code paths, so unquiesce may come unexpectedly and start the queue too early.

Prepare for supporting concurrent quiesce/unquiesce from multiple contexts, so that we can address the above issue.

NVMe has a very complicated quiesce/unquiesce usage pattern, so add one atomic bit to make sure that blk-mq quiesce/unquiesce is always called in pairs.

Signed-off-by: Ming Lei
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20211014081710.1871747-5-ming.lei@redhat.com
Signed-off-by: Jens Axboe

Conflict: commit 8c4dfea97f15 ("nvme-fabrics: reject I/O to offline device") is not backported, so struct nvme_ctrl doesn't have the member flags yet. That commit is a feature and contains a lot of code changes, thus introduce the new member in this patch.
- drivers/nvme/host/nvme.h

Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/nvme/host/core.c | 12 ++++++++----
 drivers/nvme/host/nvme.h | 3 +++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index cdea98169131..9ccf44592fe4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4522,12 +4522,14 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
 static void nvme_start_ns_queue(struct nvme_ns *ns)
 {
-	blk_mq_unquiesce_queue(ns->queue);
+	if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
+		blk_mq_unquiesce_queue(ns->queue);
 }
 
 static void nvme_stop_ns_queue(struct nvme_ns *ns)
 {
-	blk_mq_quiesce_queue(ns->queue);
+	if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
+		blk_mq_quiesce_queue(ns->queue);
 }
 
 /*
@@ -4646,13 +4648,15 @@ EXPORT_SYMBOL_GPL(nvme_start_queues);
 
 void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
 {
-	blk_mq_quiesce_queue(ctrl->admin_q);
+	if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+		blk_mq_quiesce_queue(ctrl->admin_q);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
 
 void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
 {
-	blk_mq_unquiesce_queue(ctrl->admin_q);
+	if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+		blk_mq_unquiesce_queue(ctrl->admin_q);
 }
 EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0cad04335e34..ad46208eac76 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -338,6 +338,8 @@ struct nvme_ctrl {
 	u16 icdoff;
 	u16 maxcmd;
 	int nr_reconnects;
+	unsigned long flags;
+#define NVME_CTRL_ADMIN_Q_STOPPED	0
 	struct nvmf_ctrl_options *opts;
 
 	struct page *discard_page;
@@ -449,6 +451,7 @@ struct nvme_ns {
 #define NVME_NS_REMOVING	0
 #define NVME_NS_DEAD		1
 #define NVME_NS_ANA_PENDING	2
+#define NVME_NS_STOPPED		3
 
 	struct nvme_fault_inject fault_inject;
 
-- 
Gitee

From 754a13e02ef4101a935e337c9166fc308f184d04 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:36 +0800
Subject: [PATCH 109/134] nvme: loop: clear NVME_CTRL_ADMIN_Q_STOPPED after admin queue is reallocated

mainline inclusion
from mainline-v5.16
commit 1d35d519d8bf224ccdb43f9a235b8bda2d6d453c
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1d35d519d8bf224ccdb43f9a235b8bda2d6d453c

Signed-off-by: Yu Changchun

---------------------------

The nvme-loop admin queue may be freed and reallocated, and we have to reset the NVME_CTRL_ADMIN_Q_STOPPED flag so that it matches the quiesce state of the new admin queue. nvme-loop is the only driver that reallocates its request queue; no such usage is seen in the other nvme drivers.
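The reset matters because the STOPPED flag lives in the controller while the quiesce state lives in the queue, and reallocation resets only the latter. A sketch of the hazard (an illustrative interleaving under the flag semantics above, not code from the patch; error handling elided):

	nvme_stop_admin_queue(ctrl);	/* sets NVME_CTRL_ADMIN_Q_STOPPED and
					 * quiesces the old admin_q */
	blk_cleanup_queue(ctrl->admin_q);	/* the quiesced queue is freed */
	ctrl->admin_q = blk_mq_init_queue(set);	/* fresh queue, not quiesced */

	/* without the clear_bit() added by this patch, the stale flag would
	 * make the next nvme_start_admin_queue() unquiesce a queue that was
	 * never quiesced, unbalancing the pairing */
	clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags);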
Signed-off-by: Ming Lei
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20211014081710.1871747-6-ming.lei@redhat.com
Signed-off-by: Jens Axboe
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/nvme/target/loop.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 5b8a38c9475e..a05dc29a43f9 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -382,6 +382,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
 		error = PTR_ERR(ctrl->ctrl.admin_q);
 		goto out_cleanup_fabrics_q;
 	}
+	/* reset stopped state for the fresh admin queue */
+	clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags);
 
 	error = nvmf_connect_admin_queue(&ctrl->ctrl);
 	if (error)
-- 
Gitee

From f126f406f47346e8ddb5e22572750667e7feefc5 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:37 +0800
Subject: [PATCH 110/134] blk-mq: support concurrent queue quiesce/unquiesce

mainline inclusion
from mainline-v5.16
commit e70feb8b3e6886c525c88943b5f1508d02f5a683
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e70feb8b3e6886c525c88943b5f1508d02f5a683

Signed-off-by: Yu Changchun

---------------------------

blk_mq_quiesce_queue() is used fairly widely now, but so far we don't support concurrent/nested quiesce. The biggest issue is that unquiesce can happen unexpectedly when quiesce/unquiesce are run concurrently from more than one context.

This patch introduces q->quiesce_depth to handle concurrent quiesce, and the queue is only unquiesced when the last/outer-most of all contexts unquiesces it.

Several kernel panic issues have been reported[1][2][3] when running stress quiesce tests, and this patch has been verified against those reports.
[1] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m1fc52431fad7f33b1ffc3f12c4450e4238540787
[2] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m10ad90afeb9c8cc318334190a7c24c8b5c5e0722
[3] https://listman.redhat.com/archives/dm-devel/2021-September/msg00189.html

Signed-off-by: Ming Lei
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20211014081710.1871747-7-ming.lei@redhat.com
Signed-off-by: Jens Axboe
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 block/blk-mq.c | 22 +++++++++++++++++++---
 include/linux/blkdev.h | 2 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 70301cc6b18d..1af4f9bbc0fe 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -206,7 +206,12 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 */
 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 {
-	blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->queue_lock, flags);
+	if (!q->quiesce_depth++)
+		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+	spin_unlock_irqrestore(&q->queue_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
@@ -247,10 +252,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 */
 void blk_mq_unquiesce_queue(struct request_queue *q)
 {
-	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+	unsigned long flags;
+	bool run_queue = false;
+
+	spin_lock_irqsave(&q->queue_lock, flags);
+	if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
+		;
+	} else if (!--q->quiesce_depth) {
+		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+		run_queue = true;
+	}
+	spin_unlock_irqrestore(&q->queue_lock, flags);
 
 	/* dispatch requests which are inserted during quiescing */
-	blk_mq_run_hw_queues(q, true);
+	if (run_queue)
+		blk_mq_run_hw_queues(q, true);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7517381b5e15..d6ae309a7dc3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -571,6 +571,8 @@ struct request_queue {
 	 */
 	struct mutex mq_freeze_lock;
 
+	int quiesce_depth;
+
 	struct blk_mq_tag_set *tag_set;
 	struct list_head tag_set_list;
 	struct bio_set bio_split;
-- 
Gitee

From 85442ae4e5f794069953506f1b867908beeca72e Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:38 +0800
Subject: [PATCH 111/134] dm: don't stop request queue after the dm device is suspended

mainline inclusion
from mainline-v5.16
commit a1c2f7e7f25c9d35d3bf046f99682c5373b20fa2
category: bugfix
issue: #I4PYGY
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a1c2f7e7f25c9d35d3bf046f99682c5373b20fa2

Signed-off-by: Yu Changchun

---------------------------

For fixing the queue quiesce race between driver and block layer (elevator switch, nr_requests update, ...), we need to support concurrent quiesce and unquiesce, which requires the two calls to be balanced.

__bind() is only called from dm_swap_table(), at which point the dm device has already been suspended, so there is no need to stop the queue again. This way, request queue quiesce and unquiesce stay balanced.
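The balance requirement comes from the quiesce_depth counter added in the previous patch; two contexts interleaving on one queue show its effect (an illustrative sequence, not code from either patch):

	blk_mq_quiesce_queue(q);	/* context A: depth 0 -> 1, QUIESCED set */
	blk_mq_quiesce_queue(q);	/* context B: depth 1 -> 2 */
	blk_mq_unquiesce_queue(q);	/* context A: depth 2 -> 1, queue stays
					 * quiesced for B */
	blk_mq_unquiesce_queue(q);	/* context B: depth 1 -> 0, QUIESCED
					 * cleared, hw queues re-run */

Before the counter, context A's unquiesce would have cleared QUEUE_FLAG_QUIESCED while context B still relied on the queue being quiesced, which is exactly the unexpected early start described above; an unbalanced extra unquiesce now trips the WARN_ON_ONCE() instead of corrupting state.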
Reported-by: Yi Zhang
Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce")
Signed-off-by: Ming Lei
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/md/dm.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 19a70f434029..3403722b1688 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2038,16 +2038,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
 	dm_table_event_callback(t, event_callback, md);
 
-	/*
-	 * The queue hasn't been stopped yet, if the old table type wasn't
-	 * for request-based during suspension. So stop it to prevent
-	 * I/O mapping before resume.
-	 * This must be done before setting the queue restrictions,
-	 * because request-based dm may be run just after the setting.
-	 */
-	if (request_based)
-		dm_stop_queue(q);
-
 	if (request_based) {
 		/*
 		 * Leverage the fact that request-based DM targets are
-- 
Gitee

From b41c4d926dafdf0943ec6944345ef4216dddef3d Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:39 +0800
Subject: [PATCH 112/134] scsi: avoid quiescing sdev->request_queue twice

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

For fixing the queue quiesce race between driver and block layer (elevator switch, nr_requests update, ...), we need to support concurrent quiesce and unquiesce, which requires the two to be balanced.

blk_mq_quiesce_queue() calls blk_mq_quiesce_queue_nowait() to update the quiesce depth and mark the flag, so scsi_internal_device_block() actually ends up calling blk_mq_quiesce_queue_nowait() twice. Fix the double quiesce and keep quiesce and unquiesce balanced.

Reported-by: Yi Zhang
Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce")
Signed-off-by: Ming Lei
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/scsi/scsi_lib.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index b26c922bd65d..2561d48dfade 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2618,6 +2618,14 @@ scsi_target_resume(struct scsi_target *starget)
 }
 EXPORT_SYMBOL(scsi_target_resume);
 
+static int __scsi_internal_device_block_nowait(struct scsi_device *sdev)
+{
+	if (scsi_device_set_state(sdev, SDEV_BLOCK))
+		return scsi_device_set_state(sdev, SDEV_CREATED_BLOCK);
+
+	return 0;
+}
+
 /**
  * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state
  * @sdev: device to block
@@ -2634,24 +2642,16 @@ EXPORT_SYMBOL(scsi_target_resume);
  */
 int scsi_internal_device_block_nowait(struct scsi_device *sdev)
 {
-	struct request_queue *q = sdev->request_queue;
-	int err = 0;
-
-	err = scsi_device_set_state(sdev, SDEV_BLOCK);
-	if (err) {
-		err = scsi_device_set_state(sdev, SDEV_CREATED_BLOCK);
-
-		if (err)
-			return err;
-	}
+	int ret = __scsi_internal_device_block_nowait(sdev);
 
 	/*
	 * The device has transitioned to SDEV_BLOCK. Stop the
	 * block layer from calling the midlayer with this device's
	 * request queue.
	 */
-	blk_mq_quiesce_queue_nowait(q);
-	return 0;
+	if (!ret)
+		blk_mq_quiesce_queue_nowait(sdev->request_queue);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);
 
@@ -2672,13 +2672,12 @@ EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);
 */
 static int scsi_internal_device_block(struct scsi_device *sdev)
 {
-	struct request_queue *q = sdev->request_queue;
 	int err;
 
 	mutex_lock(&sdev->state_mutex);
-	err = scsi_internal_device_block_nowait(sdev);
+	err = __scsi_internal_device_block_nowait(sdev);
 	if (err == 0)
-		blk_mq_quiesce_queue(q);
+		blk_mq_quiesce_queue(sdev->request_queue);
 	mutex_unlock(&sdev->state_mutex);
 
 	return err;
-- 
Gitee

From 573b1d2ad4c3e381a7cb60e4b9def44201097bb6 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 15 Nov 2021 19:53:40 +0800
Subject: [PATCH 113/134] scsi: make sure that request queue quiesce and unquiesce are balanced

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

---------------------------

For fixing the queue quiesce race between driver and block layer (elevator switch, nr_requests update, ...), we need to support concurrent quiesce and unquiesce, which requires the two calls to be balanced.

It isn't easy to audit that in all scsi drivers, especially since the two may be called from different contexts, so do it in the scsi core with one per-device bit flag and a global spinlock; this is basically zero cost since request queue quiesce is seldom triggered.

Reported-by: Yi Zhang
Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce")
Signed-off-by: Ming Lei
Signed-off-by: Yu Kuai
Reviewed-by: Jason Yan
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 drivers/scsi/scsi_lib.c | 45 ++++++++++++++++++++++++++++--------
 include/scsi/scsi_device.h | 1 +
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2561d48dfade..9582aabf2ff1 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2626,6 +2626,40 @@ static int __scsi_internal_device_block_nowait(struct scsi_device *sdev)
 	return 0;
 }
 
+static DEFINE_SPINLOCK(sdev_queue_stop_lock);
+
+void scsi_start_queue(struct scsi_device *sdev)
+{
+	bool need_start;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sdev_queue_stop_lock, flags);
+	need_start = sdev->queue_stopped;
+	sdev->queue_stopped = 0;
+	spin_unlock_irqrestore(&sdev_queue_stop_lock, flags);
+
+	if (need_start)
+		blk_mq_unquiesce_queue(sdev->request_queue);
+}
+
+static void scsi_stop_queue(struct scsi_device *sdev, bool nowait)
+{
+	bool need_stop;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sdev_queue_stop_lock, flags);
+	need_stop = !sdev->queue_stopped;
+	sdev->queue_stopped = 1;
+	spin_unlock_irqrestore(&sdev_queue_stop_lock, flags);
+
+	if (need_stop) {
+		if (nowait)
+			blk_mq_quiesce_queue_nowait(sdev->request_queue);
+		else
+			blk_mq_quiesce_queue(sdev->request_queue);
+	}
+}
+
 /**
  * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state
  * @sdev: device to block
@@ -2650,7 +2684,7 @@ int scsi_internal_device_block_nowait(struct scsi_device *sdev)
	 * request queue.
*/ if (!ret) - blk_mq_quiesce_queue_nowait(sdev->request_queue); + scsi_stop_queue(sdev, true); return ret; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); @@ -2677,19 +2711,12 @@ static int scsi_internal_device_block(struct scsi_device *sdev) mutex_lock(&sdev->state_mutex); err = __scsi_internal_device_block_nowait(sdev); if (err == 0) - blk_mq_quiesce_queue(sdev->request_queue); + scsi_stop_queue(sdev, false); mutex_unlock(&sdev->state_mutex); return err; } -void scsi_start_queue(struct scsi_device *sdev) -{ - struct request_queue *q = sdev->request_queue; - - blk_mq_unquiesce_queue(q); -} - /** * scsi_internal_device_unblock_nowait - resume a device after a block request * @sdev: device to resume diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 1a5c9a3df6d6..2b8c3df03f13 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -204,6 +204,7 @@ struct scsi_device { unsigned unmap_limit_for_ws:1; /* Use the UNMAP limit for WRITE SAME */ unsigned rpm_autosuspend:1; /* Enable runtime autosuspend at device * creation time */ + unsigned queue_stopped:1; /* request queue is quiesced */ bool offline_already; /* Device offline message logged */ -- Gitee From a7c2894c8f4e306e184d908b6ad7dbd45a706fbc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 15 Nov 2021 19:53:41 +0800 Subject: [PATCH 114/134] bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode mainline inclusion from mainline-v5.15-rc2 commit 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Signed-off-by: Yu Changchun ----------------------------- Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used. Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") embedded per-socket cgroup information into sock->sk_cgrp_data and in order to save 8 bytes in struct sock made both mutually exclusive, that is, when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2 falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp). The assumption made was "there is no reason to mix the two and this is in line with how legacy and v2 compatibility is handled" as stated in bd1060a1d671. However, with Kubernetes more widely supporting cgroups v2 as well nowadays, this assumption no longer holds, and the possibility of the v1/v2 mixed mode with the v2 root fallback being hit becomes a real security issue. Many of the cgroup v2 BPF programs are also used for policy enforcement, just to pick _one_ example, that is, to programmatically deny socket related system calls like connect(2) or bind(2). A v2 root fallback would implicitly cause a policy bypass for the affected Pods. In production environments, we have recently seen this case due to various circumstances: i) a different 3rd party agent and/or ii) a container runtime such as [0] in the user's environment configuring legacy cgroup v1 net_cls tags, which triggered implicitly mentioned root fallback. Another case is Kubernetes projects like kind [1] which create Kubernetes nodes in a container and also add cgroup namespaces to the mix, meaning programs which are attached to the cgroup v2 root of the cgroup namespace get attached to a non-root cgroup v2 path from init namespace point of view. And the latter's root is out of reach for agents on a kind Kubernetes node to configure. 
Meaning, any entity on the node setting cgroup v1 net_cls tag will trigger the bypass despite cgroup v2 BPF programs attached to the namespace root. Generally, this mutual exclusiveness does not hold anymore in today's user environments and makes cgroup v2 usage from BPF side fragile and unreliable. This fix adds proper struct cgroup pointer for the cgroup v2 case to struct sock_cgroup_data in order to address these issues; this implicitly also fixes the tradeoffs being made back then with regards to races and refcount leaks as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF programs always operate as expected. [0] https://github.com/nestybox/sysbox/ [1] https://kind.sigs.k8s.io/ Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Stanislav Fomichev Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net Conflicts: include/linux/cgroup-defs.h net/core/netclassid_cgroup.c net/core/netprio_cgroup.c Signed-off-by: Lu Jialin Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- include/linux/cgroup-defs.h | 107 +++++++++-------------------------- include/linux/cgroup.h | 22 +------ kernel/cgroup/cgroup.c | 50 ++++------------ net/core/netclassid_cgroup.c | 7 +-- net/core/netprio_cgroup.c | 11 +--- 5 files changed, 42 insertions(+), 155 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ffec16930b00..97ce0a12367c 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -766,107 +766,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * - * On legacy hierarchies, net_prio and net_cls controllers directly set - * attributes on each sock which can then be tested by the network layer. - * On the default hierarchy, each sock is associated with the cgroup it was - * created in and the networking layer can match the cgroup directly. - * - * To avoid carrying all three cgroup related fields separately in sock, - * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. - * On boot, sock_cgroup_data records the cgroup that the sock was created - * in so that cgroup2 matches can be made; however, once either net_prio or - * net_cls starts being used, the area is overriden to carry prioidx and/or - * classid. The two modes are distinguished by whether the lowest bit is - * set. Clear bit indicates cgroup pointer while set bit prioidx and - * classid. - * - * While userland may start using net_prio or net_cls at any time, once - * either is used, cgroup2 matching no longer works. There is no reason to - * mix the two and this is in line with how legacy and v2 compatibility is - * handled. On mode switch, cgroup references which are already being - * pointed to by socks may be leaked. While this can be remedied by adding - * synchronization around sock_cgroup_data, given that the number of leaked - * cgroups is bound and highly unlikely to be high, this seems to be the - * better trade-off. + * On legacy hierarchies, net_prio and net_cls controllers directly + * set attributes on each sock which can then be tested by the network + * layer. 
On the default hierarchy, each sock is associated with the + * cgroup it was created in and the networking layer can match the + * cgroup directly. */ struct sock_cgroup_data { - union { -#ifdef __LITTLE_ENDIAN - struct { - u8 is_data : 1; - u8 no_refcnt : 1; - u8 unused : 6; - u8 padding; - u16 prioidx; - u32 classid; - } __packed; -#else - struct { - u32 classid; - u16 prioidx; - u8 padding; - u8 unused : 6; - u8 no_refcnt : 1; - u8 is_data : 1; - } __packed; + struct cgroup *cgroup; /* v2 */ +#ifdef CONFIG_CGROUP_NET_CLASSID + u32 classid; /* v1 */ +#endif +#ifdef CONFIG_CGROUP_NET_PRIO + u16 prioidx; /* v1 */ #endif - u64 val; - }; }; -/* - * There's a theoretical window where the following accessors race with - * updaters and return part of the previous pointer as the prioidx or - * classid. Such races are short-lived and the result isn't critical. - */ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { - /* fallback to 1 which is always the ID of the root cgroup */ - return (skcd->is_data & 1) ? skcd->prioidx : 1; +#ifdef CONFIG_CGROUP_NET_PRIO + return READ_ONCE(skcd->prioidx); +#else + return 1; +#endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { - /* fallback to 0 which is the unconfigured default classid */ - return (skcd->is_data & 1) ? skcd->classid : 0; +#ifdef CONFIG_CGROUP_NET_CLASSID + return READ_ONCE(skcd->classid); +#else + return 0; +#endif } -/* - * If invoked concurrently, the updaters may clobber each other. The - * caller is responsible for synchronization. - */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_prioidx(&skcd_buf) == prioidx) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.prioidx = prioidx; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_PRIO + WRITE_ONCE(skcd->prioidx, prioidx); +#endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_classid(&skcd_buf) == classid) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.classid = classid; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_CLASSID + WRITE_ONCE(skcd->classid, classid); +#endif } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 618838c48313..9c88b7da3245 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -816,33 +816,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task, */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) -extern spinlock_t cgroup_sk_update_lock; -#endif - -void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - unsigned long v; - - /* - * @skcd->val is 64bit but the following is safe on 32bit too as we - * just need the lower ulong to be written and read atomically. 
- */ - v = READ_ONCE(skcd->val); - - if (v & 3) - return &cgrp_dfl_root.cgrp; - - return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; -#else - return (struct cgroup *)(unsigned long)skcd->val; -#endif + return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 017bfc698109..e735da6a4d4e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6439,74 +6439,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) return; rcu_read_lock(); - while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; + skcd->cgroup = cset->dfl_cgrp; cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); } - rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). 
+ */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 41b24cd31562..b6de5ee22391 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -72,11 +72,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file, &err); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + if (sock) sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); - spin_unlock(&cgroup_sk_update_lock); - } if (--ctx->batch == 0) { ctx->batch = UPDATE_CLASSID_BATCH; return n + 1; @@ -122,8 +119,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, struct css_task_iter it; struct task_struct *p; - cgroup_sk_alloc_disable(); - cs->classid = (u32)value; css_task_iter_start(css, 0, &it); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9bd4cab7d510..34c3e49814e2 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; - cgroup_sk_alloc_disable(); - rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -222,12 +220,11 @@ static int update_netprio(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + + if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); - spin_unlock(&cgroup_sk_update_lock); - } + return 0; } @@ -236,8 +233,6 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct task_struct *p; struct cgroup_subsys_state *css; - cgroup_sk_alloc_disable(); - cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; -- Gitee From 99027e355eb6358cccb3b1b23763ca78e5e99948 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 15 Nov 2021 19:53:42 +0800 Subject: [PATCH 115/134] bpf, cgroup: Assign cgroup in cgroup_sk_alloc when called from interrupt mainline inclusion from mainline-v5.15-rc4 commit 78cc316e9583067884eb8bd154301dc1e9ee945c issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=78cc316e9583067884eb8bd154301dc1e9ee945c Signed-off-by: Yu Changchun ------------------------------- If cgroup_sk_alloc() is called from interrupt context, then just assign the root cgroup to skcd->cgroup. Prior to commit 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode") we would just return, and later on in sock_cgroup_ptr(), we were NULL-testing the cgroup in fast-path, and iff indeed NULL returning the root cgroup (v ?: &cgrp_dfl_root.cgrp). Rather than re-adding the NULL-test to the fast-path we can just assign it once from cgroup_sk_alloc() given v1/v2 handling has been simplified. The migration from NULL test with returning &cgrp_dfl_root.cgrp to assigning &cgrp_dfl_root.cgrp directly does /not/ change behavior for callers of sock_cgroup_ptr(). syzkaller was able to trigger a splat in the legacy netrom code base, where the RX handler in nr_rx_frame() calls nr_make_new() which calls sk_alloc() and therefore cgroup_sk_alloc() with in_interrupt() condition. 
Thus the NULL skcd->cgroup, where it trips over on the cgroup_sk_free() side given it expects a non-NULL object.

There are a few other candidates aside from netrom which have a similar pattern where, in their accept-like implementation, they just call sk_alloc() and thus cgroup_sk_alloc() instead of sk_clone_lock() with the corresponding cgroup_sk_clone(), which then inherits the cgroup from the parent socket. None of them are related to core protocols where BPF cgroup programs are running from. However, in the future, they should follow suit and implement a similar inheritance mechanism.

Additionally, with a !CONFIG_CGROUP_NET_PRIO and !CONFIG_CGROUP_NET_CLASSID configuration, the same issue was exposed also prior to 8520e224f547 due to commit e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated cgroup") which added the early in_interrupt() return back then.

Fixes: 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode")
Fixes: e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated cgroup")
Reported-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com
Reported-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
Tested-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com
Tested-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com
Acked-by: Tejun Heo
Link: https://lore.kernel.org/bpf/20210927123921.21535-1-daniel@iogearbox.net
Signed-off-by: Lu Jialin
Reviewed-by: Xiu Jianfeng
Signed-off-by: Chen Jun
Signed-off-by: Zheng Zengkai
Signed-off-by: Yu Changchun
---
 kernel/cgroup/cgroup.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e735da6a4d4e..350297ad63f1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6441,22 +6441,29 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
 
 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 {
-	/* Don't associate the sock with unrelated interrupted task's cgroup. */
-	if (in_interrupt())
-		return;
+	struct cgroup *cgroup;
 
 	rcu_read_lock();
+	/* Don't associate the sock with unrelated interrupted task's cgroup. */
+	if (in_interrupt()) {
+		cgroup = &cgrp_dfl_root.cgrp;
+		cgroup_get(cgroup);
+		goto out;
+	}
+
 	while (true) {
 		struct css_set *cset;
 
 		cset = task_css_set(current);
 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
-			skcd->cgroup = cset->dfl_cgrp;
-			cgroup_bpf_get(cset->dfl_cgrp);
+			cgroup = cset->dfl_cgrp;
 			break;
 		}
 		cpu_relax();
 	}
+out:
+	skcd->cgroup = cgroup;
+	cgroup_bpf_get(cgroup);
 	rcu_read_unlock();
 }
-- 
Gitee

From e1d3894a000df9e757b6112954ad80c629c71112 Mon Sep 17 00:00:00 2001
From: Guo Xuenan
Date: Mon, 15 Nov 2021 19:53:43 +0800
Subject: [PATCH 116/134] ubi: fix slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130

maillist inclusion
category: bugfix
issue: #I4PYGY
CVE: NA

Signed-off-by: Yu Changchun

-------------------------------------------------

When using the ioctl interface to resize a ubi volume, ubi_resize_volume() resizes the eba table first, but does not change vol->reserved_pebs in the same atomic context, which may cause concurrent access to the eba table. For example, when a user shrinks ubi volume A by calling ubi_resize_volume() while another thread is writing (to volume B) and triggering wear-leveling, which may call ubi_write_fastmap(), KASAN may report: slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130.

The main work of this patch includes: 1.
Fix races in ubi_resize_volume and ubi_update_fastmap to avoid reading eba_tbl out of bounds. First, we make eba_tbl and reserved_pebs be updated under the protection of ubi->volumes_lock. Second, we roll the volume back in case of resize failure. Note that for a volume shrinking failure, since part of the volume has already been shrunk and unmapped, there is no need to recover {rsvd/avail}_pebs. 2. Fix memory leaks in the error path of ubi_resize_volume when destroying new_eba_tbl. BUG: KASAN: slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130 [ubi] Read of size 4 at addr ffff888013ff7170 by task kworker/u16:3/160 CPU: 6 PID: 160 Comm: kworker/u16:3 Not tainted Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: dump_stack+0x9c/0xcf print_address_description.constprop.0+0x1c/0x220 kasan_report.cold+0x1f/0x37 ubi_eba_get_ldesc+0xfb/0x130 [ubi] ubi_update_fastmap.cold+0x6be/0xc6b [ubi] ubi_wl_get_peb+0x2a2/0x580 [ubi] try_write_vid_and_data+0x9a/0x4d0 [ubi] ubi_eba_write_leb+0x780/0x1890 [ubi] ubi_leb_map+0x197/0x2c0 [ubi] ubifs_leb_map+0x139/0x240 [ubifs] ubifs_add_bud_to_log+0xb02/0xea0 [ubifs] make_reservation+0x860/0xb40 [ubifs] ubifs_jnl_write_data+0x461/0x9b0 [ubifs] do_writepage+0x375/0x550 [ubifs] ubifs_writepage+0x3a4/0x670 [ubifs] __writepage+0x61/0x160 write_cache_pages+0x433/0xb70 do_writepages+0x1ad/0x260 __writeback_single_inode+0xb3/0x810 writeback_sb_inodes+0x4d9/0xcc0 __writeback_inodes_wb+0xc1/0x260 wb_writeback+0x585/0x760 wb_workfn+0x751/0xdb0 process_one_work+0x6e5/0xf10 worker_thread+0x5e1/0x10c0 kthread+0x335/0x400 ret_from_fork+0x1f/0x30 Allocated by task 690: kasan_save_stack+0x1b/0x40 __kasan_kmalloc.constprop.0+0xc2/0xd0 ubi_eba_create_table+0x88/0x1a0 [ubi] ubi_resize_volume.cold+0x166/0xc75 [ubi] ubi_cdev_ioctl+0x57f/0x1aa0 [ubi] __x64_sys_ioctl+0x141/0x1a0 do_syscall_64+0x2d/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The following steps can be used to reproduce: Process 1: write and trigger ubi wear-leveling ubimkvol /dev/ubi0 -s 5000MiB -N v1 ubimkvol /dev/ubi0 -s 2000MiB -N v2 ubimkvol /dev/ubi0 -s 10MiB -N v3 mount -t ubifs /dev/ubi0_0 /mnt/ubifs while true; do filename=/mnt/ubifs/$((RANDOM)) dd if=/dev/random of=${filename} bs=1M count=$((RANDOM % 1000)) rm -rf ${filename} sync /mnt/ubifs/ done Process 2: do random resize struct ubi_rsvol_req req; req.vol_id = 1; req.bytes = (rand() % 50) * 512KB; ioctl(fd, UBI_IOCRSVOL, &req); Reported-by: Hulk Robot Signed-off-by: Guo Xuenan Reviewed-by: Zhang Xiaoxu Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/mtd/ubi/vmt.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 1bc7b3a05604..04d0e217ea1c 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -409,6 +409,7 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) struct ubi_device *ubi = vol->ubi; struct ubi_vtbl_record vtbl_rec; struct ubi_eba_table *new_eba_tbl = NULL; + struct ubi_eba_table *old_eba_tbl = NULL; int vol_id = vol->vol_id; if (ubi->ro_mode) @@ -454,10 +455,13 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) err = -ENOSPC; goto out_free; } + ubi->avail_pebs -= pebs; ubi->rsvd_pebs += pebs; ubi_eba_copy_table(vol, new_eba_tbl, vol->reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; +
vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -465,14 +469,16 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) for (i = 0; i < -pebs; i++) { err = ubi_eba_unmap_leb(ubi, vol, reserved_pebs + i); if (err) - goto out_acc; + goto out_free; } spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs += pebs; ubi->avail_pebs -= pebs; ubi_update_reserved(ubi); ubi_eba_copy_table(vol, new_eba_tbl, reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; + vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -494,27 +500,32 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) if (err) goto out_acc; - vol->reserved_pebs = reserved_pebs; if (vol->vol_type == UBI_DYNAMIC_VOLUME) { vol->used_ebs = reserved_pebs; vol->last_eb_bytes = vol->usable_leb_size; vol->used_bytes = (long long)vol->used_ebs * vol->usable_leb_size; } - + /* destroy old table */ + ubi_eba_destroy_table(old_eba_tbl); ubi_volume_notify(ubi, vol, UBI_VOLUME_RESIZED); self_check_volumes(ubi); return err; out_acc: + spin_lock(&ubi->volumes_lock); + vol->reserved_pebs = reserved_pebs - pebs; if (pebs > 0) { - spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs -= pebs; ubi->avail_pebs += pebs; - spin_unlock(&ubi->volumes_lock); + ubi_eba_copy_table(vol, old_eba_tbl, vol->reserved_pebs); + } else { + ubi_eba_copy_table(vol, old_eba_tbl, reserved_pebs); } + vol->eba_tbl = old_eba_tbl; + spin_unlock(&ubi->volumes_lock); out_free: - kfree(new_eba_tbl); + ubi_eba_destroy_table(new_eba_tbl); return err; } -- Gitee From a65a96a39ce2be562f340d52c8a5146592b4f607 Mon Sep 17 00:00:00 2001 From: Zheng Liang Date: Mon, 15 Nov 2021 19:53:47 +0800 Subject: [PATCH 117/134] squashfs: provides backing_dev_info in order to disable read-ahead maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ------------------------------------------------- The commit c1f6925e1091 ("mm: put readahead pages in cache earlier") causes the read performance of squashfs to deteriorate. Through testing, we find that performance recovers once squashfs read-ahead is disabled. So we follow the way of ubifs: provide a backing_dev_info and disable read-ahead.
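Why zeroing ra_pages is enough can be sketched from the VFS side (an illustrative aside; the exact mm details vary by kernel version, so treat this as an assumption-level sketch rather than the patch's own reasoning):

/*
 * Every file opened on the mount inherits its readahead window from
 * the superblock's bdi (mm/readahead.c, circa 5.10):
 *
 *     void file_ra_state_init(struct file_ra_state *ra,
 *                             struct address_space *mapping)
 *     {
 *             ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
 *             ...
 *     }
 *
 * With sb->s_bdi->ra_pages == 0 the on-demand readahead path sees a
 * zero-sized window and reads nothing speculatively, and io_pages == 0
 * clamps the explicit/forced readahead path as well, so only the pages
 * actually requested get read and decompressed.
 */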
Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/squashfs/super.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 88cc94be1076..f7128ae7b949 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -26,6 +26,7 @@ #include #include #include +#include <linux/backing-dev.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -64,6 +65,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } +static int squashfs_bdi_init(struct super_block *sb) +{ + int err; + unsigned int major = MAJOR(sb->s_dev); + unsigned int minor = MINOR(sb->s_dev); + + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + + err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); + if (err) + return err; + + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; + + return 0; +} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -78,6 +97,19 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); + /* + * squashfs provides 'backing_dev_info' in order to disable read-ahead. For + * squashfs, I/O is not deferred, it is done immediately in readpage, + * which means the user would have to wait not just for their own I/O + * but the read-ahead I/O as well i.e. completely pointless. squashfs_bdi_init + * will set sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0. + */ + err = squashfs_bdi_init(sb); + if (err) { + errorf(fc, "squashfs init bdi failed"); + return err; + } + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); -- Gitee From 5196327c6f70763d03841f02d0332a2571e7cc29 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 15 Nov 2021 19:53:49 +0800 Subject: [PATCH 118/134] net: phy: fix duplex out of sync problem while changing settings mainline inclusion from mainline commit: a4db9055fdb9cf607775c66d39796caf6439ec92 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a4db9055fdb9cf607775c66d39796caf6439ec92 Signed-off-by: Yu Changchun ------------------------------------------------- As reported by Zhang there's a small issue if in forced mode the duplex mode changes with the link staying up [0]. In this case the MAC isn't notified about the change. The proposed patch relies on the phylib state machine and ignores the fact that there are drivers that use phylib but not the phylib state machine. So let's not change the behavior for such drivers and fix it w/o re-adding state PHY_FORCING for the case that the phylib state machine is used.
[0] https://lore.kernel.org/netdev/a5c26ffd-4ee4-a5e6-4103-873208ce0dc5@huawei.com/T/ Fixes: 2bd229df5e2e ("net: phy: remove state PHY_FORCING") Reported-by: Zhang Changzhong Tested-by: Zhang Changzhong Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/7b8b9456-a93f-abbc-1dc5-a2c2542f932c@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Zhang Changzhong Reviewed-by: Wei Yongjun Reviewed-by: Yue Haibing Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/net/phy/phy.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 5ee7cde0c2e9..db7866b6f752 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -831,7 +831,12 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; /* Restart the PHY */ - _phy_start_aneg(phydev); + if (phy_is_started(phydev)) { + phydev->state = PHY_UP; + phy_trigger_machine(phydev); + } else { + _phy_start_aneg(phydev); + } mutex_unlock(&phydev->lock); return 0; -- Gitee From 8556aedf2dd0b93db9806f7a978be83fee162567 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 30 Nov 2021 16:32:03 +0800 Subject: [PATCH 119/134] ubifs: fix slab-out-of-bounds in ubifs_change_lp maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Hulk Robot reported a KASAN slab-out-of-bounds report: ================================================================== BUG: KASAN: slab-out-of-bounds in ubifs_change_lp+0x3a9/0x1390 [ubifs] Read of size 8 at addr ffff888101c961f8 by task fsstress/1068 [...] Call Trace: check_memory_region+0x1c1/0x1e0 ubifs_change_lp+0x3a9/0x1390 [ubifs] ubifs_change_one_lp+0x170/0x220 [ubifs] ubifs_garbage_collect+0x7f9/0xda0 [ubifs] ubifs_budget_space+0xfe4/0x1bd0 [ubifs] ubifs_write_begin+0x528/0x10c0 [ubifs] Allocated by task 1068: kasan_save_stack+0x23/0x50 __kasan_kmalloc+0x83/0xa0 kmemdup+0x25/0x50 ubifs_lpt_lookup_dirty+0x372/0xb00 [ubifs] ubifs_update_one_lp+0x46/0x260 [ubifs] ubifs_tnc_end_commit+0x98b/0x1720 [ubifs] do_commit+0x6cb/0x1950 [ubifs] ubifs_run_commit+0x15a/0x2b0 [ubifs] ubifs_budget_space+0x1061/0x1bd0 [ubifs] ubifs_write_begin+0x528/0x10c0 [ubifs] [...] ================================================================== In ubifs_garbage_collect(), if ubifs_find_dirty_leb() returns an error, lp is left uninitialized. But lp.lnum might be used in the out branch, where it holds a random value. If that value is -1 or another value that can pass the checks, a slab-out-of-bounds access may occur in ubifs_change_lp() later in the procedure. To solve this problem, we initialize lp.lnum to -1; ubifs_find_dirty_leb() then sets it to a valid LEB number (never -1) on success, and ubifs_return_leb() is executed only when lp.lnum != -1. Additionally, if we find a retained or indexing LEB and continue to the next loop iteration, but break before finding another LEB, the error path would call ubifs_return_leb() with the stale lnum and wrongly clear that LEB's "taken" flag. This bug is also fixed by this patch.
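The fixed control flow, distilled (a sketch of the hunks below, not the complete function):

/* sketch: per-iteration sentinel in ubifs_garbage_collect() */
for (i = 0; ; i++) {
        lp.lnum = -1;           /* no LEB picked in this iteration yet */
        ...
        ret = ubifs_find_dirty_leb(c, &lp, ...);
        if (ret)
                break;          /* lp untouched on error: lnum stays -1 */
        ...                     /* on success lp.lnum holds a real LEB */
}
...
/* error path: only return a LEB we actually hold */
if (lp.lnum != -1)
        ubifs_return_leb(c, lp.lnum);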
Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/gc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index dc3e26e9ed7b..05e1eeae8457 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -692,6 +692,9 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) for (i = 0; ; i++) { int space_before, space_after; + /* Maybe continue after find and break before find */ + lp.lnum = -1; + cond_resched(); /* Give the commit an opportunity to run */ @@ -843,7 +846,8 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) ubifs_wbuf_sync_nolock(wbuf); ubifs_ro_mode(c, ret); mutex_unlock(&wbuf->io_mutex); - ubifs_return_leb(c, lp.lnum); + if (lp.lnum != -1) + ubifs_return_leb(c, lp.lnum); return ret; } -- Gitee From 3984364bc3db5e2e434b5409394145db8bd3e7e0 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 30 Nov 2021 16:32:04 +0800 Subject: [PATCH 120/134] ubifs: fix double return leb in ubifs_garbage_collect maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- If ubifs_garbage_collect_leb() returns -EAGAIN and enters the "out" branch, ubifs_return_leb() will be executed twice on the same lnum. This can cause data loss in concurrent situations. Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/gc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 05e1eeae8457..238a60294d10 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -758,6 +758,8 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) err = ubifs_return_leb(c, lp.lnum); if (err) ret = err; + /* Maybe double return if go out */ + lp.lnum = -1; break; } goto out; -- Gitee From 8b9f8be4671a855818fdf826ec4178052aeb1bb7 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 30 Nov 2021 16:32:05 +0800 Subject: [PATCH 121/134] ubifs: read-only if LEB may always be taken in ubifs_garbage_collect maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- If ubifs_garbage_collect_leb() returns -EAGAIN and ubifs_return_leb() returns an error, a LEB would be left with its "taken" flag set forever. In this case, set the filesystem read-only to prevent a worse situation. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ubifs/gc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 238a60294d10..e43c8161423d 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -756,8 +756,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) * caller instead of the original '-EAGAIN'. */ err = ubifs_return_leb(c, lp.lnum); - if (err) + if (err) { ret = err; + /* LEB may always be "taken". So set + * the ubifs to read-only. Sync wbuf + * will return -EROFS, then go "out".
+ */ + ubifs_ro_mode(c, ret); + } /* Maybe double return if go out */ lp.lnum = -1; break; -- Gitee From c001574436a8a5f477992703380f2f741d520f35 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Tue, 30 Nov 2021 16:32:06 +0800 Subject: [PATCH 122/134] blkcg: Remove extra blkcg_bio_issue_init maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- KASAN reports a use-after-free report when doing block test: ================================================================== [10050.967049] BUG: KASAN: use-after-free in submit_bio_checks+0x1539/0x1550 [10050.977638] Call Trace: [10050.978190] dump_stack+0x9b/0xce [10050.979674] print_address_description.constprop.6+0x3e/0x60 [10050.983510] kasan_report.cold.9+0x22/0x3a [10050.986089] submit_bio_checks+0x1539/0x1550 [10050.989576] submit_bio_noacct+0x83/0xc80 [10050.993714] submit_bio+0xa7/0x330 [10050.994435] mpage_readahead+0x380/0x500 [10050.998009] read_pages+0x1c1/0xbf0 [10051.002057] page_cache_ra_unbounded+0x4c2/0x6f0 [10051.007413] do_page_cache_ra+0xda/0x110 [10051.008207] force_page_cache_ra+0x23d/0x3d0 [10051.009087] page_cache_sync_ra+0xca/0x300 [10051.009970] generic_file_buffered_read+0xbea/0x2130 [10051.012685] generic_file_read_iter+0x315/0x490 [10051.014472] blkdev_read_iter+0x113/0x1b0 [10051.015300] aio_read+0x2ad/0x450 [10051.023786] io_submit_one+0xc8e/0x1d60 [10051.029855] __se_sys_io_submit+0x125/0x350 [10051.033442] do_syscall_64+0x2d/0x40 [10051.034156] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [10051.048733] Allocated by task 18598: [10051.049482] kasan_save_stack+0x19/0x40 [10051.050263] __kasan_kmalloc.constprop.1+0xc1/0xd0 [10051.051230] kmem_cache_alloc+0x146/0x440 [10051.052060] mempool_alloc+0x125/0x2f0 [10051.052818] bio_alloc_bioset+0x353/0x590 [10051.053658] mpage_alloc+0x3b/0x240 [10051.054382] do_mpage_readpage+0xddf/0x1ef0 [10051.055250] mpage_readahead+0x264/0x500 [10051.056060] read_pages+0x1c1/0xbf0 [10051.056758] page_cache_ra_unbounded+0x4c2/0x6f0 [10051.057702] do_page_cache_ra+0xda/0x110 [10051.058511] force_page_cache_ra+0x23d/0x3d0 [10051.059373] page_cache_sync_ra+0xca/0x300 [10051.060198] generic_file_buffered_read+0xbea/0x2130 [10051.061195] generic_file_read_iter+0x315/0x490 [10051.062189] blkdev_read_iter+0x113/0x1b0 [10051.063015] aio_read+0x2ad/0x450 [10051.063686] io_submit_one+0xc8e/0x1d60 [10051.064467] __se_sys_io_submit+0x125/0x350 [10051.065318] do_syscall_64+0x2d/0x40 [10051.066082] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [10051.067455] Freed by task 13307: [10051.068136] kasan_save_stack+0x19/0x40 [10051.068931] kasan_set_track+0x1c/0x30 [10051.069726] kasan_set_free_info+0x1b/0x30 [10051.070621] __kasan_slab_free+0x111/0x160 [10051.071480] kmem_cache_free+0x94/0x460 [10051.072256] mempool_free+0xd6/0x320 [10051.072985] bio_free+0xe0/0x130 [10051.073630] bio_put+0xab/0xe0 [10051.074252] bio_endio+0x3a6/0x5d0 [10051.074984] blk_update_request+0x590/0x1370 [10051.075870] scsi_end_request+0x7d/0x400 [10051.076667] scsi_io_completion+0x1aa/0xe50 [10051.077503] scsi_softirq_done+0x11b/0x240 [10051.078344] blk_mq_complete_request+0xd4/0x120 [10051.079275] scsi_mq_done+0xf0/0x200 [10051.080036] virtscsi_vq_done+0xbc/0x150 [10051.080850] vring_interrupt+0x179/0x390 [10051.081650] __handle_irq_event_percpu+0xf7/0x490 [10051.082626] handle_irq_event_percpu+0x7b/0x160 [10051.083527] handle_irq_event+0xcc/0x170 [10051.084297] handle_edge_irq+0x215/0xb20 [10051.085122] asm_call_irq_on_stack+0xf/0x20 [10051.085986] 
common_interrupt+0xae/0x120 [10051.086830] asm_common_interrupt+0x1e/0x40 ================================================================== A bio is checked at the beginning of submit_bio_noacct(). If the bio needs to be throttled, the pending timer is started and submission stops immediately; the bio is then submitted from blk_throtl_dispatch_work_fn() when the timer expires. But in the current code, even when the bio is throttled, bio issue->value is still set via blkcg_bio_issue_init(). This is redundant and may cause the above use-after-free:

CPU0                                     CPU1
submit_bio
  submit_bio_noacct
    submit_bio_checks
      blk_throtl_bio()
        <= mod_timer(&sq->pending_timer)
                                         blk_throtl_dispatch_work_fn
                                           submit_bio_noacct()
                                             <= bio has the throttle tag, is
                                                submitted directly, and bio
                                                issue->value is set here
                                         bio_endio()
                                           bio_put()
                                             bio_free() <= free this bio
      blkcg_bio_issue_init(bio)
        <= bio has been freed and will lead to UAF
      return BLK_QC_T_NONE

Fix this by removing the extra blkcg_bio_issue_init() call. Fixes: e439bedf6b24 (blkcg: consolidate bio_issue_init() to be a part of core) Signed-off-by: Laibin Qiu Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 63e9c0d78537..a50205bcb755 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -897,10 +897,8 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); - if (blk_throtl_bio(bio)) { - blkcg_bio_issue_init(bio); + if (blk_throtl_bio(bio)) return false; - } blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); -- Gitee From 2303356bfaa7b1f555df668a775379f973bc88eb Mon Sep 17 00:00:00 2001 From: Li Hua Date: Tue, 30 Nov 2021 16:32:07 +0800 Subject: [PATCH 123/134] sched/rt: Try to restart rt period timer when rt runtime exceeded maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- When rt_runtime is modified from -1 to a valid control value, it may cause the task to be throttled all the time. Operations like the following will trigger the bug. E.g.: 1. echo -1 > /proc/sys/kernel/sched_rt_runtime_us 2. Run a FIFO task named A that executes while(1) 3. echo 950000 > /proc/sys/kernel/sched_rt_runtime_us When rt_runtime is -1, the rt period timer will not be activated when task A is enqueued, and the task will then be throttled after setting rt_runtime to 950,000. The task remains throttled forever because the rt period timer is never activated.
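The underlying mechanism, sketched (an editorial summary, simplified from kernel/sched/rt.c; not part of the patch text):

/*
 * sched_rt_runtime_exceeded() throttles the runqueue by setting
 * rt_rq->rt_throttled = 1.  The only path that replenishes runtime and
 * clears rt_throttled is do_sched_rt_period_timer(), driven by
 * rt_b->rt_period_timer.  While rt_runtime is -1 (RUNTIME_INF),
 * rt_bandwidth_enabled() is false and that timer is never armed at
 * enqueue time; switching to a finite runtime afterwards throttles the
 * rt_rq with no timer pending to unthrottle it.  Hence the fix below
 * re-arms the period timer at the moment the runtime limit trips.
 */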
Signed-off-by: Li Hua Reviewed-by: Chen Hui Reviewed-by: Cheng Jian Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- kernel/sched/rt.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad9bec90fb7a..dae1e8eaa983 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -944,6 +944,18 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static inline void try_start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + raw_spin_lock(&rt_b->rt_runtime_lock); + if (!rt_b->rt_period_active) { + rt_b->rt_period_active = 1; + hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS_PINNED_HARD); + } + raw_spin_unlock(&rt_b->rt_runtime_lock); +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -1020,13 +1032,17 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + try_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } } @@ -2721,8 +2737,10 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { + raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock); def_rt_bandwidth.rt_runtime = global_rt_runtime(); def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); + raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock); } int sched_rt_handler(struct ctl_table *table, int write, void *buffer, -- Gitee From 0b15b21d90a7cc3dfdfb6239bdfbfa1f93624504 Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Tue, 30 Nov 2021 16:32:09 +0800 Subject: [PATCH 124/134] locks: Fix UBSAN undefined behaviour in flock64_to_posix_lock mainline inclusion from mainline-v5.11-rc1 commit 16238415eb9886328a89fe7a3cb0b88c7335fe16 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=16238415eb9886328a89fe7a3cb0b88c7335fe16 Signed-off-by: Yu Changchun ----------------------------------------------- When the sum of fl->fl_start and l->l_len overflows, UBSAN shows the following warning: UBSAN: Undefined behaviour in fs/locks.c:482:29 signed integer overflow: 2 + 9223372036854775806 cannot be represented in type 'long long int' Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xe4/0x14e lib/dump_stack.c:118 ubsan_epilogue+0xe/0x81 lib/ubsan.c:161 handle_overflow+0x193/0x1e2 lib/ubsan.c:192 flock64_to_posix_lock fs/locks.c:482 [inline] flock_to_posix_lock+0x595/0x690 fs/locks.c:515 fcntl_setlk+0xf3/0xa90 fs/locks.c:2262 do_fcntl+0x456/0xf60 fs/fcntl.c:387 __do_sys_fcntl fs/fcntl.c:483 [inline] __se_sys_fcntl fs/fcntl.c:468 [inline] __x64_sys_fcntl+0x12d/0x180 fs/fcntl.c:468 do_syscall_64+0xc8/0x5a0 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix it by parenthesizing 'l->l_len - 1'. 
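For clarity, the arithmetic with the values from the splat above (an illustrative aside, not part of the patch):

/*
 * fl_start == 2, l_len == 9223372036854775806 == LLONG_MAX - 1,
 * OFFSET_MAX == LLONG_MAX.
 *
 * Range check:  l_len - 1             == LLONG_MAX - 2
 *               OFFSET_MAX - fl_start == LLONG_MAX - 2
 *               -> not greater, so no -EOVERFLOW is returned.
 *
 * Old code: fl_start + l_len - 1 groups as (fl_start + l_len) - 1, and
 *           2 + (LLONG_MAX - 1) == LLONG_MAX + 1: signed overflow, UB.
 * New code: fl_start + (l_len - 1) == 2 + (LLONG_MAX - 2) == LLONG_MAX,
 *           which is representable, so the result is well defined.
 */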
Signed-off-by: Luo Meng Signed-off-by: Jeff Layton Signed-off-by: Luo Meng Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/locks.c b/fs/locks.c index 32c948fe2944..6e118955b9b6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -542,7 +542,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, if (l->l_len > 0) { if (l->l_len - 1 > OFFSET_MAX - fl->fl_start) return -EOVERFLOW; - fl->fl_end = fl->fl_start + l->l_len - 1; + fl->fl_end = fl->fl_start + (l->l_len - 1); } else if (l->l_len < 0) { if (fl->fl_start + l->l_len < 0) -- Gitee From 6688d8b6e44745230d6dbfddea4cdc429be9c111 Mon Sep 17 00:00:00 2001 From: Huang Guobin Date: Tue, 30 Nov 2021 16:32:13 +0800 Subject: [PATCH 125/134] bonding: Fix a use-after-free problem when bond_sysfs_slave_add() failed mainline inclusion from mainline commit b93c6a911a3fe926b00add28f3b932007827c4ca category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b93c6a911a3fe926b00add28f3b932007827c4ca Signed-off-by: Yu Changchun ----------------------------- When I do fuzz test for bonding device interface, I got the following use-after-free Calltrace: ================================================================== BUG: KASAN: use-after-free in bond_enslave+0x1521/0x24f0 Read of size 8 at addr ffff88825bc11c00 by task ifenslave/7365 CPU: 5 PID: 7365 Comm: ifenslave Tainted: G E 5.15.0-rc1+ #13 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 Call Trace: dump_stack_lvl+0x6c/0x8b print_address_description.constprop.0+0x48/0x70 kasan_report.cold+0x82/0xdb __asan_load8+0x69/0x90 bond_enslave+0x1521/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f19159cf577 Code: b3 66 90 48 8b 05 11 89 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 78 RSP: 002b:00007ffeb3083c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffeb3084bca RCX: 00007f19159cf577 RDX: 00007ffeb3083ce0 RSI: 0000000000008990 RDI: 0000000000000003 RBP: 00007ffeb3084bc4 R08: 0000000000000040 R09: 0000000000000000 R10: 00007ffeb3084bc0 R11: 0000000000000246 R12: 00007ffeb3083ce0 R13: 0000000000000000 R14: 0000000000000000 R15: 00007ffeb3083cb0 Allocated by task 7365: kasan_save_stack+0x23/0x50 __kasan_kmalloc+0x83/0xa0 kmem_cache_alloc_trace+0x22e/0x470 bond_enslave+0x2e1/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 7365: kasan_save_stack+0x23/0x50 kasan_set_track+0x20/0x30 kasan_set_free_info+0x24/0x40 __kasan_slab_free+0xf2/0x130 kfree+0xd1/0x5c0 slave_kobj_release+0x61/0x90 kobject_put+0x102/0x180 bond_sysfs_slave_add+0x7a/0xa0 bond_enslave+0x11b6/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Last potentially related work creation: kasan_save_stack+0x23/0x50 kasan_record_aux_stack+0xb7/0xd0 
insert_work+0x43/0x190 __queue_work+0x2e3/0x970 delayed_work_timer_fn+0x3e/0x50 call_timer_fn+0x148/0x470 run_timer_softirq+0x8a8/0xc50 __do_softirq+0x107/0x55f Second to last potentially related work creation: kasan_save_stack+0x23/0x50 kasan_record_aux_stack+0xb7/0xd0 insert_work+0x43/0x190 __queue_work+0x2e3/0x970 __queue_delayed_work+0x130/0x180 queue_delayed_work_on+0xa7/0xb0 bond_enslave+0xe25/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88825bc11c00 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 0 bytes inside of 1024-byte region [ffff88825bc11c00, ffff88825bc12000) The buggy address belongs to the page: page:ffffea00096f0400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x25bc10 head:ffffea00096f0400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0x57ff00000010200(slab|head|node=1|zone=2|lastcpupid=0x7ff) raw: 057ff00000010200 ffffea0009a71c08 ffff888240001968 ffff88810004dbc0 raw: 0000000000000000 00000000000a000a 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88825bc11b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88825bc11b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88825bc11c00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88825bc11c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88825bc11d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Putting new_slave in bond_sysfs_slave_add() will cause use-after-free problems when new_slave is accessed in the subsequent error handling process. Since new_slave will be put in that subsequent error handling anyway, remove the unnecessary put to fix it. In addition, when sysfs_create_file() fails, if some files have been created successfully, we need to call sysfs_remove_file() to remove them. Since sysfs_create_files() & sysfs_remove_files() are available, use these two functions instead. Fixes: 7afcaec49696 (bonding: use kobject_put instead of _del after kobject_add) Signed-off-by: Huang Guobin Reviewed-by: Jakub Kicinski Signed-off-by: David S.
Miller Signed-off-by: Huang Guobin Reviewed-by: Wei Yongjun Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- drivers/net/bonding/bond_sysfs_slave.c | 36 ++++++++------------------ 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/drivers/net/bonding/bond_sysfs_slave.c b/drivers/net/bonding/bond_sysfs_slave.c index fd07561da034..6a6cdd0bb258 100644 --- a/drivers/net/bonding/bond_sysfs_slave.c +++ b/drivers/net/bonding/bond_sysfs_slave.c @@ -108,15 +108,15 @@ static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) } static SLAVE_ATTR_RO(ad_partner_oper_port_state); -static const struct slave_attribute *slave_attrs[] = { - &slave_attr_state, - &slave_attr_mii_status, - &slave_attr_link_failure_count, - &slave_attr_perm_hwaddr, - &slave_attr_queue_id, - &slave_attr_ad_aggregator_id, - &slave_attr_ad_actor_oper_port_state, - &slave_attr_ad_partner_oper_port_state, +static const struct attribute *slave_attrs[] = { + &slave_attr_state.attr, + &slave_attr_mii_status.attr, + &slave_attr_link_failure_count.attr, + &slave_attr_perm_hwaddr.attr, + &slave_attr_queue_id.attr, + &slave_attr_ad_aggregator_id.attr, + &slave_attr_ad_actor_oper_port_state.attr, + &slave_attr_ad_partner_oper_port_state.attr, NULL }; @@ -137,24 +137,10 @@ const struct sysfs_ops slave_sysfs_ops = { int bond_sysfs_slave_add(struct slave *slave) { - const struct slave_attribute **a; - int err; - - for (a = slave_attrs; *a; ++a) { - err = sysfs_create_file(&slave->kobj, &((*a)->attr)); - if (err) { - kobject_put(&slave->kobj); - return err; - } - } - - return 0; + return sysfs_create_files(&slave->kobj, slave_attrs); } void bond_sysfs_slave_del(struct slave *slave) { - const struct slave_attribute **a; - - for (a = slave_attrs; *a; ++a) - sysfs_remove_file(&slave->kobj, &((*a)->attr)); + sysfs_remove_files(&slave->kobj, slave_attrs); } -- Gitee From 95e488ccf33b95419dc963e105feee06a2b44ea8 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 30 Nov 2021 16:32:15 +0800 Subject: [PATCH 126/134] block: avoid quiesce while elevator init maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------------- As blk_mq_quiesce_queue() in elevator_init_mq() waits for an RCU grace period to make sure no I/O happens while blk_mq_init_sched() runs, a system with lots of devices boots slowly. To address this issue, follow Ming Lei's suggestion: "We are called before adding disk, when there isn't any FS I/O, so freezing queue plus canceling dispatch work is enough to drain any dispatch activities originated from passthrough requests, then no need to quiesce queue which may add long boot latency, especially when lots of disks are involved."
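The before/after sequence, sketched (editorial comments; see the diff below):

/* before: one RCU grace-period wait per request queue */
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
err = blk_mq_init_sched(q, e);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);

/* after: just flush the requeue and run_work items, no RCU wait */
blk_mq_freeze_queue(q);
blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
blk_mq_unfreeze_queue(q);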
Signed-off-by: Ye Bin Reviewed-by: Jason Yan Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-mq.c | 13 +++++++++++++ block/blk-mq.h | 1 + block/elevator.c | 10 ++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1af4f9bbc0fe..a9aac1dd0f89 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3992,6 +3992,19 @@ unsigned int blk_mq_rq_cpu(struct request *rq) } EXPORT_SYMBOL(blk_mq_rq_cpu); +void blk_mq_cancel_work_sync(struct request_queue *q) +{ + if (queue_is_mq(q)) { + struct blk_mq_hw_ctx *hctx; + int i; + + cancel_delayed_work_sync(&q->requeue_work); + + queue_for_each_hw_ctx(q, hctx, i) + cancel_delayed_work_sync(&hctx->run_work); + } +} + static int __init blk_mq_init(void) { int i; diff --git a/block/blk-mq.h b/block/blk-mq.h index f792a0920ebb..6f87c0681443 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -129,6 +129,7 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); +void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, diff --git a/block/elevator.c b/block/elevator.c index 4ce6b22813a1..27eb70ec277a 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -684,12 +684,18 @@ void elevator_init_mq(struct request_queue *q) if (!e) return; + /* + * We are called before adding disk, when there isn't any FS I/O, + * so freezing queue plus canceling dispatch work is enough to + * drain any dispatch activities originated from passthrough + * requests, then no need to quiesce queue which may add long boot + * latency, especially when lots of disks are involved. + */ blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); + blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); - blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); if (err) { -- Gitee From 2928914245903fffb12db115de3e46186b0757ee Mon Sep 17 00:00:00 2001 From: Lakshmi Ramasubramanian Date: Tue, 30 Nov 2021 16:32:58 +0800 Subject: [PATCH 127/134] ima: Fix warning: no previous prototype for function 'ima_add_kexec_buffer' mainline inclusion from mainline-5.14 commit: c67913492fec317bc53ffdff496b6ba856d2868c category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c67913492fec317bc53ffdff496b6ba856d2868c Signed-off-by: Yu Changchun --------------------------- The function prototype for ima_add_kexec_buffer() is present in 'linux/ima.h'. But this header file is not included in ima_kexec.c where the function is implemented. This results in the following compiler warning when "-Wmissing-prototypes" flag is turned on: security/integrity/ima/ima_kexec.c:81:6: warning: no previous prototype for function 'ima_add_kexec_buffer' [-Wmissing-prototypes] Include the header file 'linux/ima.h' in ima_kexec.c to fix the compiler warning. 
Fixes: dce92f6b11c3 (arm64: Enable passing IMA log to next kernel on kexec) Reported-by: kernel test robot Signed-off-by: Lakshmi Ramasubramanian Acked-by: Rob Herring Signed-off-by: Mimi Zohar Signed-off-by: Guo Zihua Reviewed-by: Xiu Jianfeng Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- security/integrity/ima/ima_kexec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index e29bea3dd4cc..a5ded568cc16 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -10,6 +10,7 @@ #include #include #include +#include <linux/ima.h> #include "ima.h" #ifdef CONFIG_IMA_KEXEC -- Gitee From 40e990133f1a49d34a8e3b2716ac9c69a732ddce Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 30 Nov 2021 16:32:59 +0800 Subject: [PATCH 128/134] ext4: fix e2fsprogs checksum failure for mounted filesystem mainline inclusion from mainline-v5.15 commit b2bbb92f7042e8075fb036bf97043339576330c3 category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b2bbb92f7042e8075fb036bf97043339576330c3 Signed-off-by: Yu Changchun ------------------------------------------------- Commit 81414b4dd48 ("ext4: remove redundant sb checksum recomputation") removed checksum recalculation after updating superblock free space / inode counters in ext4_fill_super() based on the fact that we will recalculate the checksum on superblock writeout. That is a correct assumption, but until the writeout happens (which can take a long time) the checksum is incorrect in the buffer cache, and programs such as tune2fs or resize2fs, if called shortly after a file system is mounted, can fail. So bring back the checksum recalculation and add a comment explaining why. Fixes: 81414b4dd48f ("ext4: remove redundant sb checksum recomputation") Cc: stable@kernel.org Reported-by: Boyang Xue Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210812124737.21981-1-jack@suse.cz Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16e228285967..68aa31dc30dd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5001,6 +5001,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } + /* + * Update the checksum after updating free space/inode + * counters. Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail.
+ */ + ext4_superblock_csum_set(sb); if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb), GFP_KERNEL); -- Gitee From cfe5a4b1a35d07048e3713784ed6cd2bb24d258f Mon Sep 17 00:00:00 2001 From: Kevin Locke Date: Tue, 30 Nov 2021 16:33:00 +0800 Subject: [PATCH 129/134] ovl: warn about orphan metacopy mainline inclusion from mainline-v5.11-rc1 commit 0a8d0b64dd6acfbc9e9b79022654bbe1ade4a29a category: bugfix issue: #I4PYGY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0a8d0b64dd6acfbc9e9b79022654bbe1ade4a29a Signed-off-by: Yu Changchun ------------------------------------------------- When the lower file of a metacopy is inaccessible, -EIO is returned. For users not familiar with overlayfs internals, such as myself, the meaning of this error may not be apparent or easy to determine, since the (metacopy) file is present and open/stat succeed when accessed outside of the overlay. Add a rate-limited warning for orphan metacopy to give users a hint when investigating such errors. Link: https://lore.kernel.org/linux-unionfs/CAOQ4uxi23Zsmfb4rCed1n=On0NNA5KZD74jjjeyz+et32sk-gg@mail.gmail.com/ Signed-off-by: Kevin Locke Signed-off-by: Miklos Szeredi Signed-off-by: Zheng Liang Reviewed-by: Zhang Yi Signed-off-by: Chen Jun Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/overlayfs/namei.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 092812c2f118..b1c8a38573c9 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1004,6 +1004,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * Just make sure a corresponding data dentry has been found. */ if (d.metacopy || (uppermetacopy && !ctr)) { + pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", + dentry); err = -EIO; goto out_put; } else if (!d.is_dir && upperdentry && !ctr && origin_path) { -- Gitee From 8f3ccc8a7e79a5b98828991e4526631a4d9f31b2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Dec 2021 18:22:53 +0800 Subject: [PATCH 130/134] block: return errors from blk_execute_rq() mainline inclusion from mainline-v5.14-rc1 commit fb9b16e15cd70e21d8af7f03d700deb9509c2ce8 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ----------------------------------------- The synchronous blk_execute_rq() had not provided a way for its callers to know if its request was successful or not. Return the blk_status_t result of the request. Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210610214437.641245-4-kbusch@kernel.org Signed-off-by: Jens Axboe conflict: 1. in blkdev.h and blk-exec, blk_execute_rq return value change; 2. 
input parameter in blk_execute_rq is not the same as mainline; Signed-off-by: zhangwensheng Reviewed-by: qiulaibin Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/blk-exec.c | 7 +++++-- include/linux/blkdev.h | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/block/blk-exec.c b/block/blk-exec.c index 85324d53d072..b2676de4c6a5 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -21,7 +21,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) { struct completion *waiting = rq->end_io_data; - rq->end_io_data = NULL; + rq->end_io_data = (void *)(uintptr_t)error; /* * complete last, if this is a stack request the process (and thus @@ -75,8 +75,9 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. + * Return: The blk_status_t result provided to blk_mq_end_request(). */ -void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, +blk_status_t blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); @@ -91,5 +92,7 @@ void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); else wait_for_completion_io(&wait); + + return (blk_status_t)(uintptr_t)rq->end_io_data; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d6ae309a7dc3..31febd3e14de 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -951,10 +951,10 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern void blk_execute_rq(struct request_queue *, struct gendisk *, - struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +blk_status_t blk_execute_rq(struct request_queue *, struct gendisk *, + struct request *, int); /* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char *blk_op_str(unsigned int op); -- Gitee From 23a59bc599a3c7923bea4e2f82a4ff6e3ff8cf38 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 10 Dec 2021 18:22:51 +0800 Subject: [PATCH 131/134] bfq: Remove merged request already in bfq_requests_merged() mainline inclusion from mainline-v5.14-rc1 commit a921c655f2033dd1ce1379128efe881dda23ea37 category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- Currently, bfq does very little in bfq_requests_merged() and handles all the request cleanup in bfq_finish_requeue_request() called from blk_mq_free_request(). That is currently safe only because blk_mq_free_request() is called shortly after bfq_requests_merged() while bfqd->lock is still held. However to fix a lock inversion between bfqd->lock and ioc->lock, we need to call blk_mq_free_request() after dropping bfqd->lock. That would mean that already merged request could be seen by other processes inside bfq queues and possibly dispatched to the device which is wrong. So move cleanup of the request from bfq_finish_requeue_request() to bfq_requests_merged(). 
Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210623093634.27879-2-jack@suse.cz Signed-off-by: Jens Axboe conflict: in bfq_finish_requeue_request, code in hulk have the line atomic_dec(&rq->mq_hctx->elevator_queued); that is conflicted; Signed-off-by: zhangwensheng Reviewed-by: qiulaibin Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/bfq-iosched.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fd3c23d516b8..27e01b4cd528 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2326,7 +2326,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, *next_bfqq = bfq_init_rq(next); if (!bfqq) - return; + goto remove; /* * If next and rq belong to the same bfq_queue and next is older @@ -2349,6 +2349,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +remove: + /* Merged request may be in the IO scheduler. Remove it. */ + if (!RB_EMPTY_NODE(&next->rb_node)) { + bfq_remove_request(next->q, next); + if (next_bfqq) + bfqg_stats_update_io_remove(bfqq_group(next_bfqq), + next->cmd_flags); + } } /* Must be called with bfqq != NULL */ @@ -5901,6 +5909,7 @@ static void bfq_finish_requeue_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; + unsigned long flags; /* * rq either is not associated with any icq, or is an already @@ -5918,40 +5927,16 @@ static void bfq_finish_requeue_request(struct request *rq) rq->io_start_time_ns, rq->cmd_flags); + spin_lock_irqsave(&bfqd->lock, flags); if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - if (rq == bfqd->waited_rq) bfq_update_inject_limit(bfqd, bfqq); bfq_completed_request(bfqq, bfqd); - bfq_finish_requeue_request_body(bfqq); atomic_dec(&rq->mq_hctx->elevator_queued); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, - * in which case we need to remove it (this should - * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. - * This situation seems to occur only in process - * context, as a consequence of a merge. In the - * current version of the code, this implies that the - * lock is held. - */ - - if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } - bfq_finish_requeue_request_body(bfqq); } + bfq_finish_requeue_request_body(bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); /* * Reset private fields. 
In case of a requeue, this allows -- Gitee From 0cb8b002d2f4713c9bb0da10492f242238da0737 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 10 Dec 2021 18:22:52 +0800 Subject: [PATCH 132/134] blk: Fix lock inversion between ioc lock and bfqd lock mainline inclusion from mainline-v5.14-rc1 commit fd2ef39cc9a6b9c4c41864ac506906c52f94b06a category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun ------------------------------ Lockdep complains about lock inversion between ioc->lock and bfqd->lock: bfqd -> ioc: put_io_context+0x33/0x90 -> ioc->lock grabbed blk_mq_free_request+0x51/0x140 blk_put_request+0xe/0x10 blk_attempt_req_merge+0x1d/0x30 elv_attempt_insert_merge+0x56/0xa0 blk_mq_sched_try_insert_merge+0x4b/0x60 bfq_insert_requests+0x9e/0x18c0 -> bfqd->lock grabbed blk_mq_sched_insert_requests+0xd6/0x2b0 blk_mq_flush_plug_list+0x154/0x280 blk_finish_plug+0x40/0x60 ext4_writepages+0x696/0x1320 do_writepages+0x1c/0x80 __filemap_fdatawrite_range+0xd7/0x120 sync_file_range+0xac/0xf0 ioc->bfqd: bfq_exit_icq+0xa3/0xe0 -> bfqd->lock grabbed put_io_context_active+0x78/0xb0 -> ioc->lock grabbed exit_io_context+0x48/0x50 do_exit+0x7e9/0xdd0 do_group_exit+0x54/0xc0 To avoid this inversion we change blk_mq_sched_try_insert_merge() to not free the merged request but rather leave that upto the caller similarly to blk_mq_sched_try_merge(). And in bfq_insert_requests() we make sure to free all the merged requests after dropping bfqd->lock. Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Reviewed-by: Ming Lei Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210623093634.27879-3-jack@suse.cz Signed-off-by: Jens Axboe Conflict: 1. mainline include/linux/elevator.h file change; 2. mainline block/mq-deadline-main.c file change, now change to block/mq-deadline.c; Signed-off-by: zhangwensheng Reviewed-by: qiulaibin Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- block/bfq-iosched.c | 6 ++++-- block/blk-merge.c | 19 ++++++++----------- block/blk-mq-sched.c | 5 +++-- block/blk-mq-sched.h | 3 ++- block/blk-mq.h | 11 +++++++++++ block/blk.h | 2 +- block/elevator.c | 11 ++++++++--- block/mq-deadline.c | 5 ++++- include/linux/elevator.h | 3 ++- 9 files changed, 43 insertions(+), 22 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 27e01b4cd528..aa1a808fa072 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2235,9 +2235,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irq(&bfqd->lock); if (free) blk_mq_free_request(free); - spin_unlock_irq(&bfqd->lock); return ret; } @@ -5508,14 +5508,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; + LIST_HEAD(free); #ifdef CONFIG_BFQ_GROUP_IOSCHED if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + blk_mq_free_requests(&free); return; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 26f4bcc10de9..6518e0ae2835 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -831,18 +831,15 @@ static struct request *attempt_front_merge(struct request_queue *q, return NULL; } -int blk_attempt_req_merge(struct 
request_queue *q, struct request *rq, - struct request *next) +/* + * Try to merge 'next' into 'rq'. Return true if the merge happened, false + * otherwise. The caller is responsible for freeing 'next' if the merge + * happened. + */ +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) { - struct request *free; - - free = attempt_merge(q, rq, next); - if (free) { - blk_put_request(free); - return 1; - } - - return 0; + return attempt_merge(q, rq, next); } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a3266541bd06..606bef13f1c2 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -381,9 +381,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, return ret; } -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { - return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 0476360f05f1..15f3d611db10 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -12,7 +12,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); diff --git a/block/blk-mq.h b/block/blk-mq.h index 6f87c0681443..7f3194657dff 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -283,6 +283,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, return NULL; } +/* Free all requests on the list */ +static inline void blk_mq_free_requests(struct list_head *list) +{ + while (!list_empty(list)) { + struct request *rq = list_entry_rq(list->next); + + list_del_init(&rq->queuelist); + blk_mq_free_request(rq); + } +} + /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. diff --git a/block/blk.h b/block/blk.h index ab99c522a31e..ca4de4985946 100644 --- a/block/blk.h +++ b/block/blk.h @@ -229,7 +229,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); diff --git a/block/elevator.c b/block/elevator.c index 27eb70ec277a..65dfc7559a36 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -353,9 +353,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. 
* - * Returns true if we merged, false otherwise + * Returns true if we merged, false otherwise. 'free' will contain all + * requests that need to be freed. */ -bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { struct request *__rq; bool ret; @@ -366,8 +368,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) /* * First try one-hit cache. */ - if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { + list_add(&rq->queuelist, free); return true; + } if (blk_queue_noxmerges(q)) return false; @@ -381,6 +385,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break; + list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index e4e90761eab3..43994cce1eb2 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -489,6 +489,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + LIST_HEAD(free); /* * This may be a requeue of a write request that has locked its * target zone. */ blk_req_zone_write_unlock(rq); - if (blk_mq_sched_try_insert_merge(q, rq)) + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + blk_mq_free_requests(&free); return; + } blk_mq_sched_request_inserted(rq); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index bc26b4e11f62..0bb7489e0cfb 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -117,7 +117,8 @@ extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); -extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, + struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -- Gitee From d3e8d841116fb67568ee253e400f0279730ce634 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Fri, 3 Dec 2021 18:15:21 +0800 Subject: [PATCH 133/134] ext4: stop IO for page without buffer_head maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- dio_bio_complete() will set a page dirty without considering whether the page still has valid buffer_heads attached. This triggers problems when ext4 tries to write the page back. For ext4, fix it by skipping writeback of pages without buffer_heads.
[1] https://lwn.net/Articles/774411/ : "DMA and get_user_pages()" [2] https://lwn.net/Articles/753027/ : "The Trouble with get_user_pages()" [3] https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc1d8e7cca2daa18 Signed-off-by: yangerkun Reviewed-by: zhangyi (F) Signed-off-by: Yang Yingliang Signed-off-by: Xie XiuQi Conflicts: fs/ext4/inode.c Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/ext4/inode.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dd137a1c648f..ef102cca42dc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1937,6 +1937,20 @@ static int __ext4_journalled_writepage(struct page *page, return ret; } +static void cancel_page_dirty_status(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + cancel_dirty_page(page); + xa_lock_irqsave(&mapping->i_pages, flags); + __xa_clear_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); + __xa_clear_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_TOWRITE); + xa_unlock_irqrestore(&mapping->i_pages, flags); +} + /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't @@ -1995,6 +2009,12 @@ static int ext4_writepage(struct page *page, return -EIO; } + if (WARN_ON(!page_has_buffers(page))) { + cancel_page_dirty_status(page); + unlock_page(page); + return 0; + } + trace_ext4_writepage(page); size = i_size_read(inode); if (page->index == size >> PAGE_SHIFT && @@ -2603,6 +2623,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) continue; } + if (WARN_ON(!page_has_buffers(page))) { + cancel_page_dirty_status(page); + unlock_page(page); + continue; + } + wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); -- Gitee From c84465a6d4e831ee170978f2814e7bc42c2f472a Mon Sep 17 00:00:00 2001 From: yangerkun Date: Fri, 26 Nov 2021 16:27:16 +0800 Subject: [PATCH 134/134] hugetlbfs: avoid overflow in hugetlbfs_fallocate maillist inclusion category: bugfix issue: #I4PYGY CVE: NA Signed-off-by: Yu Changchun --------------------------- luojiajun reported a problem[1] two years ago which seems to still exist in mainline. vfs_fallocate() can avoid an 'offset + len' overflow, but 'offset + len + hpage_size - 1' may overflow too and will lead to a wrong 'end'. luojiajun gave a solution which fixed the wrong 'end' but left the overflow itself in place. Fix it with DIV_ROUND_UP_ULL. [1] https://patchwork.kernel.org/project/linux-mm/patch/1554775226-67213-1-git-send-email-luojiajun3@huawei.com/ Signed-off-by: yangerkun Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai Signed-off-by: Yu Changchun --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5fc9ccab907c..c7dec890f9d9 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -655,7 +655,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as well as being converted to page offsets. */ start = offset >> hpage_shift; - end = (offset + len + hpage_size - 1) >> hpage_shift; + end = DIV_ROUND_UP_ULL(offset + len, hpage_size); inode_lock(inode); -- Gitee
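A worked example of the arithmetic (illustrative values, not part of the patch):

/*
 * Assume hpage_size == 2 MiB (hpage_shift == 21) and offset + len ==
 * LLONG_MAX, which vfs_fallocate() accepts since offset + len itself
 * does not overflow loff_t.
 *
 * Old: (offset + len + hpage_size - 1) >> hpage_shift adds 0x1fffff to
 *      LLONG_MAX in signed arithmetic: overflow, undefined behaviour,
 *      and in practice a negative sum and a bogus 'end'.
 *
 * New: DIV_ROUND_UP_ULL(offset + len, hpage_size) widens to unsigned
 *      long long before adding hpage_size - 1:
 *      0x7fffffffffffffff + 0x1fffff == 0x80000000001ffffe (no 64-bit
 *      wrap), and dividing by 0x200000 gives 2^42, the correct
 *      rounded-up huge-page index.
 */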