From 5341435b06baddcf7b10aae3a64a329e4a470ddc Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 17 Jul 2024 15:00:23 +0200 Subject: [PATCH 1/3] refcount: Report UAF for refcount_sub_and_test(0) when counter==0 stable inclusion from stable-6.11 commit f91f7ac900e7342e0fd66093dfbf7cb8cb585a99 category: bugfix issue: #IAM056 CVE: NA Signed-off-by: wangxin --------------------------------------- When a reference counter is at zero and refcount_sub_and_test() is invoked to subtract zero, the function accepts this request without any warning and returns true. This behavior does not seem ideal because the counter being already at zero indicates a use-after-free. Furthermore, returning true by refcount_sub_and_test() in this case potentially results in a double-free done by its caller. Modify the underlying function __refcount_sub_and_test() to warn about this case as a use-after-free and have it return false to avoid the potential double-free. Signed-off-by: Petr Pavlu Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240717130023.5675-1-petr.pavlu@suse.com Signed-off-by: Kees Cook Signed-off-by: wangxin --- drivers/misc/lkdtm/refcount.c | 15 +++++++++++++++ include/linux/refcount.h | 4 ++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/misc/lkdtm/refcount.c b/drivers/misc/lkdtm/refcount.c index de7c5ab528d9..af384fd6935c 100644 --- a/drivers/misc/lkdtm/refcount.c +++ b/drivers/misc/lkdtm/refcount.c @@ -182,6 +182,21 @@ void lkdtm_REFCOUNT_SUB_AND_TEST_NEGATIVE(void) check_negative(&neg, 3); } +/* + * A refcount_sub_and_test() by zero when the counter is at zero should act like + * refcount_sub_and_test() above when going negative. + */ +static void lkdtm_REFCOUNT_SUB_AND_TEST_ZERO(void) +{ + refcount_t neg = REFCOUNT_INIT(0); + + pr_info("attempting bad refcount_sub_and_test() at zero\n"); + if (refcount_sub_and_test(0, &neg)) + pr_warn("Weird: refcount_sub_and_test() reported zero\n"); + + check_negative(&neg, 0); +} + static void check_from_zero(refcount_t *ref) { switch (refcount_read(ref)) { diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 497990c69b0b..3f17de9f7de4 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -274,12 +274,12 @@ static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, in if (oldp) *oldp = old; - if (old == i) { + if (old > 0 && old == i) { smp_acquire__after_ctrl_dep(); return true; } - if (unlikely(old < 0 || old - i < 0)) + if (unlikely(old <= 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; -- Gitee From e54a1bfd4d434dd6b89c57077edb8d6389a3db72 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 2 Aug 2024 16:58:22 -0700 Subject: [PATCH 2/3] memcg: protect concurrent access to mem_cgroup_idr stable inclusion from stable-6.11 commit 9972605a238339b85bd16b084eed5f18414d22db category: bugfix issue: NA CVE: NA Signed-off-by: wangxin --------------------------------------- Commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") decoupled the memcg IDs from the CSS ID space to fix the cgroup creation failures. It introduced IDR to maintain the memcg ID space. The IDR depends on external synchronization mechanisms for modifications. For the mem_cgroup_idr, the idr_alloc() and idr_replace() happen within css callback and thus are protected through cgroup_mutex from concurrent modifications. However idr_remove() for mem_cgroup_idr was not protected against concurrency and can be run concurrently for different memcgs when they hit their refcnt to zero. Fix that. We have been seeing list_lru based kernel crashes at a low frequency in our fleet for a long time. These crashes were in different part of list_lru code including list_lru_add(), list_lru_del() and reparenting code. Upon further inspection, it looked like for a given object (dentry and inode), the super_block's list_lru didn't have list_lru_one for the memcg of that object. The initial suspicions were either the object is not allocated through kmem_cache_alloc_lru() or somehow memcg_list_lru_alloc() failed to allocate list_lru_one() for a memcg but returned success. No evidence were found for these cases. Looking more deeply, we started seeing situations where valid memcg's id is not present in mem_cgroup_idr and in some cases multiple valid memcgs have same id and mem_cgroup_idr is pointing to one of them. So, the most reasonable explanation is that these situations can happen due to race between multiple idr_remove() calls or race between idr_alloc()/idr_replace() and idr_remove(). These races are causing multiple memcgs to acquire the same ID and then offlining of one of them would cleanup list_lrus on the system for all of them. Later access from other memcgs to the list_lru cause crashes due to missing list_lru_one. Link: https://lkml.kernel.org/r/20240802235822.1830976-1-shakeel.butt@linux.dev Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") Signed-off-by: Shakeel Butt Acked-by: Muchun Song Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: wangxin --- mm/memcontrol.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ead591002b0c..15ffaefba638 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5199,11 +5199,28 @@ static struct cftype mem_cgroup_legacy_files[] = { */ static DEFINE_IDR(mem_cgroup_idr); +static DEFINE_SPINLOCK(memcg_idr_lock); + +static int mem_cgroup_alloc_id(void) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock(&memcg_idr_lock); + ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX, + GFP_NOWAIT); + spin_unlock(&memcg_idr_lock); + idr_preload_end(); + return ret; +} static void mem_cgroup_id_remove(struct mem_cgroup *memcg) { if (memcg->id.id > 0) { + spin_lock(&memcg_idr_lock); idr_remove(&mem_cgroup_idr, memcg->id.id); + spin_unlock(&memcg_idr_lock); + memcg->id.id = 0; } } @@ -5338,9 +5355,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return ERR_PTR(error); - memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, - 1, MEM_CGROUP_ID_MAX, - GFP_KERNEL); + memcg->id.id = mem_cgroup_alloc_id(); if (memcg->id.id < 0) { error = memcg->id.id; goto fail; -- Gitee From d61be554a39f4a7c0e536579e543b6d872ea6785 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 9 Aug 2024 11:16:28 +0800 Subject: [PATCH 3/3] vfs: Don't evict inode under the inode lru traversing context stable inclusion from stable-6.11 commit 2a0629834cd82f05d424bbc193374f9a43d1f87d category: bugfix issue: NA CVE: NA Signed-off-by: wangxin --------------------------------------- The inode reclaiming process(See function prune_icache_sb) collects all reclaimable inodes and mark them with I_FREEING flag at first, at that time, other processes will be stuck if they try getting these inodes (See function find_inode_fast), then the reclaiming process destroy the inodes by function dispose_list(). Some filesystems(eg. ext4 with ea_inode feature, ubifs with xattr) may do inode lookup in the inode evicting callback function, if the inode lookup is operated under the inode lru traversing context, deadlock problems may happen. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Case 1: In function ext4_evict_inode(), the ea inode lookup could happen if ea_inode feature is enabled, the lookup process will be stuck under the evicting context like this: 1. File A has inode i_reg and an ea inode i_ea 2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea 3. Then, following three processes running like this: PA PB echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // i_reg is added into lru, lru->i_ea->i_reg prune_icache_sb list_lru_walk_one inode_lru_isolate i_ea->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(i_reg) spin_unlock(&i_reg->i_lock) spin_unlock(lru_lock) rm file A i_reg->nlink = 0 iput(i_reg) // i_reg->nlink is 0, do evict ext4_evict_inode ext4_xattr_delete_inode ext4_xattr_inode_dec_ref_all ext4_xattr_inode_iget ext4_iget(i_ea->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(i_ea) ----→ AA deadlock dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&i_ea->i_state) Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file deleting process holds BASEHD's wbuf->io_mutex while getting the xattr inode, which could race with inode reclaiming process(The reclaiming process could try locking BASEHD's wbuf->io_mutex in inode evicting function), then an ABBA deadlock problem would happen as following: 1. File A has inode ia and a xattr(with inode ixa), regular file B has inode ib and a xattr. 2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa 3. Then, following three processes running like this: PA PB PC echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // ib and ia are added into lru, lru->ixa->ib->ia prune_icache_sb list_lru_walk_one inode_lru_isolate ixa->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(ib) spin_unlock(&ib->i_lock) spin_unlock(lru_lock) rm file B ib->nlink = 0 rm file A iput(ia) ubifs_evict_inode(ia) ubifs_jnl_delete_inode(ia) ubifs_jnl_write_inode(ia) make_reservation(BASEHD) // Lock wbuf->io_mutex ubifs_iget(ixa->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(ixa) | iput(ib) // ib->nlink is 0, do evict | ubifs_evict_inode | ubifs_jnl_delete_inode(ib) ↓ ubifs_jnl_write_inode ABBA deadlock ←-----make_reservation(BASEHD) dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&ixa->i_state) Fix the possible deadlock by using new inode state flag I_LRU_ISOLATING to pin the inode in memory while inode_lru_isolate() reclaims its pages instead of using ordinary inode reference. This way inode deletion cannot be triggered from inode_lru_isolate() thus avoiding the deadlock. evict() is made to wait for I_LRU_ISOLATING to be cleared before proceeding with inode cleanup. Link: https://lore.kernel.org/all/37c29c42-7685-d1f0-067d-63582ffac405@huaweicloud.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022 Fixes: e50e5129f384 ("ext4: xattr-in-inode support") Fixes: 7959cf3a7506 ("ubifs: journal: Handle xattrs like files") Cc: stable@vger.kernel.org Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240809031628.1069873-1-chengzhihao@huaweicloud.com Reviewed-by: Jan Kara Suggested-by: Jan Kara Suggested-by: Mateusz Guzik Signed-off-by: Christian Brauner Signed-off-by: wangxin --- fs/inode.c | 39 +++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 5 +++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 06b23a65c642..88b14cd3e861 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -455,6 +455,39 @@ static void inode_lru_list_del(struct inode *inode) this_cpu_dec(nr_unused); } +static void inode_pin_lru_isolating(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); + inode->i_state |= I_LRU_ISOLATING; +} + +static void inode_unpin_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); + inode->i_state &= ~I_LRU_ISOLATING; + smp_mb(); + wake_up_bit(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); +} + +static void inode_wait_for_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_state & I_LRU_ISOLATING) { + DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); + __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_LRU_ISOLATING); + } + spin_unlock(&inode->i_lock); +} + /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add @@ -567,6 +600,8 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + inode_wait_for_lru_isolating(inode); + /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since @@ -766,7 +801,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, } if (inode_has_buffers(inode) || inode->i_data.nrpages) { - __iget(inode); + inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(lru_lock); if (remove_inode_buffers(inode)) { @@ -779,7 +814,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, if (current->reclaim_state) current->reclaim_state->reclaimed_slab += reap; } - iput(inode); + inode_unpin_lru_isolating(inode); spin_lock(lru_lock); return LRU_RETRY; } diff --git a/include/linux/fs.h b/include/linux/fs.h index a7cd19c3d3d0..784b1316beae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2211,6 +2211,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -2233,6 +2236,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_CREATING (1 << 15) #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) +#define __I_LRU_ISOLATING 19 +#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -- Gitee