From d42b2c739afc5afced6a615fde9fc04f9db2d2da Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 27 May 2025 03:45:25 +0000 Subject: [PATCH 1/9] rcu: provide a static initializer for hlist_nulls_head mainline inclusion from mainline-v6.15-rc1 commit 8c6bbda879b62f16bb03321a84554b4f63415c55 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8c6bbda879b62f16bb03321a84554b4f63415c55 ---------------------------------------------------------------------- Patch series "ucount: Simplify refcounting with rcuref_t". I noticed that the atomic_dec_and_lock_irqsave() in put_ucounts() loops sometimes even during boot. Something like 2-3 iterations but still. This series replaces the refcounting with rcuref_t and adds a RCU lookup. This allows a lockless lookup in alloc_ucounts() if the entry is available and a cmpxchg()less put of the item. This patch (of 4): Provide a static initializer for hlist_nulls_head so that it can be used in statically defined data structures. Link: https://lkml.kernel.org/r/20250203150525.456525-1-bigeasy@linutronix.de Link: https://lkml.kernel.org/r/20250203150525.456525-2-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Steven Rostedt Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: "Paul E . 
McKenney" Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton Signed-off-by: Chen Ridong --- include/linux/list_nulls.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index fa6e8471bd22..248db9b77ee2 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -28,6 +28,7 @@ struct hlist_nulls_node { #define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) +#define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)} #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) -- Gitee From 22f1675374cd8130e377e388cafca380438d6f01 Mon Sep 17 00:00:00 2001 From: MengEn Sun Date: Tue, 27 May 2025 03:45:26 +0000 Subject: [PATCH 2/9] ucounts: move kfree() out of critical zone protected by ucounts_lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.14-rc1 commit f49b42d415a32faee6bc08923821f432f64a4e90 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f49b42d415a32faee6bc08923821f432f64a4e90 ---------------------------------------------------------------------- Although kfree is a non-sleep function, it is possible to enter a long chain of calls probabilistically, so it looks better to move kfree from alloc_ucounts() out of the critical zone of ucounts_lock. 
Link: https://lkml.kernel.org/r/1733458427-11794-1-git-send-email-mengensun@tencent.com Signed-off-by: MengEn Sun Reviewed-by: YueHong Wu Reviewed-by: Andrew Morton Cc: Andrei Vagin Cc: Joel Granados Cc: Thomas Weißschuh Signed-off-by: Andrew Morton Signed-off-by: Chen Ridong --- kernel/ucount.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/ucount.c b/kernel/ucount.c index 584b73807c44..2c929c6c4784 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -169,8 +169,8 @@ struct ucounts *get_ucounts(struct ucounts *ucounts) struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); - struct ucounts *ucounts, *new; bool wrapped; + struct ucounts *ucounts, *new = NULL; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -187,17 +187,17 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); - if (ucounts) { - kfree(new); - } else { + if (!ucounts) { hlist_add_head(&new->node, hashent); get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } } + wrapped = !get_ucounts_or_wrap(ucounts); spin_unlock_irq(&ucounts_lock); + kfree(new); if (wrapped) { put_ucounts(ucounts); return NULL; -- Gitee From 926ba709626ead3a7b34d774adaa9e7949b4a8bd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 27 May 2025 03:45:27 +0000 Subject: [PATCH 3/9] ucount: replace get_ucounts_or_wrap() with atomic_inc_not_zero() mainline inclusion from mainline-v6.15-rc1 commit 328152e6774d9d801ad1d90af557b9113647b379 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=328152e6774d9d801ad1d90af557b9113647b379 ---------------------------------------------------------------------- get_ucounts_or_wrap() increments the counter and if the counter is negative 
then it decrements it again in order to reset the previous increment. This statement can be replaced with atomic_inc_not_zero() to only increment the counter if it is not yet 0. This simplifies the get function because the put (if the get failed) can be removed. atomic_inc_not_zero() is implement as a cmpxchg() loop which can be repeated several times if another get/put is performed in parallel. This will be optimized later. Increment the reference counter only if not yet dropped to zero. Link: https://lkml.kernel.org/r/20250203150525.456525-3-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: Steven Rostedt Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton Signed-off-by: Chen Ridong --- kernel/ucount.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/kernel/ucount.c b/kernel/ucount.c index 2c929c6c4784..797e6479dd91 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -151,25 +151,16 @@ static void hlist_add_ucounts(struct ucounts *ucounts) spin_unlock_irq(&ucounts_lock); } -static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) -{ - /* Returns true on a successful get, false if the count wraps. 
*/ - return !atomic_add_negative(1, &ucounts->count); -} - struct ucounts *get_ucounts(struct ucounts *ucounts) { - if (!get_ucounts_or_wrap(ucounts)) { - put_ucounts(ucounts); - ucounts = NULL; - } - return ucounts; + if (atomic_inc_not_zero(&ucounts->count)) + return ucounts; + return NULL; } struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); - bool wrapped; struct ucounts *ucounts, *new = NULL; spin_lock_irq(&ucounts_lock); @@ -194,14 +185,11 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) return new; } } - - wrapped = !get_ucounts_or_wrap(ucounts); + if (!atomic_inc_not_zero(&ucounts->count)) + ucounts = NULL; spin_unlock_irq(&ucounts_lock); kfree(new); - if (wrapped) { - put_ucounts(ucounts); - return NULL; - } + return ucounts; } -- Gitee From e5c90e62525a05764b59509ad4d6c88bc8b6684e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 27 May 2025 03:45:28 +0000 Subject: [PATCH 4/9] ucount: use RCU for ucounts lookups mainline inclusion from mainline-v6.15-rc1 commit 5f01a22c5b231dd590f61a2591b3090665733bcb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5f01a22c5b231dd590f61a2591b3090665733bcb ---------------------------------------------------------------------- The ucounts element is looked up under ucounts_lock. This can be optimized by using RCU for a lockless lookup and return and element if the reference can be obtained. Replace hlist_head with hlist_nulls_head which is RCU compatible. Let find_ucounts() search for the required item within a RCU section and return the item if a reference could be obtained. This means alloc_ucounts() will always return an element (unless the memory allocation failed). Let put_ucounts() RCU free the element if the reference counter dropped to zero. 
Link: https://lkml.kernel.org/r/20250203150525.456525-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: Steven Rostedt Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton Signed-off-by: Chen Ridong --- include/linux/user_namespace.h | 4 +- kernel/ucount.c | 75 ++++++++++++++++++---------------- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index ccd5337671e4..6b69ec3d9e66 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -112,9 +113,10 @@ struct user_namespace { } __randomize_layout; struct ucounts { - struct hlist_node node; + struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; + struct rcu_head rcu; atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; diff --git a/kernel/ucount.c b/kernel/ucount.c index 797e6479dd91..5677eb6e57c9 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -15,7 +15,10 @@ struct ucounts init_ucounts = { }; #define UCOUNTS_HASHTABLE_BITS 10 -static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; +#define UCOUNTS_HASHTABLE_ENTRIES (1 << UCOUNTS_HASHTABLE_BITS) +static struct hlist_nulls_head ucounts_hashtable[UCOUNTS_HASHTABLE_ENTRIES] = { + [0 ... 
UCOUNTS_HASHTABLE_ENTRIES - 1] = HLIST_NULLS_HEAD_INIT(0) +}; static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashfn(ns, uid) \ @@ -24,7 +27,6 @@ static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashentry(ns, uid) \ (ucounts_hashtable + ucounts_hashfn(ns, uid)) - #ifdef CONFIG_SYSCTL static struct ctl_table_set * set_lookup(struct ctl_table_root *root) @@ -132,22 +134,28 @@ void retire_userns_sysctls(struct user_namespace *ns) #endif } -static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) +static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, + struct hlist_nulls_head *hashent) { struct ucounts *ucounts; + struct hlist_nulls_node *pos; - hlist_for_each_entry(ucounts, hashent, node) { - if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) - return ucounts; + guard(rcu)(); + hlist_nulls_for_each_entry_rcu(ucounts, pos, hashent, node) { + if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) { + if (atomic_inc_not_zero(&ucounts->count)) + return ucounts; + } } return NULL; } static void hlist_add_ucounts(struct ucounts *ucounts) { - struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + struct hlist_nulls_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); - hlist_add_head(&ucounts->node, hashent); + hlist_nulls_add_head_rcu(&ucounts->node, hashent); spin_unlock_irq(&ucounts_lock); } @@ -160,37 +168,33 @@ struct ucounts *get_ucounts(struct ucounts *ucounts) struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { - struct hlist_head *hashent = ucounts_hashentry(ns, uid); - struct ucounts *ucounts, *new = NULL; + struct hlist_nulls_head *hashent = ucounts_hashentry(ns, uid); + struct ucounts *ucounts, *new; + + ucounts = find_ucounts(ns, uid, hashent); + if (ucounts) + return ucounts; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->ns = ns; + new->uid = uid; + 
atomic_set(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { + if (ucounts) { spin_unlock_irq(&ucounts_lock); - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return NULL; - - new->ns = ns; - new->uid = uid; - atomic_set(&new->count, 1); - - spin_lock_irq(&ucounts_lock); - ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { - hlist_add_head(&new->node, hashent); - get_user_ns(new->ns); - spin_unlock_irq(&ucounts_lock); - return new; - } + kfree(new); + return ucounts; } - if (!atomic_inc_not_zero(&ucounts->count)) - ucounts = NULL; - spin_unlock_irq(&ucounts_lock); - kfree(new); - return ucounts; + hlist_nulls_add_head_rcu(&new->node, hashent); + get_user_ns(new->ns); + spin_unlock_irq(&ucounts_lock); + return new; } void put_ucounts(struct ucounts *ucounts) @@ -198,10 +202,11 @@ void put_ucounts(struct ucounts *ucounts) unsigned long flags; if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { - hlist_del_init(&ucounts->node); + hlist_nulls_del_rcu(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); + put_user_ns(ucounts->ns); - kfree(ucounts); + kfree_rcu(ucounts, rcu); } } -- Gitee From e91e17c5f070a454a5a54e11efbad204a8f9d113 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 27 May 2025 03:45:29 +0000 Subject: [PATCH 5/9] ucount: use rcuref_t for reference counting mainline inclusion from mainline-v6.15-rc1 commit b4dc0bee2a749083028afba346910e198653f42a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b4dc0bee2a749083028afba346910e198653f42a ---------------------------------------------------------------------- Use rcuref_t for reference counting. This eliminates the cmpxchg loop in the get and put path. 
This also eliminates the need to acquire the lock in the put path because once the final user returns the reference, it can no longer be obtained anymore. Use rcuref_t for reference counting. Link: https://lkml.kernel.org/r/20250203150525.456525-5-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: Steven Rostedt Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton Signed-off-by: Chen Ridong --- include/linux/user_namespace.h | 11 +++++++++-- kernel/ucount.c | 16 +++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 6b69ec3d9e66..c3b4de67471c 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -117,7 +118,7 @@ struct ucounts { struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; - atomic_t count; + rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; @@ -130,9 +131,15 @@ void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); -struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); +static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) +{ + if (rcuref_get(&ucounts->count)) + return ucounts; + return NULL; +} + static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); diff --git a/kernel/ucount.c b/kernel/ucount.c index 
5677eb6e57c9..fd2ccffe0839 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -11,7 +11,7 @@ struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, - .count = ATOMIC_INIT(1), + .count = RCUREF_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 @@ -143,7 +143,7 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, guard(rcu)(); hlist_nulls_for_each_entry_rcu(ucounts, pos, hashent, node) { if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) { - if (atomic_inc_not_zero(&ucounts->count)) + if (rcuref_get(&ucounts->count)) return ucounts; } } @@ -159,13 +159,6 @@ static void hlist_add_ucounts(struct ucounts *ucounts) spin_unlock_irq(&ucounts_lock); } -struct ucounts *get_ucounts(struct ucounts *ucounts) -{ - if (atomic_inc_not_zero(&ucounts->count)) - return ucounts; - return NULL; -} - struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_nulls_head *hashent = ucounts_hashentry(ns, uid); @@ -181,7 +174,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) new->ns = ns; new->uid = uid; - atomic_set(&new->count, 1); + rcuref_init(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -201,7 +194,8 @@ void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { + if (rcuref_put(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); hlist_nulls_del_rcu(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); -- Gitee From 89660744cccea4774e246ac40a1ffc4d6019c841 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 27 May 2025 03:45:30 +0000 Subject: [PATCH 6/9] ucounts: add CONFIG_UCOUNTS_PERCPU_COUNTER hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 -------------------------------- Add a config for ucounts, which provides a percpu_counter to achieve better scalability. 
Signed-off-by: Chen Ridong --- init/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 22d9ac8ca08f..38d41dd127c0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1343,6 +1343,15 @@ config CGROUP_FILES endif # CGROUPS +config UCOUNTS_PERCPU_COUNTER + bool "Ucount percpu counter for rlimit" + default n + help + Provide percpu_counter for ucounts, which can improve scalability + for rlimit counting. Enabling this config will re-implement rlimit + charging or uncharging using percpu_counter, avoiding atomic + operations on the common parents. + menuconfig NAMESPACES bool "Namespaces support" if EXPERT depends on MULTIUSER -- Gitee From 549ed17a8fdd4ff783bbc5ea9d1c2c2a6d89b53f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 27 May 2025 03:45:31 +0000 Subject: [PATCH 7/9] ucounts: add input for inc_rlimit_[get_]ucounts_limit hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 ---------------------------------------- Add input parameter 'limit' for inc_rlimit_[get_]ucounts[_limit] function. The 'limit' will be used for the rlimit percpu_counter implementation. No functional changes now. 
Signed-off-by: Chen Ridong --- include/linux/user_namespace.h | 9 +++++++-- ipc/mqueue.c | 3 ++- kernel/signal.c | 2 +- kernel/ucount.c | 5 +++-- mm/mlock.c | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index c3b4de67471c..8ec4a694d1f2 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -145,10 +145,15 @@ static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type ty return atomic_long_read(&ucounts->rlimit[type]); } -long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); +long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit); +static inline long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) +{ + return inc_rlimit_ucounts_limit(ucounts, type, v, LONG_MAX); +} + bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, - bool override_rlimit); + bool override_rlimit, long limit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index ba8215ed663a..842b31196c12 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -374,7 +374,8 @@ static struct inode *mqueue_get_inode(struct super_block *sb, long msgqueue; spin_lock(&mq_lock); - msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); + msgqueue = inc_rlimit_ucounts_limit(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, + mq_bytes, rlimit(RLIMIT_MSGQUEUE)); if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) { dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); spin_unlock(&mq_lock); diff --git a/kernel/signal.c b/kernel/signal.c index 3ccbc61a1f09..50aa72d6ff4e 100644 --- a/kernel/signal.c 
+++ b/kernel/signal.c @@ -429,7 +429,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, rcu_read_lock(); ucounts = task_ucounts(t); sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, - override_rlimit); + override_rlimit, task_rlimit(t, RLIMIT_SIGPENDING)); rcu_read_unlock(); if (!sigpending) return NULL; diff --git a/kernel/ucount.c b/kernel/ucount.c index fd2ccffe0839..778279318e1d 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -251,7 +251,8 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } -long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) +long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, + long v, long limit) { struct ucounts *iter; long max = LONG_MAX; @@ -300,7 +301,7 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) } long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, - bool override_rlimit) + bool override_rlimit, long limit) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; diff --git a/mm/mlock.c b/mm/mlock.c index cd0997d89c7c..f9653d30d025 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -833,7 +833,7 @@ int user_shm_lock(size_t size, struct ucounts *ucounts) if (lock_limit != RLIM_INFINITY) lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); - memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + memlock = inc_rlimit_ucounts_limit(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked, lock_limit); if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); -- Gitee From a078c750e1868e82557b837738f1fb90ee014785 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 27 May 2025 03:45:32 +0000 Subject: [PATCH 8/9] ucounts: reinplement rlimit with percpu_counter hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 
---------------------------------------- The will-it-scale test case signal1 [1] has been observed. and the test results reveal that the signal sending system call lacks linearity. To further investigate this issue, we initiated a series of tests by launching varying numbers of dockers and closely monitored the throughput of each individual docker. The detailed test outcomes are presented as follows: | Dockers |1 |4 |8 |16 |32 |64 | | Throughput |380068 |353204 |308948 |306453 |180659 |129152 | The data clearly demonstrates a discernible trend: as the quantity of dockers increases, the throughput per container progressively declines. In-depth analysis has identified the root cause of this performance degradation. The ucounts module conducts statistics on rlimit, which involves a significant number of atomic operations. These atomic operations, when acting on the same variable, trigger a substantial number of cache misses or remote accesses, ultimately resulting in a drop in performance. To address the above issues, this patch converts the atomic rlimit to a percpu_counter. Summing up the percpu counters is expensive. To overcome this, this patch modifies the conditions for freeing ucounts. Instead of complex checks regarding whether a pending signal is the first or the last one, the ucounts can now be freed only when both the refcount and the rlimits are zero. 
After the optimization, the performance data is shown below, demonstrating that the throughput no longer declines as the number of Docker containers increases: | Dockers |1 |4 |8 |16 |32 |64 | | Throughput |374737 |376377 |374814 |379284 |374950 |377509 | [1] https://github.com/antonblanchard/will-it-scale/blob/master/tests/ Signed-off-by: Chen Ridong --- include/linux/user_namespace.h | 17 ++++ init/main.c | 1 + kernel/ucount.c | 172 ++++++++++++++++++++++++++++++++- 3 files changed, 188 insertions(+), 2 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 8ec4a694d1f2..908d1bba7db1 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -13,6 +13,9 @@ #include #include #include +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +#include +#endif #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 @@ -120,7 +123,12 @@ struct ucounts { struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; +#ifndef CONFIG_UCOUNTS_PERCPU_COUNTER atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; +#else + struct percpu_counter rlimit[UCOUNT_RLIMIT_COUNTS]; + atomic_long_t freed; +#endif }; extern struct user_namespace init_user_ns; @@ -132,6 +140,11 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_ty void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +void __init ucounts_init(void); +#else +static inline void __init ucounts_init(void) { } +#endif static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { @@ -142,7 +155,11 @@ static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER + return 
percpu_counter_sum(&ucounts->rlimit[type]); +#else return atomic_long_read(&ucounts->rlimit[type]); +#endif } long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit); diff --git a/init/main.c b/init/main.c index 8fdfa69dba0f..02a2c5d9be67 100644 --- a/init/main.c +++ b/init/main.c @@ -1050,6 +1050,7 @@ void start_kernel(void) efi_enter_virtual_mode(); #endif thread_stack_cache_init(); + ucounts_init(); cred_init(); fork_init(); proc_caches_init(); diff --git a/kernel/ucount.c b/kernel/ucount.c index 778279318e1d..20145f12ee3a 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -171,7 +171,13 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; - +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER + if (percpu_counter_init_many(&new->rlimit[0], 0, GFP_KERNEL_ACCOUNT, + UCOUNT_RLIMIT_COUNTS)) { + kfree(new); + return NULL; + } +#endif new->ns = ns; new->uid = uid; rcuref_init(&new->count, 1); @@ -180,6 +186,9 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) ucounts = find_ucounts(ns, uid, hashent); if (ucounts) { spin_unlock_irq(&ucounts_lock); +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER + percpu_counter_destroy_many(&new->rlimit[0], UCOUNT_RLIMIT_COUNTS); +#endif kfree(new); return ucounts; } @@ -190,6 +199,65 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) return new; } +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +/* + * Whether all the rlimits are zero. + * For now, only UCOUNT_RLIMIT_SIGPENDING is considered. + * Other rlimit can be added. 
+ */ +static bool rlimits_are_zero(struct ucounts *ucounts) +{ + int rtypes[] = { UCOUNT_RLIMIT_SIGPENDING }; + int rtype; + + for (int i = 0; i < sizeof(rtypes) / sizeof(int); ++i) { + rtype = rtypes[i]; + if (get_rlimit_value(ucounts, rtype) > 0) + return false; + } + return true; +} + +/* + * Ucounts can be freed only when the ucount->count is released + * and the rlimits are zero. + * The caller should hold rcu_read_lock(); + */ +static bool ucounts_can_be_freed(struct ucounts *ucounts) +{ + if (rcuref_read(&ucounts->count) > 0) + return false; + if (!rlimits_are_zero(ucounts)) + return false; + /* Prevent double free */ + return atomic_long_cmpxchg(&ucounts->freed, 0, 1) == 0; +} + +static void free_ucounts(struct ucounts *ucounts) +{ + unsigned long flags; + + spin_lock_irqsave(&ucounts_lock, flags); + hlist_nulls_del_rcu(&ucounts->node); + spin_unlock_irqrestore(&ucounts_lock, flags); + percpu_counter_destroy_many(&ucounts->rlimit[0], UCOUNT_RLIMIT_COUNTS); + put_user_ns(ucounts->ns); + kfree_rcu(ucounts, rcu); +} + +void put_ucounts(struct ucounts *ucounts) +{ + rcu_read_lock(); + if (rcuref_put(&ucounts->count) && + ucounts_can_be_freed(ucounts)) { + rcu_read_unlock(); + free_ucounts(ucounts); + return; + } + rcu_read_unlock(); +} +#else + void put_ucounts(struct ucounts *ucounts) { unsigned long flags; @@ -203,6 +271,7 @@ void put_ucounts(struct ucounts *ucounts) kfree_rcu(ucounts, rcu); } } +#endif // CONFIG_UCOUNTS_PERCPU_COUNTER static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { @@ -251,6 +320,105 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +/* Return 1 if increments successful, otherwise return LONG_MAX. 
*/ +long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, + long v, long limit) +{ + struct ucounts *iter; + long max = LONG_MAX; + bool over_limit = false; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + max = min(limit, max); + if (!percpu_counter_limited_add(&iter->rlimit[type], max, v)) + over_limit = true; + + max = get_userns_rlimit_max(iter->ns, type); + } + + if (over_limit) + return LONG_MAX; + return 1; +} + +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) +{ + struct ucounts *iter; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) + percpu_counter_sub(&iter->rlimit[type], v); + return false; +} + +/* + * The inc_rlimit_get_ucounts does not grab the refcount. + * The rlimit_release should be called every time the rlimit is decremented. + */ +static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, + struct ucounts *last, enum rlimit_type type) +{ + struct ucounts *iter, *next; + + for (iter = ucounts; iter != last; iter = next) { + bool to_free; + + rcu_read_lock(); + percpu_counter_sub(&iter->rlimit[type], 1); + next = iter->ns->ucounts; + to_free = ucounts_can_be_freed(iter); + rcu_read_unlock(); + /* If ucounts->count is zero and the rlimits are zero, free ucounts */ + if (to_free) + free_ucounts(iter); + } +} + +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) +{ + do_dec_rlimit_put_ucounts(ucounts, NULL, type); +} + +/* + * Though this function does not grab the refcount, it is promised that the + * ucounts will not be freed as long as there are any rlimit pins to it. + * Caller must hold a reference to ucounts or under rcu_read_lock(). + * + * Return 1 if the increment is successful, otherwise return 0. 
+ */ +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, + bool override_rlimit, long limit) +{ + struct ucounts *iter; + long max = LONG_MAX; + long in_limit = limit; + + if (override_rlimit) + in_limit = LONG_MAX; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + /* Cannot exceed the limit (passed in) or the ns->rlimit_max */ + max = min(in_limit, max); + if (!percpu_counter_limited_add(&iter->rlimit[type], max, 1)) + goto dec_unwind; + + if (!override_rlimit) + max = get_userns_rlimit_max(iter->ns, type); + } + return 1; +dec_unwind: + do_dec_rlimit_put_ucounts(ucounts, iter, type); + return 0; +} + +void __init ucounts_init(void) +{ + if (percpu_counter_init_many(&init_ucounts.rlimit[0], 0, GFP_KERNEL, + UCOUNT_RLIMIT_COUNTS)) + panic("Cannot create init_ucounts rlimit counters"); +} +#else + long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit) { @@ -332,7 +500,7 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } - +#endif bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) { struct ucounts *iter; -- Gitee From 7ad01e02c1b81d913ad54c930f5dd62700f2a13b Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 27 May 2025 03:45:33 +0000 Subject: [PATCH 9/9] ucount: fix kabi for ucounts hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 ---------------------------------------------------------------------- Fix the kABI break caused by the previous patches. 
Fixes: cf455a5bcc22 ("ucount: use RCU for ucounts lookups") Fixes: 23679ab075f1 ("ucount: use rcuref_t for reference counting") Fixes: 6b0277f0afa1 ("ucounts: reinplement rlimit with percpu_counter") Signed-off-by: Chen Ridong --- include/linux/user_namespace.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 908d1bba7db1..37517ed7489a 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -117,18 +117,21 @@ struct user_namespace { } __randomize_layout; struct ucounts { - struct hlist_nulls_node node; + KABI_DEPRECATE(struct hlist_node, node) struct user_namespace *ns; kuid_t uid; - struct rcu_head rcu; - rcuref_t count; + KABI_DEPRECATE(atomic_t, count) atomic_long_t ucount[UCOUNT_COUNTS]; #ifndef CONFIG_UCOUNTS_PERCPU_COUNTER atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; #else - struct percpu_counter rlimit[UCOUNT_RLIMIT_COUNTS]; - atomic_long_t freed; + KABI_DEPRECATE(atomic_long_t, rlimit[UCOUNT_RLIMIT_COUNTS]) + KABI_EXTEND(struct percpu_counter rlimit[UCOUNT_RLIMIT_COUNTS]) + KABI_EXTEND(atomic_long_t freed) #endif + KABI_EXTEND(struct hlist_nulls_node node) + KABI_EXTEND(struct rcu_head rcu) + KABI_EXTEND(rcuref_t count) }; extern struct user_namespace init_user_ns; -- Gitee