diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index fa6e8471bd2278f939b7084bd1b2655ef253df71..248db9b77ee246c2ac6346c38fc4ec2830a9be72 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -28,6 +28,7 @@ struct hlist_nulls_node {
 #define NULLS_MARKER(value) (1UL | (((long)value) << 1))
 #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
 	((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
+#define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)}

 #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index ccd5337671e434bc6b7be32bccd57671a1505aee..37517ed7489a4ee44eb7f0078d4012d5a1f4d97e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -5,12 +5,17 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER
+#include 
+#endif

 #define UID_GID_MAP_MAX_BASE_EXTENTS 5
 #define UID_GID_MAP_MAX_EXTENTS 340
@@ -112,12 +117,21 @@ struct user_namespace {
 } __randomize_layout;

 struct ucounts {
-	struct hlist_node node;
+	KABI_DEPRECATE(struct hlist_node, node)
 	struct user_namespace *ns;
 	kuid_t uid;
-	atomic_t count;
+	KABI_DEPRECATE(atomic_t, count)
 	atomic_long_t ucount[UCOUNT_COUNTS];
+#ifndef CONFIG_UCOUNTS_PERCPU_COUNTER
 	atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS];
+#else
+	KABI_DEPRECATE(atomic_long_t, rlimit[UCOUNT_RLIMIT_COUNTS])
+	KABI_EXTEND(struct percpu_counter rlimit[UCOUNT_RLIMIT_COUNTS])
+	KABI_EXTEND(atomic_long_t freed)
+#endif
+	KABI_EXTEND(struct hlist_nulls_node node)
+	KABI_EXTEND(struct rcu_head rcu)
+	KABI_EXTEND(rcuref_t count)
 };

 extern struct user_namespace init_user_ns;
@@ -128,18 +142,38 @@ void retire_userns_sysctls(struct user_namespace *ns);

 struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
 void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
 struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
-struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
 void put_ucounts(struct ucounts *ucounts);
+#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER
+void __init ucounts_init(void);
+#else
+static inline void __init ucounts_init(void) { }
+#endif
+
+static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts)
+{
+	if (rcuref_get(&ucounts->count))
+		return ucounts;
+	return NULL;
+}

 static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type)
 {
+#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER
+	return percpu_counter_sum(&ucounts->rlimit[type]);
+#else
 	return atomic_long_read(&ucounts->rlimit[type]);
+#endif
+}
+
+long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit);
+static inline long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+{
+	return inc_rlimit_ucounts_limit(ucounts, type, v, LONG_MAX);
 }

-long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
 bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
 long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
-			    bool override_rlimit);
+			    bool override_rlimit, long limit);
 void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);
diff --git a/init/Kconfig b/init/Kconfig
index 22d9ac8ca08fadf435599b48515d3485e5ac76d5..38d41dd127c0cad038951c8a4c4eb4d00ba322d2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1343,6 +1343,15 @@ config CGROUP_FILES

 endif # CGROUPS

+config UCOUNTS_PERCPU_COUNTER
+	bool "Ucount percpu counter for rlimit"
+	default n
+	help
+	  Provide per-CPU counters for the ucounts rlimit accounting, which
+	  can improve scalability. When this option is enabled, rlimit
+	  charging and uncharging are reimplemented with percpu_counter,
+	  avoiding atomic operations on the common parent ucounts.
+
 menuconfig NAMESPACES
 	bool "Namespaces support" if EXPERT
 	depends on MULTIUSER
diff --git a/init/main.c b/init/main.c
index 8fdfa69dba0fa9b357db0d4af16f2313d5658ecd..02a2c5d9be671b4aac158951a5551ad9524f34d8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1050,6 +1050,7 @@ void start_kernel(void)
 	efi_enter_virtual_mode();
 #endif
 	thread_stack_cache_init();
+	ucounts_init();
 	cred_init();
 	fork_init();
 	proc_caches_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ba8215ed663a43911907bc6d0511335cb5821433..842b31196c1277539a93a5dabd950876868644d5 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -374,7 +374,8 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		long msgqueue;

 		spin_lock(&mq_lock);
-		msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
+		msgqueue = inc_rlimit_ucounts_limit(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE,
+						    mq_bytes, rlimit(RLIMIT_MSGQUEUE));
 		if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
 			dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
 			spin_unlock(&mq_lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index 3ccbc61a1f09bb981027cb86b18eb6a1976d394b..50aa72d6ff4e1910df7281b5b2cbaaa0a0f9568f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -429,7 +429,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	rcu_read_lock();
 	ucounts = task_ucounts(t);
 	sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
-					    override_rlimit);
+					    override_rlimit, task_rlimit(t, RLIMIT_SIGPENDING));
 	rcu_read_unlock();
 	if (!sigpending)
 		return NULL;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 584b73807c445d640437e9df42c6c4c9fc8612bc..20145f12ee3a81d4d4f06cf1313183aba6f32c33 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -11,11 +11,14 @@
 struct ucounts init_ucounts = {
 	.ns    = &init_user_ns,
 	.uid   = GLOBAL_ROOT_UID,
-	.count = ATOMIC_INIT(1),
+	.count = RCUREF_INIT(1),
 };

 #define UCOUNTS_HASHTABLE_BITS 10
-static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
+#define UCOUNTS_HASHTABLE_ENTRIES (1 << UCOUNTS_HASHTABLE_BITS)
+static struct hlist_nulls_head ucounts_hashtable[UCOUNTS_HASHTABLE_ENTRIES] = {
+	[0 ... UCOUNTS_HASHTABLE_ENTRIES - 1] = HLIST_NULLS_HEAD_INIT(0)
+};
 static DEFINE_SPINLOCK(ucounts_lock);

 #define ucounts_hashfn(ns, uid)						\
@@ -24,7 +27,6 @@ static DEFINE_SPINLOCK(ucounts_lock);
 #define ucounts_hashentry(ns, uid)					\
 	(ucounts_hashtable + ucounts_hashfn(ns, uid))

-
 #ifdef CONFIG_SYSCTL
 static struct ctl_table_set *
 set_lookup(struct ctl_table_root *root)
@@ -132,90 +134,144 @@ void retire_userns_sysctls(struct user_namespace *ns)
 #endif
 }

-static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
+static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid,
+				    struct hlist_nulls_head *hashent)
 {
 	struct ucounts *ucounts;
+	struct hlist_nulls_node *pos;

-	hlist_for_each_entry(ucounts, hashent, node) {
-		if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns))
-			return ucounts;
+	guard(rcu)();
+	hlist_nulls_for_each_entry_rcu(ucounts, pos, hashent, node) {
+		if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) {
+			if (rcuref_get(&ucounts->count))
+				return ucounts;
+		}
 	}
 	return NULL;
 }

 static void hlist_add_ucounts(struct ucounts *ucounts)
 {
-	struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
+	struct hlist_nulls_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
+
 	spin_lock_irq(&ucounts_lock);
-	hlist_add_head(&ucounts->node, hashent);
+	hlist_nulls_add_head_rcu(&ucounts->node, hashent);
 	spin_unlock_irq(&ucounts_lock);
 }

-static inline bool get_ucounts_or_wrap(struct ucounts *ucounts)
+struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 {
-	/* Returns true on a successful get, false if the count wraps. */
-	return !atomic_add_negative(1, &ucounts->count);
+	struct hlist_nulls_head *hashent = ucounts_hashentry(ns, uid);
+	struct ucounts *ucounts, *new;
+
+	ucounts = find_ucounts(ns, uid, hashent);
+	if (ucounts)
+		return ucounts;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return NULL;
+#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER
+	if (percpu_counter_init_many(&new->rlimit[0], 0, GFP_KERNEL_ACCOUNT,
+				     UCOUNT_RLIMIT_COUNTS)) {
+		kfree(new);
+		return NULL;
+	}
+#endif
+	new->ns = ns;
+	new->uid = uid;
+	rcuref_init(&new->count, 1);
+
+	spin_lock_irq(&ucounts_lock);
+	ucounts = find_ucounts(ns, uid, hashent);
+	if (ucounts) {
+		spin_unlock_irq(&ucounts_lock);
+#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER
+		percpu_counter_destroy_many(&new->rlimit[0], UCOUNT_RLIMIT_COUNTS);
+#endif
+		kfree(new);
+		return ucounts;
+	}
+
+	hlist_nulls_add_head_rcu(&new->node, hashent);
+	get_user_ns(new->ns);
+	spin_unlock_irq(&ucounts_lock);
+	return new;
 }

-struct ucounts *get_ucounts(struct ucounts *ucounts)
+#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER
+/*
+ * Return whether all the rlimits are zero.
+ * For now, only UCOUNT_RLIMIT_SIGPENDING is considered;
+ * other rlimit types can be added later.
+ */
+static bool rlimits_are_zero(struct ucounts *ucounts)
 {
-	if (!get_ucounts_or_wrap(ucounts)) {
-		put_ucounts(ucounts);
-		ucounts = NULL;
+	int rtypes[] = { UCOUNT_RLIMIT_SIGPENDING };
+	int rtype;
+
+	for (int i = 0; i < sizeof(rtypes) / sizeof(int); ++i) {
+		rtype = rtypes[i];
+		if (get_rlimit_value(ucounts, rtype) > 0)
+			return false;
 	}
-	return ucounts;
+	return true;
 }

-struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
+/*
+ * Ucounts can be freed only when ucounts->count has been released
+ * and the rlimits are zero.
+ * The caller must hold rcu_read_lock().
+ */
+static bool ucounts_can_be_freed(struct ucounts *ucounts)
 {
-	struct hlist_head *hashent = ucounts_hashentry(ns, uid);
-	struct ucounts *ucounts, *new;
-	bool wrapped;
+	if (rcuref_read(&ucounts->count) > 0)
+		return false;
+	if (!rlimits_are_zero(ucounts))
+		return false;
+	/* Prevent double free */
+	return atomic_long_cmpxchg(&ucounts->freed, 0, 1) == 0;
+}

-	spin_lock_irq(&ucounts_lock);
-	ucounts = find_ucounts(ns, uid, hashent);
-	if (!ucounts) {
-		spin_unlock_irq(&ucounts_lock);
+static void free_ucounts(struct ucounts *ucounts)
+{
+	unsigned long flags;

-		new = kzalloc(sizeof(*new), GFP_KERNEL);
-		if (!new)
-			return NULL;
-
-		new->ns = ns;
-		new->uid = uid;
-		atomic_set(&new->count, 1);
-
-		spin_lock_irq(&ucounts_lock);
-		ucounts = find_ucounts(ns, uid, hashent);
-		if (ucounts) {
-			kfree(new);
-		} else {
-			hlist_add_head(&new->node, hashent);
-			get_user_ns(new->ns);
-			spin_unlock_irq(&ucounts_lock);
-			return new;
-		}
-	}
-	wrapped = !get_ucounts_or_wrap(ucounts);
-	spin_unlock_irq(&ucounts_lock);
-	if (wrapped) {
-		put_ucounts(ucounts);
-		return NULL;
+	spin_lock_irqsave(&ucounts_lock, flags);
+	hlist_nulls_del_rcu(&ucounts->node);
+	spin_unlock_irqrestore(&ucounts_lock, flags);
+	percpu_counter_destroy_many(&ucounts->rlimit[0], UCOUNT_RLIMIT_COUNTS);
+	put_user_ns(ucounts->ns);
+	kfree_rcu(ucounts, rcu);
+}
+
+void put_ucounts(struct ucounts *ucounts)
+{
+	rcu_read_lock();
+	if (rcuref_put(&ucounts->count) &&
+	    ucounts_can_be_freed(ucounts)) {
+		rcu_read_unlock();
+		free_ucounts(ucounts);
+		return;
 	}
-	return ucounts;
+	rcu_read_unlock();
 }

+#else
 void put_ucounts(struct ucounts *ucounts)
 {
 	unsigned long flags;

-	if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
-		hlist_del_init(&ucounts->node);
+	if (rcuref_put(&ucounts->count)) {
+		spin_lock_irqsave(&ucounts_lock, flags);
+		hlist_nulls_del_rcu(&ucounts->node);
 		spin_unlock_irqrestore(&ucounts_lock, flags);
+
 		put_user_ns(ucounts->ns);
-		kfree(ucounts);
+		kfree_rcu(ucounts, rcu);
 	}
 }
+#endif // CONFIG_UCOUNTS_PERCPU_COUNTER

 static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
 {
@@ -264,7 +320,107 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
 	put_ucounts(ucounts);
 }

-long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER
+/* Return 1 if the increment succeeds, otherwise return LONG_MAX. */
+long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type,
+			      long v, long limit)
+{
+	struct ucounts *iter;
+	long max = LONG_MAX;
+	bool over_limit = false;
+
+	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+		max = min(limit, max);
+		if (!percpu_counter_limited_add(&iter->rlimit[type], max, v))
+			over_limit = true;
+
+		max = get_userns_rlimit_max(iter->ns, type);
+	}
+
+	if (over_limit)
+		return LONG_MAX;
+	return 1;
+}
+
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+{
+	struct ucounts *iter;
+
+	for (iter = ucounts; iter; iter = iter->ns->ucounts)
+		percpu_counter_sub(&iter->rlimit[type], v);
+	return false;
+}
+
+/*
+ * inc_rlimit_get_ucounts() does not grab the refcount.
+ * dec_rlimit_put_ucounts() must be called every time the rlimit is decremented.
+ */
+static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
+				      struct ucounts *last, enum rlimit_type type)
+{
+	struct ucounts *iter, *next;
+
+	for (iter = ucounts; iter != last; iter = next) {
+		bool to_free;
+
+		rcu_read_lock();
+		percpu_counter_sub(&iter->rlimit[type], 1);
+		next = iter->ns->ucounts;
+		to_free = ucounts_can_be_freed(iter);
+		rcu_read_unlock();
+		/* If ucounts->count is zero and the rlimits are zero, free the ucounts */
+		if (to_free)
+			free_ucounts(iter);
+	}
+}
+
+void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
+{
+	do_dec_rlimit_put_ucounts(ucounts, NULL, type);
+}
+
+/*
+ * Although this function does not grab the refcount, the ucounts are
+ * guaranteed not to be freed as long as any rlimit still pins them.
+ * The caller must hold a reference to ucounts or be under rcu_read_lock().
+ *
+ * Return 1 if the increment succeeds, otherwise return 0.
+ */
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
+			    bool override_rlimit, long limit)
+{
+	struct ucounts *iter;
+	long max = LONG_MAX;
+	long in_limit = limit;
+
+	if (override_rlimit)
+		in_limit = LONG_MAX;
+
+	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+		/* Must not exceed the passed-in limit or the ns->rlimit_max */
+		max = min(in_limit, max);
+		if (!percpu_counter_limited_add(&iter->rlimit[type], max, 1))
+			goto dec_unwind;
+
+		if (!override_rlimit)
+			max = get_userns_rlimit_max(iter->ns, type);
+	}
+	return 1;
+dec_unwind:
+	do_dec_rlimit_put_ucounts(ucounts, iter, type);
+	return 0;
+}
+
+void __init ucounts_init(void)
+{
+	if (percpu_counter_init_many(&init_ucounts.rlimit[0], 0, GFP_KERNEL,
+				     UCOUNT_RLIMIT_COUNTS))
+		panic("Cannot create init_ucounts rlimit counters");
+}
+#else
+
+long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type,
+			      long v, long limit)
 {
 	struct ucounts *iter;
 	long max = LONG_MAX;
@@ -313,7 +469,7 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
 }

 long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
-			    bool override_rlimit)
+			    bool override_rlimit, long limit)
 {
 	/* Caller must hold a reference to ucounts */
 	struct ucounts *iter;
@@ -344,7 +500,7 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
 	do_dec_rlimit_put_ucounts(ucounts, iter, type);
 	return 0;
 }
-
+#endif
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
 {
 	struct ucounts *iter;
diff --git a/mm/mlock.c b/mm/mlock.c
index cd0997d89c7c54c8f820fcfc8084d7ae4b474d16..f9653d30d02567d2fb58050c53fba73b22a601e9 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -833,7 +833,7 @@ int user_shm_lock(size_t size, struct ucounts *ucounts)
 	if (lock_limit != RLIM_INFINITY)
 		lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
-	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+	memlock = inc_rlimit_ucounts_limit(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked, lock_limit);

 	if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
 		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
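
Note (illustrative sketch, not part of the patch): with the new API a call site passes its
rlimit ceiling down, so the CONFIG_UCOUNTS_PERCPU_COUNTER path can enforce it with
percpu_counter_limited_add() instead of reading back a summed value. The pattern below
mirrors the user_shm_lock() and mqueue_get_inode() changes above; charge_foo() and its use
of UCOUNT_RLIMIT_MEMLOCK are hypothetical placeholders, not code from this series.

	/* Hypothetical caller: charge 'amount' against 'limit' for this ucounts. */
	static int charge_foo(struct ucounts *ucounts, long amount, long limit)
	{
		long charged;

		/* Pass the ceiling so the percpu path can reject the add cheaply. */
		charged = inc_rlimit_ucounts_limit(ucounts, UCOUNT_RLIMIT_MEMLOCK,
						   amount, limit);
		/* LONG_MAX (percpu path) or a value above 'limit' (atomic path) means failure. */
		if (charged == LONG_MAX || charged > limit) {
			dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, amount);
			return -ENOMEM;
		}
		return 0;
	}

The uncharge side is unchanged for callers: dec_rlimit_ucounts() walks the same
user_namespace chain and subtracts the amount at each level.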