diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index fa6e8471bd2278f939b7084bd1b2655ef253df71..248db9b77ee246c2ac6346c38fc4ec2830a9be72 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -28,6 +28,7 @@ struct hlist_nulls_node {
 #define NULLS_MARKER(value) (1UL | (((long)value) << 1))
 #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
 	((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
+#define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)}
 
 #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index ccd5337671e434bc6b7be32bccd57671a1505aee..ac928e6e704272eea63d0bde02471b816d02016f 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -5,12 +5,15 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #define UID_GID_MAP_MAX_BASE_EXTENTS 5
 #define UID_GID_MAP_MAX_EXTENTS 340
@@ -112,12 +115,26 @@ struct user_namespace {
 } __randomize_layout;
 
 struct ucounts {
+#ifdef __GENKSYMS__
 	struct hlist_node node;
+#else
+	struct hlist_nulls_node node;
+#endif
 	struct user_namespace *ns;
 	kuid_t uid;
+#ifdef __GENKSYMS__
 	atomic_t count;
+#else
+	struct rcu_head rcu;
+	rcuref_t count;
+	atomic_long_t freed;
+#endif
 	atomic_long_t ucount[UCOUNT_COUNTS];
+#ifdef __GENKSYMS__
 	atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS];
+#else
+	struct percpu_counter rlimit[UCOUNT_RLIMIT_COUNTS];
+#endif
 };
 
 extern struct user_namespace init_user_ns;
@@ -128,18 +145,29 @@ void retire_userns_sysctls(struct user_namespace *ns);
 struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
 void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
 struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
-struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
 void put_ucounts(struct ucounts *ucounts);
+void __init ucounts_init(void);
+
+static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts)
+{
+	if (rcuref_get(&ucounts->count))
+		return ucounts;
+	return NULL;
+}
 
 static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type)
 {
-	return atomic_long_read(&ucounts->rlimit[type]);
+	return percpu_counter_sum(&ucounts->rlimit[type]);
 }
 
-long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
-bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
+bool inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit);
+static inline bool inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+{
+	return inc_rlimit_ucounts_limit(ucounts, type, v, LONG_MAX);
+}
+void dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
 long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
-			    bool override_rlimit);
+			    bool override_rlimit, long limit);
 void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);
diff --git a/init/main.c b/init/main.c
index 8fdfa69dba0fa9b357db0d4af16f2313d5658ecd..02a2c5d9be671b4aac158951a5551ad9524f34d8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1050,6 +1050,7 @@ void start_kernel(void)
 	efi_enter_virtual_mode();
 #endif
 	thread_stack_cache_init();
+	ucounts_init();
 	cred_init();
 	fork_init();
 	proc_caches_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ba8215ed663a43911907bc6d0511335cb5821433..a910c93bea08a13ee05b05bde943eb662245aa4c 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -371,11 +371,9 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		mq_bytes += mq_treesize;
 		info->ucounts = get_ucounts(current_ucounts());
 		if (info->ucounts) {
-			long msgqueue;
-
 			spin_lock(&mq_lock);
-			msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
-			if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
+			if (!inc_rlimit_ucounts_limit(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE,
+						      mq_bytes, rlimit(RLIMIT_MSGQUEUE))) {
 				dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
 				spin_unlock(&mq_lock);
 				put_ucounts(info->ucounts);
diff --git a/kernel/signal.c b/kernel/signal.c
index 3ccbc61a1f09bb981027cb86b18eb6a1976d394b..a7b3265f1db7b0f2574c951717a52ec3294b9f73 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -429,17 +429,14 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	rcu_read_lock();
 	ucounts = task_ucounts(t);
 	sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
-					    override_rlimit);
+					    override_rlimit, task_rlimit(t, RLIMIT_SIGPENDING));
 	rcu_read_unlock();
-	if (!sigpending)
-		return NULL;
-
-	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
-		q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
-	} else {
+	if (!sigpending) {
 		print_dropped_signal(sig);
+		return NULL;
 	}
 
+	q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
 	if (unlikely(q == NULL)) {
 		dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
 	} else {
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 584b73807c445d640437e9df42c6c4c9fc8612bc..bdaa6261b7cae3f29d927dc8720b1adcef4f82f3 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -11,11 +11,14 @@
 struct ucounts init_ucounts = {
 	.ns    = &init_user_ns,
 	.uid   = GLOBAL_ROOT_UID,
-	.count = ATOMIC_INIT(1),
+	.count = RCUREF_INIT(1),
 };
 
 #define UCOUNTS_HASHTABLE_BITS 10
-static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
+#define UCOUNTS_HASHTABLE_ENTRIES (1 << UCOUNTS_HASHTABLE_BITS)
+static struct hlist_nulls_head ucounts_hashtable[UCOUNTS_HASHTABLE_ENTRIES] = {
+	[0 ... UCOUNTS_HASHTABLE_ENTRIES - 1] = HLIST_NULLS_HEAD_INIT(0)
+};
 static DEFINE_SPINLOCK(ucounts_lock);
 
 #define ucounts_hashfn(ns, uid)						\
@@ -24,7 +27,6 @@ static DEFINE_SPINLOCK(ucounts_lock);
 #define ucounts_hashentry(ns, uid)					\
 	(ucounts_hashtable + ucounts_hashfn(ns, uid))
 
-
 #ifdef CONFIG_SYSCTL
 static struct ctl_table_set *
 set_lookup(struct ctl_table_root *root)
@@ -132,89 +134,123 @@ void retire_userns_sysctls(struct user_namespace *ns)
 #endif
 }
 
-static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
+static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid,
+				    struct hlist_nulls_head *hashent)
 {
 	struct ucounts *ucounts;
+	struct hlist_nulls_node *pos;
 
-	hlist_for_each_entry(ucounts, hashent, node) {
-		if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns))
-			return ucounts;
+	guard(rcu)();
+	hlist_nulls_for_each_entry_rcu(ucounts, pos, hashent, node) {
+		if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) {
+			if (rcuref_get(&ucounts->count))
+				return ucounts;
+		}
 	}
 	return NULL;
 }
 
 static void hlist_add_ucounts(struct ucounts *ucounts)
 {
-	struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
+	struct hlist_nulls_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
+
 	spin_lock_irq(&ucounts_lock);
-	hlist_add_head(&ucounts->node, hashent);
+	hlist_nulls_add_head_rcu(&ucounts->node, hashent);
 	spin_unlock_irq(&ucounts_lock);
 }
 
-static inline bool get_ucounts_or_wrap(struct ucounts *ucounts)
-{
-	/* Returns true on a successful get, false if the count wraps. */
-	return !atomic_add_negative(1, &ucounts->count);
-}
-
-struct ucounts *get_ucounts(struct ucounts *ucounts)
-{
-	if (!get_ucounts_or_wrap(ucounts)) {
-		put_ucounts(ucounts);
-		ucounts = NULL;
-	}
-	return ucounts;
-}
-
 struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 {
-	struct hlist_head *hashent = ucounts_hashentry(ns, uid);
+	struct hlist_nulls_head *hashent = ucounts_hashentry(ns, uid);
 	struct ucounts *ucounts, *new;
-	bool wrapped;
+
+	ucounts = find_ucounts(ns, uid, hashent);
+	if (ucounts)
+		return ucounts;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	new->ns = ns;
+	new->uid = uid;
+	rcuref_init(&new->count, 1);
+	if (percpu_counter_init_many(&new->rlimit[0], 0, GFP_KERNEL_ACCOUNT,
+				     UCOUNT_RLIMIT_COUNTS)) {
+		kfree(new);
+		return NULL;
+	}
 
 	spin_lock_irq(&ucounts_lock);
 	ucounts = find_ucounts(ns, uid, hashent);
-	if (!ucounts) {
+	if (ucounts) {
 		spin_unlock_irq(&ucounts_lock);
-
-		new = kzalloc(sizeof(*new), GFP_KERNEL);
-		if (!new)
-			return NULL;
-
-		new->ns = ns;
-		new->uid = uid;
-		atomic_set(&new->count, 1);
-
-		spin_lock_irq(&ucounts_lock);
-		ucounts = find_ucounts(ns, uid, hashent);
-		if (ucounts) {
-			kfree(new);
-		} else {
-			hlist_add_head(&new->node, hashent);
-			get_user_ns(new->ns);
-			spin_unlock_irq(&ucounts_lock);
-			return new;
-		}
+		percpu_counter_destroy_many(&new->rlimit[0], UCOUNT_RLIMIT_COUNTS);
+		kfree(new);
+		return ucounts;
 	}
-	wrapped = !get_ucounts_or_wrap(ucounts);
+
+	hlist_nulls_add_head_rcu(&new->node, hashent);
+	get_user_ns(new->ns);
 	spin_unlock_irq(&ucounts_lock);
-	if (wrapped) {
-		put_ucounts(ucounts);
-		return NULL;
+	return new;
+}
+
+/*
+ * Whether all the rlimits are zero.
+ * For now, only UCOUNT_RLIMIT_SIGPENDING is considered.
+ * Other rlimits can be added.
+ */
+static bool rlimits_are_zero(struct ucounts *ucounts)
+{
+	int rtypes[] = { UCOUNT_RLIMIT_SIGPENDING };
+	int rtype;
+
+	for (int i = 0; i < sizeof(rtypes) / sizeof(int); ++i) {
+		rtype = rtypes[i];
+		if (get_rlimit_value(ucounts, rtype) > 0)
+			return false;
 	}
-	return ucounts;
+	return true;
 }
 
-void put_ucounts(struct ucounts *ucounts)
+/*
+ * Ucounts can be freed only when the ucounts->count is released
+ * and the rlimits are zero.
+ * The caller should hold rcu_read_lock().
+ */
+static bool ucounts_can_be_freed(struct ucounts *ucounts)
+{
+	if (rcuref_read(&ucounts->count) > 0)
+		return false;
+	if (!rlimits_are_zero(ucounts))
+		return false;
+	/* Prevent double free */
+	return atomic_long_cmpxchg(&ucounts->freed, 0, 1) == 0;
+}
+
+static void free_ucounts(struct ucounts *ucounts)
 {
 	unsigned long flags;
 
-	if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
-		hlist_del_init(&ucounts->node);
-		spin_unlock_irqrestore(&ucounts_lock, flags);
-		put_user_ns(ucounts->ns);
-		kfree(ucounts);
+	spin_lock_irqsave(&ucounts_lock, flags);
+	hlist_nulls_del_rcu(&ucounts->node);
+	spin_unlock_irqrestore(&ucounts_lock, flags);
+	percpu_counter_destroy_many(&ucounts->rlimit[0], UCOUNT_RLIMIT_COUNTS);
+	put_user_ns(ucounts->ns);
+	kfree_rcu(ucounts, rcu);
+}
+
+void put_ucounts(struct ucounts *ucounts)
+{
+	rcu_read_lock();
+	if (rcuref_put(&ucounts->count) &&
+	    ucounts_can_be_freed(ucounts)) {
+		rcu_read_unlock();
+		free_ucounts(ucounts);
+		return;
 	}
+	rcu_read_unlock();
 }
 
 static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
@@ -264,46 +300,50 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
 	put_ucounts(ucounts);
 }
 
-long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+bool inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type,
+			      long v, long limit)
 {
 	struct ucounts *iter;
 	long max = LONG_MAX;
-	long ret = 0;
+	bool good = true;
 
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long new = atomic_long_add_return(v, &iter->rlimit[type]);
-		if (new < 0 || new > max)
-			ret = LONG_MAX;
-		else if (iter == ucounts)
-			ret = new;
+		max = min(limit, max);
+		if (!percpu_counter_limited_add(&iter->rlimit[type], max, v))
+			good = false;
+
 		max = get_userns_rlimit_max(iter->ns, type);
 	}
-	return ret;
+	return good;
}
 
-bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+void dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
 {
 	struct ucounts *iter;
-	long new = -1; /* Silence compiler warning */
-	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long dec = atomic_long_sub_return(v, &iter->rlimit[type]);
-		WARN_ON_ONCE(dec < 0);
-		if (iter == ucounts)
-			new = dec;
-	}
-	return (new == 0);
+
+	for (iter = ucounts; iter; iter = iter->ns->ucounts)
+		percpu_counter_sub(&iter->rlimit[type], v);
 }
 
+/*
+ * inc_rlimit_get_ucounts() does not grab the refcount.
+ * dec_rlimit_put_ucounts() should be called every time the rlimit is decremented.
+ */
 static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
 				      struct ucounts *last, enum rlimit_type type)
 {
 	struct ucounts *iter, *next;
 
 	for (iter = ucounts; iter != last; iter = next) {
-		long dec = atomic_long_sub_return(1, &iter->rlimit[type]);
-		WARN_ON_ONCE(dec < 0);
+		bool to_free;
+
+		rcu_read_lock();
+		percpu_counter_sub(&iter->rlimit[type], 1);
 		next = iter->ns->ucounts;
-		if (dec == 0)
-			put_ucounts(iter);
+		to_free = ucounts_can_be_freed(iter);
+		rcu_read_unlock();
+		/* If ucounts->count is zero and the rlimits are zero, free ucounts */
+		if (to_free)
+			free_ucounts(iter);
 	}
 }
 
@@ -312,37 +352,37 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
 	do_dec_rlimit_put_ucounts(ucounts, NULL, type);
 }
 
+/*
+ * Though this function does not grab the refcount, it is guaranteed that the
+ * ucounts will not be freed as long as there are any rlimit pins on it.
+ * The caller must hold a reference to the ucounts or be under rcu_read_lock().
+ *
+ * Return 1 if the increment succeeds, otherwise return 0.
+ */
 long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
-			    bool override_rlimit)
+			    bool override_rlimit, long limit)
 {
-	/* Caller must hold a reference to ucounts */
 	struct ucounts *iter;
 	long max = LONG_MAX;
-	long dec, ret = 0;
+	long ret = 0;
+	long in_limit = limit;
+
+	if (override_rlimit)
+		in_limit = LONG_MAX;
 
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long new = atomic_long_add_return(1, &iter->rlimit[type]);
-		if (new < 0 || new > max)
+		/* Cannot exceed the passed-in limit or the ns->rlimit_max */
+		max = min(in_limit, max);
+		if (!percpu_counter_limited_add(&iter->rlimit[type], max, 1))
 			goto dec_unwind;
-		if (iter == ucounts)
-			ret = new;
+		if (!override_rlimit)
 			max = get_userns_rlimit_max(iter->ns, type);
-		/*
-		 * Grab an extra ucount reference for the caller when
-		 * the rlimit count was previously 0.
-		 */
-		if (new != 1)
-			continue;
-		if (!get_ucounts(iter))
-			goto dec_unwind;
 	}
-	return ret;
+	return 1;
 dec_unwind:
-	dec = atomic_long_sub_return(1, &iter->rlimit[type]);
-	WARN_ON_ONCE(dec < 0);
 	do_dec_rlimit_put_ucounts(ucounts, iter, type);
-	return 0;
+	return ret;
 }
 
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
@@ -351,15 +391,24 @@ bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
 	long max = rlimit;
 	if (rlimit > LONG_MAX)
 		max = LONG_MAX;
+
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long val = get_rlimit_value(iter, type);
-		if (val < 0 || val > max)
+		/* percpu_counter_compare() returns 1 if iter->rlimit[type] > max */
+		if (percpu_counter_compare(&iter->rlimit[type], max) > 0)
 			return true;
+
 		max = get_userns_rlimit_max(iter->ns, type);
 	}
 	return false;
 }
 
+void __init ucounts_init(void)
+{
+	if (percpu_counter_init_many(&init_ucounts.rlimit[0], 0, GFP_KERNEL,
+				     UCOUNT_RLIMIT_COUNTS))
+		panic("Cannot create init_ucounts rlimit counters");
+}
+
 static __init int user_namespace_sysctl_init(void)
 {
 #ifdef CONFIG_SYSCTL
diff --git a/mm/mlock.c b/mm/mlock.c
index cd0997d89c7c54c8f820fcfc8084d7ae4b474d16..65e5c40c2679543a8f5c27479fb5ce675d1240fa 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -825,7 +825,6 @@ static DEFINE_SPINLOCK(shmlock_user_lock);
 int user_shm_lock(size_t size, struct ucounts *ucounts)
 {
 	unsigned long lock_limit, locked;
-	long memlock;
 	int allowed = 0;
 
 	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -833,9 +832,9 @@ int user_shm_lock(size_t size, struct ucounts *ucounts)
 	if (lock_limit != RLIM_INFINITY)
 		lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
-	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
-	if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+	if (!inc_rlimit_ucounts_limit(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked, lock_limit)
+	    && !capable(CAP_IPC_LOCK)) {
 		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
 		goto out;
 	}
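
Editorial note on the reworked charge/uncharge API (not part of the patch): callers no longer compare a returned counter value against a limit. They pass the limit into the charge side and get a pass/fail answer, and when the charge is rejected they must still unwind explicitly, exactly as the mqueue_get_inode() and user_shm_lock() hunks above do. Below is a minimal caller-side sketch written against the declarations in user_namespace.h; charge_bytes() and uncharge_bytes() are hypothetical names for illustration, not functions added by this series:

/* Assumes kernel context; linux/user_namespace.h provides the API used here. */
static int charge_bytes(struct ucounts *ucounts, long bytes, long limit)
{
	/*
	 * Charge 'bytes' against UCOUNT_RLIMIT_MSGQUEUE at every namespace
	 * level; the call fails if any level would exceed min(limit, that
	 * level's rlimit_max).
	 */
	if (!inc_rlimit_ucounts_limit(ucounts, UCOUNT_RLIMIT_MSGQUEUE,
				      bytes, limit)) {
		/*
		 * Levels that did accept the add remain charged on failure,
		 * so the caller unwinds every level, as the hunks above do.
		 */
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MSGQUEUE, bytes);
		return -EMFILE;
	}
	return 0;
}

static void uncharge_bytes(struct ucounts *ucounts, long bytes)
{
	/*
	 * dec_rlimit_ucounts() now returns void: a per-CPU counter has no
	 * cheap, exact "dropped to zero" result to key release logic off.
	 */
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MSGQUEUE, bytes);
}

The pinning variant behaves analogously: inc_rlimit_get_ucounts(ucounts, type, override_rlimit, limit) now returns 1/0 instead of the new counter value, unwinds its own partial charges internally on failure (see dec_unwind), and every successful call must still be paired with dec_rlimit_put_ucounts(). The final release frees the ucounts once the refcount is gone and the pinned rlimits (currently only UCOUNT_RLIMIT_SIGPENDING, per rlimits_are_zero()) have drained to zero.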