diff --git a/fs/exec.c b/fs/exec.c
index ee71a315cc51f51a066a552614fd10bdecd32188..53121a6edd10d50fde36d5f94d8d9b6ea0385b21 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1484,6 +1484,7 @@ void finalize_exec(struct linux_binprm *bprm)
 	/* Store any stack rlimit changes before starting thread. */
 	task_lock(current->group_leader);
 	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
+	current->futex_nid = NUMA_NO_NODE;
 	task_unlock(current->group_leader);
 }
 EXPORT_SYMBOL(finalize_exec);
diff --git a/include/linux/futex.h b/include/linux/futex.h
index b70df27d7e85c2c257c3467780d7b835b58d9fe1..5cdcfe4b4ab93ec04653601b38c0e18260e58daf 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -69,6 +69,7 @@ static inline void futex_init_task(struct task_struct *tsk)
 	tsk->pi_state_cache = NULL;
 	tsk->futex_state = FUTEX_STATE_OK;
 	mutex_init(&tsk->futex_exit_mutex);
+	tsk->futex_nid = NUMA_NO_NODE;
 }
 
 void futex_exit_recursive(struct task_struct *tsk);
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ed64240041e85775ab0f6e0a3a5055d106523657..004243557ddfb4812c373dc9752517d1561e7505 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -423,18 +423,27 @@ void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
 			     phys_addr_t min_addr, phys_addr_t max_addr,
 			     int nid);
 
-static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
+static __always_inline void *memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
 	return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
-				      MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
+				      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+}
+
+static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
+{
+	return memblock_alloc_nid(size, align, NUMA_NO_NODE);
+}
+
+static inline void *memblock_alloc_raw_nid(phys_addr_t size, phys_addr_t align, int nid)
+{
+	return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
+					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
 static inline void *memblock_alloc_raw(phys_addr_t size,
 				       phys_addr_t align)
 {
-	return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
-					  MEMBLOCK_ALLOC_ACCESSIBLE,
-					  NUMA_NO_NODE);
+	return memblock_alloc_raw_nid(size, align, NUMA_NO_NODE);
 }
 
 static inline void *memblock_alloc_from(phys_addr_t size,
@@ -573,7 +582,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
 	     region < (memblock.reserved.regions + memblock.reserved.cnt);	\
 	     region++)
 
-extern void *alloc_large_system_hash(const char *tablename,
+extern void *alloc_large_system_hash_nid(const char *tablename,
 				     unsigned long bucketsize,
 				     unsigned long numentries,
 				     int scale,
@@ -581,7 +590,32 @@ extern void *alloc_large_system_hash(const char *tablename,
 				     unsigned int *_hash_shift,
 				     unsigned int *_hash_mask,
 				     unsigned long low_limit,
-				     unsigned long high_limit);
+				     unsigned long high_limit,
+				     int nid);
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ *   quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ */
+static __always_inline void *alloc_large_system_hash(const char *tablename,
+						     unsigned long bucketsize,
+						     unsigned long numentries,
+						     int scale,
+						     int flags,
+						     unsigned int *_hash_shift,
+						     unsigned int *_hash_mask,
+						     unsigned long low_limit,
+						     unsigned long high_limit)
+{
+	return alloc_large_system_hash_nid(tablename, bucketsize,
+					   numentries, scale, flags,
+					   _hash_shift, _hash_mask,
+					   low_limit, high_limit,
+					   NUMA_NO_NODE);
+}
+
 
 #define HASH_EARLY	0x00000001	/* Allocating during early boot? */
 #define HASH_ZERO	0x00000002	/* Zero allocated hash table */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 217a58817641fec6215086a68f3387fdebe2ef30..e1d86abf2facc9d42c2f3931c32b0a1feea21027 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1328,6 +1328,7 @@ struct task_struct {
 	struct futex_pi_state		*pi_state_cache;
 	struct mutex			futex_exit_mutex;
 	unsigned int			futex_state;
+	unsigned int			futex_nid;
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	struct perf_event_context	*perf_event_ctxp;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8ddde9cc947c685e64923139c66c3f9..f1afe4d353913a596a7405ea4ba4f6c38384f6cd 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -152,6 +152,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_huge_node(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index f30a93e50f65e8c80a95b398dc5c2b563fab14fc..e64914510c9609bfca293a845d23f455b3bae219 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -47,7 +47,8 @@
  * reside in the same cacheline.
  */
 static struct {
-	struct futex_hash_bucket *queues;
+	/* the last slot holds the shared (global) futex hash table */
+	struct futex_hash_bucket *queues[MAX_NUMNODES + 1];
 	unsigned long            hashsize;
 } __futex_data __read_mostly __aligned(2*sizeof(long));
 #define futex_queues	(__futex_data.queues)
@@ -105,6 +106,31 @@ late_initcall(fail_futex_debugfs);
 
 #endif /* CONFIG_FAIL_FUTEX */
 
+static bool per_numa_node_futex = true;
+static int __init setup_per_numa_node_futex(char *str)
+{
+	int ret = 0;
+
+	if (!str)
+		goto out;
+
+	if (!strcmp(str, "enable")) {
+		per_numa_node_futex = true;
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		per_numa_node_futex = false;
+		ret = 1;
+	}
+
+out:
+	if (!ret)
+		pr_warn("Unable to parse per_numa_node_futex=\n");
+
+	return ret;
+}
+__setup("per_numa_node_futex=", setup_per_numa_node_futex);
+
+
 /**
  * futex_hash - Return the hash bucket in the global hash
  * @key:	Pointer to the futex key for which the hash is calculated
@@ -114,10 +140,18 @@ late_initcall(fail_futex_debugfs);
  */
 struct futex_hash_bucket *futex_hash(union futex_key *key)
 {
+	int idx;
+
 	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
 			  key->both.offset);
 
-	return &futex_queues[hash & (futex_hashsize - 1)];
+	if (!per_numa_node_futex ||
+	    (key->both.offset & (FUT_OFF_MMSHARED | FUT_OFF_INODE)))
+		idx = MAX_NUMNODES;
+	else
+		idx = READ_ONCE(current->group_leader->futex_nid);
+
+	return &futex_queues[idx][hash & (futex_hashsize - 1)];
 }
 
 
@@ -240,6 +274,13 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
 	if (unlikely(should_fail_futex(fshared)))
 		return -EFAULT;
 
+	if (per_numa_node_futex &&
+	    READ_ONCE(current->group_leader->futex_nid) == NUMA_NO_NODE) {
+		int id = numa_node_id();
+
+		cmpxchg(&current->group_leader->futex_nid, NUMA_NO_NODE, id);
+	}
+
 	/*
 	 * PROCESS_PRIVATE futexes are fast.
 	 * As the mm cannot disappear under us and the 'key' only needs
@@ -1130,29 +1171,59 @@ void futex_exit_release(struct task_struct *tsk)
 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }
 
+static struct futex_hash_bucket * __init
+alloc_futex_hash(const char *tablename, int nid, int hash_size)
+{
+	struct futex_hash_bucket *fhb;
+	unsigned int shift;
+
+	fhb = alloc_large_system_hash_nid(tablename,
+					  sizeof(struct futex_hash_bucket),
+					  hash_size, 0, 0, &shift, NULL,
+					  hash_size, hash_size, nid);
+
+	hash_size = 1UL << shift;
+	for (int i = 0; i < hash_size; i++) {
+		atomic_set(&fhb[i].waiters, 0);
+		plist_head_init(&fhb[i].chain);
+		spin_lock_init(&fhb[i].lock);
+	}
+
+	return fhb;
+}
+
 static int __init futex_init(void)
 {
-	unsigned int futex_shift;
-	unsigned long i;
+	unsigned int nid;
 
 #if CONFIG_BASE_SMALL
 	futex_hashsize = 16;
 #else
-	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+	if (per_numa_node_futex) {
+		futex_hashsize = 256 * num_possible_cpus();
+		futex_hashsize /= num_possible_nodes();
+		/* keep at least 32 buckets per node; larger than 16 but still modest */
+		futex_hashsize = max(32UL, futex_hashsize);
+		futex_hashsize = roundup_pow_of_two(futex_hashsize);
+	} else {
+		futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+	}
 #endif
 
-	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
-					       futex_hashsize, 0, 0,
-					       &futex_shift, NULL,
-					       futex_hashsize, futex_hashsize);
-	futex_hashsize = 1UL << futex_shift;
-
-	for (i = 0; i < futex_hashsize; i++) {
-		atomic_set(&futex_queues[i].waiters, 0);
-		plist_head_init(&futex_queues[i].chain);
-		spin_lock_init(&futex_queues[i].lock);
+	if (per_numa_node_futex) {
+		for_each_node(nid)
+			futex_queues[nid] = alloc_futex_hash("private futex",
+							     nid, futex_hashsize);
 	}
 
+	/*
+	 * Shared futexes, and all futexes when per-NUMA-node hashing is
+	 * disabled, can be accessed from different processes, so a
+	 * per-process node index cannot be used. Use a global hash table.
+	 */
+	futex_queues[MAX_NUMNODES] = alloc_futex_hash("shared futex", NUMA_NO_NODE,
+						      futex_hashsize);
+
 	return 0;
 }
 core_initcall(futex_init);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 77fd04c83d046db29912b7f3afed3339a067318d..955d9d7d8cade08527a1ff5536a919ace11af89e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2449,12 +2449,12 @@ static unsigned long __init arch_reserved_kernel_pages(void)
 #endif
 
 /*
- * allocate a large system hash table from bootmem
+ * allocate a large system hash table from bootmem on a specific NUMA node
  * - it is assumed that the hash table must contain an exact power-of-2
  *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 */
-void *__init alloc_large_system_hash(const char *tablename,
+void *__init alloc_large_system_hash_nid(const char *tablename,
				     unsigned long bucketsize,
				     unsigned long numentries,
				     int scale,
@@ -2462,7 +2462,8 @@ void *__init alloc_large_system_hash(const char *tablename,
				     unsigned int *_hash_shift,
				     unsigned int *_hash_mask,
				     unsigned long low_limit,
-				     unsigned long high_limit)
+				     unsigned long high_limit,
+				     int nid)
 {
 	unsigned long long max = high_limit;
 	unsigned long log2qty, size;
@@ -2522,12 +2523,11 @@ void *__init alloc_large_system_hash(const char *tablename,
 		size = bucketsize << log2qty;
 		if (flags & HASH_EARLY) {
 			if (flags & HASH_ZERO)
-				table = memblock_alloc(size, SMP_CACHE_BYTES);
+				table = memblock_alloc_nid(size, SMP_CACHE_BYTES, nid);
 			else
-				table = memblock_alloc_raw(size,
-							   SMP_CACHE_BYTES);
+				table = memblock_alloc_raw_nid(size, SMP_CACHE_BYTES, nid);
 		} else if (get_order(size) > MAX_ORDER || hashdist) {
-			table = vmalloc_huge(size, gfp_flags);
+			table = vmalloc_huge_node(size, gfp_flags, nid);
 			virt = true;
 			if (table)
 				huge = is_vm_area_hugepages(table);
@@ -2537,7 +2537,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 			 * some pages at the end of hash table which
 			 * alloc_pages_exact() automatically does
 			 */
-			table = alloc_pages_exact(size, gfp_flags);
+			table = alloc_pages_exact_nid(nid, size, gfp_flags);
 			kmemleak_alloc(table, size, 1, gfp_flags);
 		}
 	} while (!table && size > PAGE_SIZE && --log2qty);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fb947787f25da2af38ea353257f833fa1227dd93..da8de24d121e7db8767f049bb06152ece3d83ab8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3451,6 +3451,14 @@ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL_GPL(vmalloc_huge);
 
+void *vmalloc_huge_node(unsigned long size, gfp_t gfp_mask, int node)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+				    gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+				    node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(vmalloc_huge_node);
+
 /**
  * vzalloc - allocate virtually contiguous memory with zero fill
  * @size:    allocation size
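
Not part of the patch: the userspace sketch below (file name, helper, and thread structure are illustrative assumptions, not taken from this series) shows the kind of PROCESS_PRIVATE futex wait/wake pair that, with per_numa_node_futex left enabled, get_futex_key() keys on the mm and futex_hash() therefore places in the per-NUMA-node table selected by the caller's futex_nid. A shared (MAP_SHARED or inode-backed) futex would instead land in the global table at futex_queues[MAX_NUMNODES].

/* futex_private_demo.c -- illustrative sketch only, not part of the patch.
 * Build: gcc -pthread -o futex_private_demo futex_private_demo.c
 */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <pthread.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_uint futex_word;	/* 0 = not released yet, 1 = released */

/* Thin wrapper: glibc provides no futex() symbol, so use syscall(2). */
static long futex(unsigned int *uaddr, int op, unsigned int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	/* FUTEX_WAIT_PRIVATE uses a process-private key, so this sleeper is
	 * queued on a bucket of the calling process's per-node hash table. */
	while (atomic_load(&futex_word) == 0)
		futex((unsigned int *)&futex_word, FUTEX_WAIT_PRIVATE, 0);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);
	atomic_store(&futex_word, 1);
	/* Wake at most one waiter hashed to the same (per-node) bucket. */
	futex((unsigned int *)&futex_word, FUTEX_WAKE_PRIVATE, 1);
	pthread_join(t, NULL);
	return 0;
}

Booting with per_numa_node_futex=disable makes futex_hash() route every key, private or shared, to the single global table, whose size then matches the pre-patch roundup_pow_of_two(256 * num_possible_cpus()).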