From 0828f422c01eed486d8200029aa4d63733b6a83e Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Tue, 27 May 2025 09:55:20 +0800 Subject: [PATCH 1/5] anolis: mm: add _node version for several memory allocation APIs ANBZ: #21547 To enable node specific futex hash-tables memory allocations. Signed-off-by: Yin Fengwei Reviewed-by: Huang Ying --- include/linux/memblock.h | 19 ++++++++++++++----- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 8 ++++++++ 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ed64240041e8..ad0d90ecb457 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -423,18 +423,27 @@ void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); -static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) +static __always_inline void *memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, - MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); +} + +static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) +{ + return memblock_alloc_nid(size, align, NUMA_NO_NODE); +} + +static inline void *memblock_alloc_raw_nid(phys_addr_t size, phys_addr_t align, int nid) +{ + return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT, + MEMBLOCK_ALLOC_ACCESSIBLE, nid); } static inline void *memblock_alloc_raw(phys_addr_t size, phys_addr_t align) { - return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT, - MEMBLOCK_ALLOC_ACCESSIBLE, - NUMA_NO_NODE); + return memblock_alloc_raw_nid(size, align, NUMA_NO_NODE); } static inline void *memblock_alloc_from(phys_addr_t size, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c8dd..f1afe4d35391 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -152,6 +152,7 @@ extern void 
*__vmalloc_node_range(unsigned long size, unsigned long align, void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) __alloc_size(1); void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +void *vmalloc_huge_node(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1); extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fb947787f25d..da8de24d121e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3451,6 +3451,14 @@ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(vmalloc_huge); +void *vmalloc_huge_node(unsigned long size, gfp_t gfp_mask, int node) +{ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL_GPL(vmalloc_huge_node); + /** * vzalloc - allocate virtually contiguous memory with zero fill * @size: allocation size -- Gitee From 331de053fda1218bfb2ac7245da217b3b675b64d Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Tue, 27 May 2025 10:50:46 +0800 Subject: [PATCH 2/5] anolis: mm/mm_init: add function to allocate large hash from numa node ANBZ: #21547 To allow callers define specific numa node which memory will come from. 
Signed-off-by: Yin Fengwei Reviewed-by: Huang Ying --- include/linux/memblock.h | 29 +++++++++++++++++++++++++++-- mm/mm_init.c | 16 ++++++++-------- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ad0d90ecb457..004243557ddf 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -582,7 +582,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo region < (memblock.reserved.regions + memblock.reserved.cnt); \ region++) -extern void *alloc_large_system_hash(const char *tablename, +extern void *alloc_large_system_hash_nid(const char *tablename, unsigned long bucketsize, unsigned long numentries, int scale, @@ -590,7 +590,32 @@ extern void *alloc_large_system_hash(const char *tablename, unsigned int *_hash_shift, unsigned int *_hash_mask, unsigned long low_limit, - unsigned long high_limit); + unsigned long high_limit, + int nid); + +/* + * allocate a large system hash table from bootmem + * - it is assumed that the hash table must contain an exact power-of-2 + * quantity of entries + * - limit is the number of hash buckets, not the total allocation size + */ +static __always_inline void *alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long low_limit, + unsigned long high_limit) +{ + return alloc_large_system_hash_nid(tablename, bucketsize, + numentries, scale, flags, + _hash_shift, _hash_mask, + low_limit, high_limit, + NUMA_NO_NODE); +} + #define HASH_EARLY 0x00000001 /* Allocating during early boot? 
*/ #define HASH_ZERO 0x00000002 /* Zero allocated hash table */ diff --git a/mm/mm_init.c b/mm/mm_init.c index 77fd04c83d04..955d9d7d8cad 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2449,12 +2449,12 @@ static unsigned long __init arch_reserved_kernel_pages(void) #endif /* - * allocate a large system hash table from bootmem + * allocate a large system hash table from bootmem on specific numa node * - it is assumed that the hash table must contain an exact power-of-2 * quantity of entries * - limit is the number of hash buckets, not the total allocation size */ -void *__init alloc_large_system_hash(const char *tablename, +void *__init alloc_large_system_hash_nid(const char *tablename, unsigned long bucketsize, unsigned long numentries, int scale, @@ -2462,7 +2462,8 @@ void *__init alloc_large_system_hash(const char *tablename, unsigned int *_hash_shift, unsigned int *_hash_mask, unsigned long low_limit, - unsigned long high_limit) + unsigned long high_limit, + int nid) { unsigned long long max = high_limit; unsigned long log2qty, size; @@ -2522,12 +2523,11 @@ void *__init alloc_large_system_hash(const char *tablename, size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) - table = memblock_alloc(size, SMP_CACHE_BYTES); + table = memblock_alloc_nid(size, SMP_CACHE_BYTES, nid); else - table = memblock_alloc_raw(size, - SMP_CACHE_BYTES); + table = memblock_alloc_raw_nid(size, SMP_CACHE_BYTES, nid); } else if (get_order(size) > MAX_ORDER || hashdist) { - table = vmalloc_huge(size, gfp_flags); + table = vmalloc_huge_node(size, gfp_flags, nid); virt = true; if (table) huge = is_vm_area_hugepages(table); @@ -2537,7 +2537,7 @@ void *__init alloc_large_system_hash(const char *tablename, * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - table = alloc_pages_exact(size, gfp_flags); + table = alloc_pages_exact_nid(nid, size, gfp_flags); kmemleak_alloc(table, size, 1, gfp_flags); } } while (!table && size > 
PAGE_SIZE && --log2qty); -- Gitee From 67a36635d56180e0463580ca6fd04759395e1160 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Tue, 27 May 2025 11:20:35 +0800 Subject: [PATCH 3/5] anolis: futex: extend futex hash table to allow per numa node hash table ANBZ: #21547 Extend futex hash table to an array which will hold the pointer to the memory from all possible numa nodes. Allocate one more futex hash table for shared futex. As shared futex can be accessed from different processes, we can not use per process index for shared futex hash table. So just still use a global futex hash table for it. Signed-off-by: Yin Fengwei Reviewed-by: Huang Ying --- kernel/futex/core.c | 54 ++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index f30a93e50f65..c1cda5e97d78 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -47,7 +47,8 @@ * reside in the same cacheline. */ static struct { - struct futex_hash_bucket *queues; + /* one more is for shared futex hash table queue */ + struct futex_hash_bucket *queues[MAX_NUMNODES + 1]; unsigned long hashsize; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_queues (__futex_data.queues) @@ -117,7 +118,7 @@ struct futex_hash_bucket *futex_hash(union futex_key *key) u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); - return &futex_queues[hash & (futex_hashsize - 1)]; + return &futex_queues[0][hash & (futex_hashsize - 1)]; } @@ -1130,28 +1131,51 @@ void futex_exit_release(struct task_struct *tsk) futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } +static struct futex_hash_bucket* __init +alloc_futex_hash(const char *tablename, int nid, int hash_size) +{ + struct futex_hash_bucket *fhb; + unsigned int shift; + + fhb = alloc_large_system_hash_nid(tablename, + sizeof(struct futex_hash_bucket), + hash_size, 0, 0, &shift, NULL, + hash_size, hash_size, nid); + + hash_size = 1UL << shift; 
+ for (int i = 0; i < hash_size; i++) { + atomic_set(&fhb[i].waiters, 0); + plist_head_init(&fhb[i].chain); + spin_lock_init(&fhb[i].lock); + } + + return fhb; +} + static int __init futex_init(void) { - unsigned int futex_shift; - unsigned long i; + unsigned int nid; #if CONFIG_BASE_SMALL futex_hashsize = 16; #else - futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + futex_hashsize = 256 * num_possible_cpus(); + futex_hashsize /= num_possible_nodes(); + /* 32 is larger than 16 and not that too much */ + futex_hashsize = max(32, futex_hashsize); + futex_hashsize = roundup_pow_of_two(futex_hashsize); #endif - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, 0, - &futex_shift, NULL, - futex_hashsize, futex_hashsize); - futex_hashsize = 1UL << futex_shift; + for_each_node(nid) + futex_queues[nid] = alloc_futex_hash("private futex", + nid, futex_hashsize); - for (i = 0; i < futex_hashsize; i++) { - atomic_set(&futex_queues[i].waiters, 0); - plist_head_init(&futex_queues[i].chain); - spin_lock_init(&futex_queues[i].lock); - } + /* + * For shared futex, it could be accessed from different processes. + * Can not use per-process index for futex hash. Use global hash table. + */ + futex_queues[MAX_NUMNODES] = alloc_futex_hash("shared futex", NUMA_NO_NODE, + futex_hashsize); return 0; } -- Gitee From 645863d38354351b3f14ac09fe76bacecb520f2a Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Tue, 27 May 2025 13:46:31 +0800 Subject: [PATCH 4/5] anolis: futex: enable per process futex hash table for private futex ANBZ: #21547 The current futex hash table is not allocated per numa node. Depending on what the futex hash key is, a futex hash table access may hit a numa node with far memory. We observed bad futex performance because of cross numa node hash table access on an arm64 based server. To minimize the impact of cross numa node futex hash table access, a futex hash table is allocated per numa node.
The first time a process/thread accesses a futex, the id of the numa node on which the process/thread is running is used as the futex hash table node index. All later futex accesses use the same numa node. This avoids cross node futex hash table access when the user binds the process to a numa node with near memory by using numactl/taskset/cpuset. If there is no process binding, cross node futex hash table access can still happen, but the change doesn't make this case worse. There is one scenario in which things may get worse: First, the process is scheduled on a numa node with far memory and accesses a futex. Then the user calls sched_setaffinity to set the CPU affinity to a numa node with near memory. Handling this scenario needs a complicated code change. In real life, we can use numactl/taskset to bind the process to a numa node with near memory to work around the problem. For performance testing: An arm64 server (128 cores with 2 numa nodes) was used as the test platform. "perf bench futex all" was run with and without the patch and the results are as follows: numactl -m 0 -N 0 perf bench futex all (64 threads on 64 cores of numa node 0): +----------------------+------------------+------------------+------+ | | base | w/ patch | gain | +----------------------+------------------+------------------+------+ | futex/hash | 2446627(+/-0.10%)| 3295807(+/-0.03%)| +34% | +----------------------+------------------+------------------+------+ | futex/wake | 0.0535 (+/-2.60%)| 0.0539 (+/-5%) | -0.7%| +----------------------+------------------+------------------+------+ | futex/wake-parallel | 0.0041 (+/-5%) | 0.0039 (+/-20%) | +4.9%| +----------------------+------------------+------------------+------+ | futex/requeue | 0.0274 (+/-4%) | 0.0272 (+/-0.9%) | +0.7%| +----------------------+------------------+------------------+------+ | futex/lock-pi | 276 (+/-0%) | 275 (+/-0.9%) | -0.4%| +----------------------+------------------+------------------+------+ perf bench futex all (128 threads on 128 cores as performance regression test):
+----------------------+------------------+------------------+------+ | | base | w/ patch | gain | +----------------------+------------------+------------------+------+ | futex/hash | 2394156(+/-0.10%)| 2516767(+/-0.04%)| +4.8%| +----------------------+------------------+------------------+------+ | futex/wake | 0.1716 (+/-2%) | 0.1735 (+/-2%) | -1.1%| +----------------------+------------------+------------------+------+ | futex/wake-parallel | 0.0073 (+/-38%) | 0.0062 (+/-42%) | +15% | +----------------------+------------------+------------------+------+ | futex/requeue | 0.0707 (+/-10%) | 0.0693 (+/-3%) | +2% | +----------------------+------------------+------------------+------+ | futex/lock-pi | 276 (+/-0%) | 275 (+/-0.9%) | -0.3%| +----------------------+------------------+------------------+------+ NOTE: The variance of futex/wake-parallel is larger than the difference between the results, so the futex/wake-parallel test result can be ignored. In general, the performance gain is obvious. There is a very minor performance drop in some cases, which can be ignored. lmbench and UnixBench were also run and showed no regressions. Signed-off-by: Yin Fengwei Reviewed-by: Huang Ying --- fs/exec.c | 1 + include/linux/futex.h | 1 + include/linux/sched.h | 1 + kernel/futex/core.c | 15 ++++++++++++++- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index ee71a315cc51..53121a6edd10 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1484,6 +1484,7 @@ void finalize_exec(struct linux_binprm *bprm) /* Store any stack rlimit changes before starting thread.
*/ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; + current->futex_nid = NUMA_NO_NODE; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); diff --git a/include/linux/futex.h b/include/linux/futex.h index b70df27d7e85..5cdcfe4b4ab9 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -69,6 +69,7 @@ static inline void futex_init_task(struct task_struct *tsk) tsk->pi_state_cache = NULL; tsk->futex_state = FUTEX_STATE_OK; mutex_init(&tsk->futex_exit_mutex); + tsk->futex_nid = NUMA_NO_NODE; } void futex_exit_recursive(struct task_struct *tsk); diff --git a/include/linux/sched.h b/include/linux/sched.h index 217a58817641..e1d86abf2fac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1328,6 +1328,7 @@ struct task_struct { struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; + unsigned int futex_nid; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp; diff --git a/kernel/futex/core.c b/kernel/futex/core.c index c1cda5e97d78..a6f91bafff9f 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -115,10 +115,17 @@ late_initcall(fail_futex_debugfs); */ struct futex_hash_bucket *futex_hash(union futex_key *key) { + int idx; + + if (key->both.offset & (FUT_OFF_MMSHARED | FUT_OFF_INODE)) + idx = MAX_NUMNODES; + else + idx = READ_ONCE(current->group_leader->futex_nid); + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); - return &futex_queues[0][hash & (futex_hashsize - 1)]; + return &futex_queues[idx][hash & (futex_hashsize - 1)]; } @@ -241,6 +248,12 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, if (unlikely(should_fail_futex(fshared))) return -EFAULT; + if (READ_ONCE(current->group_leader->futex_nid) == NUMA_NO_NODE) { + int id = numa_node_id(); + + cmpxchg(&current->group_leader->futex_nid, NUMA_NO_NODE, id); + } + /* * PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs -- Gitee From 63ee59ec02caf0b36b0e9e8c0fa93a588fe8cb32 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Fri, 6 Jun 2025 17:19:53 +0800 Subject: [PATCH 5/5] anolis: futex: Allow disabling per numa node futex from kernel cmdline ANBZ: #21547 Use per_numa_node_futex=disable/enable on the kernel command line to disable/enable per numa node futex. If it is disabled, the kernel keeps using the original unified futex hash table. Signed-off-by: Yin Fengwei Reviewed-by: Huang Ying --- kernel/futex/core.c | 58 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index a6f91bafff9f..e64914510c96 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -106,6 +106,31 @@ late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAIL_FUTEX */ +static bool per_numa_node_futex = true; +static int __init setup_per_numa_node_futex(char *str) +{ + int ret = 0; + + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + per_numa_node_futex = true; + ret = 1; + } else if (!strcmp(str, "disable")) { + per_numa_node_futex = false; + ret = 1; + } + +out: + if (!ret) + pr_warn("Unable to parse per_numa_node_futex=\n"); + + return ret; +} +__setup("per_numa_node_futex=", setup_per_numa_node_futex); + + /** * futex_hash - Return the hash bucket in the global hash * @key: Pointer to the futex key for which the hash is calculated @@ -117,7 +142,8 @@ struct futex_hash_bucket *futex_hash(union futex_key *key) { int idx; - if (key->both.offset & (FUT_OFF_MMSHARED | FUT_OFF_INODE)) + if (!per_numa_node_futex || + (key->both.offset & (FUT_OFF_MMSHARED | FUT_OFF_INODE))) idx = MAX_NUMNODES; else idx = READ_ONCE(current->group_leader->futex_nid); @@ -248,7 +274,8 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, if (unlikely(should_fail_futex(fshared))) return -EFAULT; - if (READ_ONCE(current->group_leader->futex_nid) ==
NUMA_NO_NODE) { + if (per_numa_node_futex && + READ_ONCE(current->group_leader->futex_nid) == NUMA_NO_NODE) { int id = numa_node_id(); cmpxchg(&current->group_leader->futex_nid, NUMA_NO_NODE, id); @@ -1172,20 +1199,27 @@ static int __init futex_init(void) #if CONFIG_BASE_SMALL futex_hashsize = 16; #else - futex_hashsize = 256 * num_possible_cpus(); - futex_hashsize /= num_possible_nodes(); - /* 32 is larger than 16 and not that too much */ - futex_hashsize = max(32, futex_hashsize); - futex_hashsize = roundup_pow_of_two(futex_hashsize); + if (per_numa_node_futex) { + futex_hashsize = 256 * num_possible_cpus(); + futex_hashsize /= num_possible_nodes(); + /* 32 is larger than 16 and not that too much */ + futex_hashsize = max(32, futex_hashsize); + futex_hashsize = roundup_pow_of_two(futex_hashsize); + } else { + futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + } #endif - for_each_node(nid) - futex_queues[nid] = alloc_futex_hash("private futex", - nid, futex_hashsize); + if (per_numa_node_futex) { + for_each_node(nid) + futex_queues[nid] = alloc_futex_hash("private futex", + nid, futex_hashsize); + } /* - * For shared futex, it could be accessed from different processes. - * Can not use per-process index for futex hash. Use global hash table. + * For shared futex or if per numa node futex is disabled, futex hash + * table could be accessed from different processes.Can not use + * per-process index for futex hash. Use global hash table instead. */ futex_queues[MAX_NUMNODES] = alloc_futex_hash("shared futex", NUMA_NO_NODE, futex_hashsize); -- Gitee