From 0828f422c01eed486d8200029aa4d63733b6a83e Mon Sep 17 00:00:00 2001
From: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Date: Tue, 27 May 2025 09:55:20 +0800
Subject: [PATCH 1/5] anolis: mm: add _node version for several memory
 allocation APIs

ANBZ: #21547

Add _nid/_node variants of several memory allocation APIs so that the
futex hash tables can be allocated from a specific NUMA node.

Signed-off-by: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
---
 include/linux/memblock.h | 19 ++++++++++++++-----
 include/linux/vmalloc.h  |  1 +
 mm/vmalloc.c             |  8 ++++++++
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ed64240041e8..ad0d90ecb457 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -423,18 +423,27 @@ void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
 			     phys_addr_t min_addr, phys_addr_t max_addr,
 			     int nid);
 
-static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
+static __always_inline void *memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
 	return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
-				      MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
+				      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+}
+
+static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
+{
+	return memblock_alloc_nid(size, align, NUMA_NO_NODE);
+}
+
+static inline void *memblock_alloc_raw_nid(phys_addr_t size, phys_addr_t align, int nid)
+{
+	return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
+					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
 static inline void *memblock_alloc_raw(phys_addr_t size,
 					       phys_addr_t align)
 {
-	return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
-					  MEMBLOCK_ALLOC_ACCESSIBLE,
-					  NUMA_NO_NODE);
+	return memblock_alloc_raw_nid(size, align, NUMA_NO_NODE);
 }
 
 static inline void *memblock_alloc_from(phys_addr_t size,
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..f1afe4d35391 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -152,6 +152,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_huge_node(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fb947787f25d..da8de24d121e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3451,6 +3451,14 @@ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL_GPL(vmalloc_huge);
 
+void *vmalloc_huge_node(unsigned long size, gfp_t gfp_mask, int node)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+				    gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+				    node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(vmalloc_huge_node);
+
 /**
  * vzalloc - allocate virtually contiguous memory with zero fill
  * @size:    allocation size
-- 
Gitee


From 331de053fda1218bfb2ac7245da217b3b675b64d Mon Sep 17 00:00:00 2001
From: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Date: Tue, 27 May 2025 10:50:46 +0800
Subject: [PATCH 2/5] anolis: mm/mm_init: add function to allocate large hash
 from numa node

ANBZ: #21547

Allow callers to specify the NUMA node that the hash table memory is
allocated from.

Signed-off-by: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
---
 include/linux/memblock.h | 29 +++++++++++++++++++++++++++--
 mm/mm_init.c             | 16 ++++++++--------
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ad0d90ecb457..004243557ddf 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -582,7 +582,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
 	     region < (memblock.reserved.regions + memblock.reserved.cnt); \
 	     region++)
 
-extern void *alloc_large_system_hash(const char *tablename,
+extern void *alloc_large_system_hash_nid(const char *tablename,
 				     unsigned long bucketsize,
 				     unsigned long numentries,
 				     int scale,
@@ -590,7 +590,32 @@ extern void *alloc_large_system_hash(const char *tablename,
 				     unsigned int *_hash_shift,
 				     unsigned int *_hash_mask,
 				     unsigned long low_limit,
-				     unsigned long high_limit);
+				     unsigned long high_limit,
+				     int nid);
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ *   quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ */
+static __always_inline void *alloc_large_system_hash(const char *tablename,
+				     unsigned long bucketsize,
+				     unsigned long numentries,
+				     int scale,
+				     int flags,
+				     unsigned int *_hash_shift,
+				     unsigned int *_hash_mask,
+				     unsigned long low_limit,
+				     unsigned long high_limit)
+{
+	return alloc_large_system_hash_nid(tablename, bucketsize,
+					   numentries, scale, flags,
+					   _hash_shift, _hash_mask,
+					   low_limit, high_limit,
+					   NUMA_NO_NODE);
+}
+
 
 #define HASH_EARLY	0x00000001	/* Allocating during early boot? */
 #define HASH_ZERO	0x00000002	/* Zero allocated hash table */
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 77fd04c83d04..955d9d7d8cad 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2449,12 +2449,12 @@ static unsigned long __init arch_reserved_kernel_pages(void)
 #endif
 
 /*
- * allocate a large system hash table from bootmem
+ * allocate a large system hash table from bootmem on specific numa node
  * - it is assumed that the hash table must contain an exact power-of-2
  *   quantity of entries
  * - limit is the number of hash buckets, not the total allocation size
  */
-void *__init alloc_large_system_hash(const char *tablename,
+void *__init alloc_large_system_hash_nid(const char *tablename,
 				     unsigned long bucketsize,
 				     unsigned long numentries,
 				     int scale,
@@ -2462,7 +2462,8 @@ void *__init alloc_large_system_hash(const char *tablename,
 				     unsigned int *_hash_shift,
 				     unsigned int *_hash_mask,
 				     unsigned long low_limit,
-				     unsigned long high_limit)
+				     unsigned long high_limit,
+				     int nid)
 {
 	unsigned long long max = high_limit;
 	unsigned long log2qty, size;
@@ -2522,12 +2523,11 @@ void *__init alloc_large_system_hash(const char *tablename,
 		size = bucketsize << log2qty;
 		if (flags & HASH_EARLY) {
 			if (flags & HASH_ZERO)
-				table = memblock_alloc(size, SMP_CACHE_BYTES);
+				table = memblock_alloc_nid(size, SMP_CACHE_BYTES, nid);
 			else
-				table = memblock_alloc_raw(size,
-							   SMP_CACHE_BYTES);
+				table = memblock_alloc_raw_nid(size, SMP_CACHE_BYTES, nid);
 		} else if (get_order(size) > MAX_ORDER || hashdist) {
-			table = vmalloc_huge(size, gfp_flags);
+			table = vmalloc_huge_node(size, gfp_flags, nid);
 			virt = true;
 			if (table)
 				huge = is_vm_area_hugepages(table);
@@ -2537,7 +2537,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 			 * some pages at the end of hash table which
 			 * alloc_pages_exact() automatically does
 			 */
-			table = alloc_pages_exact(size, gfp_flags);
+			table = alloc_pages_exact_nid(nid, size, gfp_flags);
 			kmemleak_alloc(table, size, 1, gfp_flags);
 		}
 	} while (!table && size > PAGE_SIZE && --log2qty);
-- 
Gitee


From 67a36635d56180e0463580ca6fd04759395e1160 Mon Sep 17 00:00:00 2001
From: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Date: Tue, 27 May 2025 11:20:35 +0800
Subject: [PATCH 3/5] anolis: futex: extend futex hash table to allow per numa
 node hash table

ANBZ: #21547

Extend the futex hash table to an array which holds pointers to the
hash tables allocated from each possible NUMA node.

Allocate one more futex hash table for shared futexes. As a shared
futex can be accessed from different processes, a per-process index
cannot be used to select its hash table, so keep using a single global
futex hash table for shared futexes.

Signed-off-by: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
---
 kernel/futex/core.c | 54 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index f30a93e50f65..c1cda5e97d78 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -47,7 +47,8 @@
  * reside in the same cacheline.
  */
 static struct {
-	struct futex_hash_bucket *queues;
+	/* one more is for shared futex hash table queue */
+	struct futex_hash_bucket *queues[MAX_NUMNODES + 1];
 	unsigned long            hashsize;
 } __futex_data __read_mostly __aligned(2*sizeof(long));
 #define futex_queues   (__futex_data.queues)
@@ -117,7 +118,7 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
 	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
 			  key->both.offset);
 
-	return &futex_queues[hash & (futex_hashsize - 1)];
+	return &futex_queues[0][hash & (futex_hashsize - 1)];
 }
 
 
@@ -1130,28 +1131,51 @@ void futex_exit_release(struct task_struct *tsk)
 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }
 
+static struct futex_hash_bucket* __init
+alloc_futex_hash(const char *tablename, int nid, int hash_size)
+{
+	struct futex_hash_bucket *fhb;
+	unsigned int shift;
+
+	fhb = alloc_large_system_hash_nid(tablename,
+					  sizeof(struct futex_hash_bucket),
+					  hash_size, 0, 0, &shift, NULL,
+					  hash_size, hash_size, nid);
+
+	hash_size = 1UL << shift;
+	for (int i = 0; i < hash_size; i++) {
+		atomic_set(&fhb[i].waiters, 0);
+		plist_head_init(&fhb[i].chain);
+		spin_lock_init(&fhb[i].lock);
+	}
+
+	return fhb;
+}
+
 static int __init futex_init(void)
 {
-	unsigned int futex_shift;
-	unsigned long i;
+	unsigned int nid;
 
 #if CONFIG_BASE_SMALL
 	futex_hashsize = 16;
 #else
-	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+	futex_hashsize = 256 * num_possible_cpus();
+	futex_hashsize /= num_possible_nodes();
+	/* 32 is larger than 16 and not that too much */
+	futex_hashsize = max(32, futex_hashsize);
+	futex_hashsize = roundup_pow_of_two(futex_hashsize);
 #endif
 
-	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
-					       futex_hashsize, 0, 0,
-					       &futex_shift, NULL,
-					       futex_hashsize, futex_hashsize);
-	futex_hashsize = 1UL << futex_shift;
+	for_each_node(nid)
+		futex_queues[nid] = alloc_futex_hash("private futex",
+						     nid, futex_hashsize);
 
-	for (i = 0; i < futex_hashsize; i++) {
-		atomic_set(&futex_queues[i].waiters, 0);
-		plist_head_init(&futex_queues[i].chain);
-		spin_lock_init(&futex_queues[i].lock);
-	}
+	/*
+	 * For shared futex, it could be accessed from different processes.
+	 * Can not use per-process index for futex hash. Use global hash table.
+	 */
+	futex_queues[MAX_NUMNODES] = alloc_futex_hash("shared futex", NUMA_NO_NODE,
+						      futex_hashsize);
 
 	return 0;
 }
-- 
Gitee


From 645863d38354351b3f14ac09fe76bacecb520f2a Mon Sep 17 00:00:00 2001
From: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Date: Tue, 27 May 2025 13:46:31 +0800
Subject: [PATCH 4/5] anolis: futex: enable per process futex hash table for
 private futex

ANBZ: #21547

Current futex hash table doesn't allocate per numa node table.

Depending on what the futex hash key is, a futex hash table access
may hit a NUMA node with far memory.

We observed the bad futex performance because of cross numa
node hash table access on an arm64 based server.

To minimize the impact of cross NUMA node futex hash table access, a
futex hash table is allocated on each NUMA node. The first time a
process/thread accesses a futex, the id of the NUMA node on which it
is running is recorded as the futex hash table node index of the
process. All later futex accesses use the same NUMA node.

This can avoid cross node futex hash table access in:
  If user binds process to numa node with near memory access by
  using numactl/taskset/cpuset.

If there is no process binding, cross node futex hash table access
can still happen, but the change doesn't make this case worse.

There is one scenario in which things may get worse:
  First, the process is scheduled on a NUMA node with far memory
  and accesses a futex. Then the user calls sched_setaffinity to set
  the CPU affinity to a NUMA node with near memory.

  Handling this scenario needs complicated code changes. In real
  life, we can use numactl/taskset to bind the process to a NUMA
  node with near memory to work around the problem.

For performance testing:

Use an arm64 server (128 cores with 2 NUMA nodes) as the test
platform. Running "perf bench futex all" with and without the patch
gives the following results:

numactl -m 0 -N 0 perf bench futex all(64 threads on 64 cores of numa node 0):
   +----------------------+------------------+------------------+------+
   |                      | base             | w/ patch         | gain |
   +----------------------+------------------+------------------+------+
   | futex/hash           | 2446627(+/-0.10%)| 3295807(+/-0.03%)| +34% |
   +----------------------+------------------+------------------+------+
   | futex/wake           | 0.0535 (+/-2.60%)| 0.0539 (+/-5%)   | -0.7%|
   +----------------------+------------------+------------------+------+
   | futex/wake-parallel  | 0.0041 (+/-5%)   | 0.0039 (+/-20%)  | +4.9%|
   +----------------------+------------------+------------------+------+
   | futex/requeue        | 0.0274 (+/-4%)   | 0.0272 (+/-0.9%) | +0.7%|
   +----------------------+------------------+------------------+------+
   | futex/lock-pi        | 276    (+/-0%)   | 275    (+/-0.9%) | -0.4%|
   +----------------------+------------------+------------------+------+

perf bench futex all(128 threads on 128 cores as performance regression test):
   +----------------------+------------------+------------------+------+
   |                      | base             | w/ patch         | gain |
   +----------------------+------------------+------------------+------+
   | futex/hash           | 2394156(+/-0.10%)| 2516767(+/-0.04%)| +4.8%|
   +----------------------+------------------+------------------+------+
   | futex/wake           | 0.1716 (+/-2%)   | 0.1735 (+/-2%)   | -1.1%|
   +----------------------+------------------+------------------+------+
   | futex/wake-parallel  | 0.0073 (+/-38%)  | 0.0062 (+/-42%)  | +15% |
   +----------------------+------------------+------------------+------+
   | futex/requeue        | 0.0707 (+/-10%)  | 0.0693 (+/-3%)   | +2%  |
   +----------------------+------------------+------------------+------+
   | futex/lock-pi        | 276    (+/-0%)   | 275    (+/-0.9%) | -0.3%|
   +----------------------+------------------+------------------+------+

NOTE: The run-to-run deviation of futex/wake-parallel is larger than the
      difference between the results, so the futex/wake-parallel test
      result can be ignored.

In general, the performance gain is obvious. There is a very minor
performance drop in some cases, which can be ignored.

Also tried lmbench and Unixbench, no regressions.

Signed-off-by: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
---
 fs/exec.c             |  1 +
 include/linux/futex.h |  1 +
 include/linux/sched.h |  1 +
 kernel/futex/core.c   | 15 ++++++++++++++-
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/fs/exec.c b/fs/exec.c
index ee71a315cc51..53121a6edd10 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1484,6 +1484,7 @@ void finalize_exec(struct linux_binprm *bprm)
 	/* Store any stack rlimit changes before starting thread. */
 	task_lock(current->group_leader);
 	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
+	current->futex_nid = NUMA_NO_NODE;
 	task_unlock(current->group_leader);
 }
 EXPORT_SYMBOL(finalize_exec);
diff --git a/include/linux/futex.h b/include/linux/futex.h
index b70df27d7e85..5cdcfe4b4ab9 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -69,6 +69,7 @@ static inline void futex_init_task(struct task_struct *tsk)
 	tsk->pi_state_cache = NULL;
 	tsk->futex_state = FUTEX_STATE_OK;
 	mutex_init(&tsk->futex_exit_mutex);
+	tsk->futex_nid = NUMA_NO_NODE;
 }
 
 void futex_exit_recursive(struct task_struct *tsk);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 217a58817641..e1d86abf2fac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1328,6 +1328,7 @@ struct task_struct {
 	struct futex_pi_state		*pi_state_cache;
 	struct mutex			futex_exit_mutex;
 	unsigned int			futex_state;
+	unsigned int			futex_nid;
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	struct perf_event_context	*perf_event_ctxp;
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index c1cda5e97d78..a6f91bafff9f 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -115,10 +115,17 @@ late_initcall(fail_futex_debugfs);
  */
 struct futex_hash_bucket *futex_hash(union futex_key *key)
 {
+	int idx;
+
+	if (key->both.offset & (FUT_OFF_MMSHARED | FUT_OFF_INODE))
+		idx = MAX_NUMNODES;
+	else
+		idx = READ_ONCE(current->group_leader->futex_nid);
+
 	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
 			  key->both.offset);
 
-	return &futex_queues[0][hash & (futex_hashsize - 1)];
+	return &futex_queues[idx][hash & (futex_hashsize - 1)];
 }
 
 
@@ -241,6 +248,12 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
 	if (unlikely(should_fail_futex(fshared)))
 		return -EFAULT;
 
+	if (READ_ONCE(current->group_leader->futex_nid) == NUMA_NO_NODE) {
+		int id = numa_node_id();
+
+		cmpxchg(&current->group_leader->futex_nid, NUMA_NO_NODE, id);
+	}
+
 	/*
 	 * PROCESS_PRIVATE futexes are fast.
 	 * As the mm cannot disappear under us and the 'key' only needs
-- 
Gitee


From 63ee59ec02caf0b36b0e9e8c0fa93a588fe8cb32 Mon Sep 17 00:00:00 2001
From: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Date: Fri, 6 Jun 2025 17:19:53 +0800
Subject: [PATCH 5/5] anolis: futex: Allow disabling per numa node futex from
 kernel cmdline

ANBZ: #21547

Use:
	per_numa_node_futex=disable/enable
on the kernel command line to disable/enable the per NUMA node futex hash
tables. If disabled, the kernel keeps using the original unified futex
hash table.

Signed-off-by: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
---
 kernel/futex/core.c | 58 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 12 deletions(-)

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index a6f91bafff9f..e64914510c96 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -106,6 +106,31 @@ late_initcall(fail_futex_debugfs);
 
 #endif /* CONFIG_FAIL_FUTEX */
 
+static bool per_numa_node_futex = true;
+static int __init setup_per_numa_node_futex(char *str)
+{
+	int ret = 0;
+
+	if (!str)
+		goto out;
+
+	if (!strcmp(str, "enable")) {
+		per_numa_node_futex = true;
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		per_numa_node_futex = false;
+		ret = 1;
+	}
+
+out:
+	if (!ret)
+		pr_warn("Unable to parse per_numa_node_futex=\n");
+
+	return ret;
+}
+__setup("per_numa_node_futex=", setup_per_numa_node_futex);
+
+
 /**
  * futex_hash - Return the hash bucket in the global hash
  * @key:	Pointer to the futex key for which the hash is calculated
@@ -117,7 +142,8 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
 {
 	int idx;
 
-	if (key->both.offset & (FUT_OFF_MMSHARED | FUT_OFF_INODE))
+	if (!per_numa_node_futex ||
+			(key->both.offset & (FUT_OFF_MMSHARED | FUT_OFF_INODE)))
 		idx = MAX_NUMNODES;
 	else
 		idx = READ_ONCE(current->group_leader->futex_nid);
@@ -248,7 +274,8 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
 	if (unlikely(should_fail_futex(fshared)))
 		return -EFAULT;
 
-	if (READ_ONCE(current->group_leader->futex_nid) == NUMA_NO_NODE) {
+	if (per_numa_node_futex &&
+			READ_ONCE(current->group_leader->futex_nid) == NUMA_NO_NODE) {
 		int id = numa_node_id();
 
 		cmpxchg(&current->group_leader->futex_nid, NUMA_NO_NODE, id);
@@ -1172,20 +1199,27 @@ static int __init futex_init(void)
 #if CONFIG_BASE_SMALL
 	futex_hashsize = 16;
 #else
-	futex_hashsize = 256 * num_possible_cpus();
-	futex_hashsize /= num_possible_nodes();
-	/* 32 is larger than 16 and not that too much */
-	futex_hashsize = max(32, futex_hashsize);
-	futex_hashsize = roundup_pow_of_two(futex_hashsize);
+	if (per_numa_node_futex) {
+		futex_hashsize = 256 * num_possible_cpus();
+		futex_hashsize /= num_possible_nodes();
+		/* 32 is larger than 16 and not that too much */
+		futex_hashsize = max(32, futex_hashsize);
+		futex_hashsize = roundup_pow_of_two(futex_hashsize);
+	} else {
+		futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+	}
 #endif
 
-	for_each_node(nid)
-		futex_queues[nid] = alloc_futex_hash("private futex",
-						     nid, futex_hashsize);
+	if (per_numa_node_futex) {
+		for_each_node(nid)
+			futex_queues[nid] = alloc_futex_hash("private futex",
+							     nid, futex_hashsize);
+	}
 
 	/*
-	 * For shared futex, it could be accessed from different processes.
-	 * Can not use per-process index for futex hash. Use global hash table.
+	 * For shared futexes, or if per numa node futex is disabled, the
+	 * futex hash table can be accessed from different processes, so a
+	 * per-process index cannot be used. Use the global hash table instead.
 	 */
 	futex_queues[MAX_NUMNODES] = alloc_futex_hash("shared futex", NUMA_NO_NODE,
 						      futex_hashsize);
-- 
Gitee