diff --git a/fs/exec.c b/fs/exec.c
index ee71a315cc51f51a066a552614fd10bdecd32188..53121a6edd10d50fde36d5f94d8d9b6ea0385b21 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1484,6 +1484,7 @@ void finalize_exec(struct linux_binprm *bprm)
 	/* Store any stack rlimit changes before starting thread. */
 	task_lock(current->group_leader);
 	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
+	current->futex_nid = NUMA_NO_NODE;
 	task_unlock(current->group_leader);
 }
 EXPORT_SYMBOL(finalize_exec);
diff --git a/include/linux/futex.h b/include/linux/futex.h
index b70df27d7e85c2c257c3467780d7b835b58d9fe1..5cdcfe4b4ab93ec04653601b38c0e18260e58daf 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -69,6 +69,7 @@ static inline void futex_init_task(struct task_struct *tsk)
 	tsk->pi_state_cache = NULL;
 	tsk->futex_state = FUTEX_STATE_OK;
 	mutex_init(&tsk->futex_exit_mutex);
+	tsk->futex_nid = NUMA_NO_NODE;
 }
 
 void futex_exit_recursive(struct task_struct *tsk);
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ed64240041e85775ab0f6e0a3a5055d106523657..004243557ddfb4812c373dc9752517d1561e7505 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -423,18 +423,27 @@ void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
 			     phys_addr_t min_addr, phys_addr_t max_addr,
 			     int nid);
 
-static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
+static __always_inline void *memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
 	return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
-				      MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
+				      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+}
+
+static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
+{
+	return memblock_alloc_nid(size, align, NUMA_NO_NODE);
+}
+
+static inline void *memblock_alloc_raw_nid(phys_addr_t size, phys_addr_t align, int nid)
+{
+	return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
+					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
 static inline void *memblock_alloc_raw(phys_addr_t size,
 				       phys_addr_t align)
 {
-	return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
-					  MEMBLOCK_ALLOC_ACCESSIBLE,
-					  NUMA_NO_NODE);
+	return memblock_alloc_raw_nid(size, align, NUMA_NO_NODE);
 }
 
 static inline void *memblock_alloc_from(phys_addr_t size,
@@ -573,7 +582,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
 	     region < (memblock.reserved.regions + memblock.reserved.cnt);	\
 	     region++)
 
-extern void *alloc_large_system_hash(const char *tablename,
+extern void *alloc_large_system_hash_nid(const char *tablename,
 				     unsigned long bucketsize,
 				     unsigned long numentries,
 				     int scale,
@@ -581,7 +590,32 @@ extern void *alloc_large_system_hash(const char *tablename,
 				     unsigned int *_hash_shift,
 				     unsigned int *_hash_mask,
 				     unsigned long low_limit,
-				     unsigned long high_limit);
+				     unsigned long high_limit,
+				     int nid);
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ *   quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ */
+static __always_inline void *alloc_large_system_hash(const char *tablename,
+						     unsigned long bucketsize,
+						     unsigned long numentries,
+						     int scale,
+						     int flags,
+						     unsigned int *_hash_shift,
+						     unsigned int *_hash_mask,
+						     unsigned long low_limit,
+						     unsigned long high_limit)
+{
+	return alloc_large_system_hash_nid(tablename, bucketsize,
+					   numentries, scale, flags,
+					   _hash_shift, _hash_mask,
+					   low_limit, high_limit,
+					   NUMA_NO_NODE);
+}
+
 
 #define HASH_EARLY	0x00000001	/* Allocating during early boot? */
 #define HASH_ZERO	0x00000002	/* Zero allocated hash table */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 217a58817641fec6215086a68f3387fdebe2ef30..e1d86abf2facc9d42c2f3931c32b0a1feea21027 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1328,6 +1328,7 @@ struct task_struct {
 	struct futex_pi_state		*pi_state_cache;
 	struct mutex			futex_exit_mutex;
 	unsigned int			futex_state;
+	unsigned int			futex_nid;
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	struct perf_event_context	*perf_event_ctxp;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8ddde9cc947c685e64923139c66c3f9..f1afe4d353913a596a7405ea4ba4f6c38384f6cd 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -152,6 +152,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_huge_node(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index f30a93e50f65e8c80a95b398dc5c2b563fab14fc..e64914510c9609bfca293a845d23f455b3bae219 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -47,7 +47,8 @@
  * reside in the same cacheline.
  */
 static struct {
-	struct futex_hash_bucket *queues;
+	/* the last slot holds the shared (global) futex hash table */
+	struct futex_hash_bucket *queues[MAX_NUMNODES + 1];
 	unsigned long            hashsize;
 } __futex_data __read_mostly __aligned(2*sizeof(long));
 #define futex_queues	(__futex_data.queues)
@@ -105,6 +106,31 @@ late_initcall(fail_futex_debugfs);
 
 #endif /* CONFIG_FAIL_FUTEX */
 
+static bool per_numa_node_futex = true;
+static int __init setup_per_numa_node_futex(char *str)
+{
+	int ret = 0;
+
+	if (!str)
+		goto out;
+
+	if (!strcmp(str, "enable")) {
+		per_numa_node_futex = true;
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		per_numa_node_futex = false;
+		ret = 1;
+	}
+
+out:
+	if (!ret)
+		pr_warn("Unable to parse per_numa_node_futex=\n");
+
+	return ret;
+}
+__setup("per_numa_node_futex=", setup_per_numa_node_futex);
+
+
 /**
  * futex_hash - Return the hash bucket in the global hash
  * @key:	Pointer to the futex key for which the hash is calculated
@@ -114,10 +140,18 @@ late_initcall(fail_futex_debugfs);
  */
 struct futex_hash_bucket *futex_hash(union futex_key *key)
 {
+	int idx;
+
 	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
 			  key->both.offset);
 
-	return &futex_queues[hash & (futex_hashsize - 1)];
+	if (!per_numa_node_futex ||
+	    (key->both.offset & (FUT_OFF_MMSHARED | FUT_OFF_INODE)))
+		idx = MAX_NUMNODES;
+	else
+		idx = READ_ONCE(current->group_leader->futex_nid);
+
+	return &futex_queues[idx][hash & (futex_hashsize - 1)];
 }
 
 
@@ -240,6 +274,13 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
 	if (unlikely(should_fail_futex(fshared)))
 		return -EFAULT;
 
+	if (per_numa_node_futex &&
+	    READ_ONCE(current->group_leader->futex_nid) == NUMA_NO_NODE) {
+		int id = numa_node_id();
+
+		cmpxchg(&current->group_leader->futex_nid, NUMA_NO_NODE, id);
+	}
+
 	/*
 	 * PROCESS_PRIVATE futexes are fast.
 	 * As the mm cannot disappear under us and the 'key' only needs
@@ -1130,29 +1171,59 @@ void futex_exit_release(struct task_struct *tsk)
 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }
 
+static struct futex_hash_bucket * __init
+alloc_futex_hash(const char *tablename, int nid, int hash_size)
+{
+	struct futex_hash_bucket *fhb;
+	unsigned int shift;
+
+	fhb = alloc_large_system_hash_nid(tablename,
+					  sizeof(struct futex_hash_bucket),
+					  hash_size, 0, 0, &shift, NULL,
+					  hash_size, hash_size, nid);
+
+	hash_size = 1UL << shift;
+	for (int i = 0; i < hash_size; i++) {
+		atomic_set(&fhb[i].waiters, 0);
+		plist_head_init(&fhb[i].chain);
+		spin_lock_init(&fhb[i].lock);
+	}
+
+	return fhb;
+}
+
 static int __init futex_init(void)
 {
-	unsigned int futex_shift;
-	unsigned long i;
+	unsigned int nid;
 
 #if CONFIG_BASE_SMALL
 	futex_hashsize = 16;
 #else
-	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+	if (per_numa_node_futex) {
+		futex_hashsize = 256 * num_possible_cpus();
+		futex_hashsize /= num_possible_nodes();
+		/* keep at least 32 buckets per node; larger than 16 but still modest */
+		futex_hashsize = max(32UL, futex_hashsize);
+		futex_hashsize = roundup_pow_of_two(futex_hashsize);
+	} else {
+		futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+	}
 #endif
 
-	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
-					       futex_hashsize, 0, 0,
-					       &futex_shift, NULL,
-					       futex_hashsize, futex_hashsize);
-	futex_hashsize = 1UL << futex_shift;
-
-	for (i = 0; i < futex_hashsize; i++) {
-		atomic_set(&futex_queues[i].waiters, 0);
-		plist_head_init(&futex_queues[i].chain);
-		spin_lock_init(&futex_queues[i].lock);
+	if (per_numa_node_futex) {
+		for_each_node(nid)
+			futex_queues[nid] = alloc_futex_hash("private futex",
+							     nid, futex_hashsize);
 	}
 
+	/*
+	 * Shared futexes, and all futexes when per-NUMA-node hashing is
+	 * disabled, can be accessed from different processes, so a
+	 * per-process node index cannot be used. Use a global hash table.
+	 */
+	futex_queues[MAX_NUMNODES] = alloc_futex_hash("shared futex", NUMA_NO_NODE,
+						      futex_hashsize);
+
 	return 0;
 }
 core_initcall(futex_init);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 77fd04c83d046db29912b7f3afed3339a067318d..955d9d7d8cade08527a1ff5536a919ace11af89e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2449,12 +2449,12 @@ static unsigned long __init arch_reserved_kernel_pages(void)
 #endif
 
 /*
- * allocate a large system hash table from bootmem
+ * allocate a large system hash table from bootmem on a specific NUMA node
  * - it is assumed that the hash table must contain an exact power-of-2
  *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 */
-void *__init alloc_large_system_hash(const char *tablename,
+void *__init alloc_large_system_hash_nid(const char *tablename,
				     unsigned long bucketsize,
				     unsigned long numentries,
				     int scale,
@@ -2462,7 +2462,8 @@ void *__init alloc_large_system_hash(const char *tablename,
				     unsigned int *_hash_shift,
				     unsigned int *_hash_mask,
				     unsigned long low_limit,
-				     unsigned long high_limit)
+				     unsigned long high_limit,
+				     int nid)
 {
 	unsigned long long max = high_limit;
 	unsigned long log2qty, size;
@@ -2522,12 +2523,11 @@ void *__init alloc_large_system_hash(const char *tablename,
 		size = bucketsize << log2qty;
 		if (flags & HASH_EARLY) {
 			if (flags & HASH_ZERO)
-				table = memblock_alloc(size, SMP_CACHE_BYTES);
+				table = memblock_alloc_nid(size, SMP_CACHE_BYTES, nid);
 			else
-				table = memblock_alloc_raw(size,
-							   SMP_CACHE_BYTES);
+				table = memblock_alloc_raw_nid(size, SMP_CACHE_BYTES, nid);
 		} else if (get_order(size) > MAX_ORDER || hashdist) {
-			table = vmalloc_huge(size, gfp_flags);
+			table = vmalloc_huge_node(size, gfp_flags, nid);
 			virt = true;
 			if (table)
 				huge = is_vm_area_hugepages(table);
@@ -2537,7 +2537,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 			 * some pages at the end of hash table which
 			 * alloc_pages_exact() automatically does
 			 */
-			table = alloc_pages_exact(size, gfp_flags);
+			table = alloc_pages_exact_nid(nid, size, gfp_flags);
 			kmemleak_alloc(table, size, 1, gfp_flags);
 		}
 	} while (!table && size > PAGE_SIZE && --log2qty);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fb947787f25da2af38ea353257f833fa1227dd93..da8de24d121e7db8767f049bb06152ece3d83ab8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3451,6 +3451,14 @@ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL_GPL(vmalloc_huge);
 
+void *vmalloc_huge_node(unsigned long size, gfp_t gfp_mask, int node)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+				    gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+				    node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(vmalloc_huge_node);
+
 /**
  * vzalloc - allocate virtually contiguous memory with zero fill
  * @size:    allocation size
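
Not part of the patch: the userspace sketch below (file name, helper, and thread structure are illustrative assumptions, not taken from this series) shows the kind of PROCESS_PRIVATE futex wait/wake pair that, with per_numa_node_futex left enabled, get_futex_key() keys on the mm and futex_hash() therefore places in the per-NUMA-node table selected by the caller's futex_nid. A shared (MAP_SHARED or inode-backed) futex would instead land in the global table at futex_queues[MAX_NUMNODES].

/* futex_private_demo.c -- illustrative sketch only, not part of the patch.
 * Build: gcc -pthread -o futex_private_demo futex_private_demo.c
 */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <pthread.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_uint futex_word;	/* 0 = not released yet, 1 = released */

/* Thin wrapper: glibc provides no futex() symbol, so use syscall(2). */
static long futex(unsigned int *uaddr, int op, unsigned int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	/* FUTEX_WAIT_PRIVATE uses a process-private key, so this sleeper is
	 * queued on a bucket of the calling process's per-node hash table. */
	while (atomic_load(&futex_word) == 0)
		futex((unsigned int *)&futex_word, FUTEX_WAIT_PRIVATE, 0);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);
	atomic_store(&futex_word, 1);
	/* Wake at most one waiter hashed to the same (per-node) bucket. */
	futex((unsigned int *)&futex_word, FUTEX_WAKE_PRIVATE, 1);
	pthread_join(t, NULL);
	return 0;
}

Booting with per_numa_node_futex=disable makes futex_hash() route every key, private or shared, to the single global table, whose size then matches the pre-patch roundup_pow_of_two(256 * num_possible_cpus()).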