diff --git a/9000-Support-initializing-HBW-nodes-from-memory_locality.patch b/9000-Support-initializing-HBW-nodes-from-memory_locality.patch
new file mode 100644
index 0000000000000000000000000000000000000000..4df39fcc69f222f2522e12e020f13d5fd7e44f7f
--- /dev/null
+++ b/9000-Support-initializing-HBW-nodes-from-memory_locality.patch
@@ -0,0 +1,340 @@
+From 448eb95b45b0cf6ecc7cf1a3e24056a2fdae85bd Mon Sep 17 00:00:00 2001
+From: Yicong Yang
+Date: Fri, 13 Oct 2023 15:21:11 +0800
+Subject: [PATCH] Support initializing HBW nodes from memory_locality
+
+In the current implementation we mainly infer the HBW nodes from the
+HMAT/SLIT, which may not describe all the cases. For example
+the HMAT/SLIT cannot describe the topology below:
+
+[       Node 0        ]
+[ CPU 0-3 ][ CPU 4-7 ]
+     |          |
+[  HBM 0  ][  HBM 1  ]
+[ Node 1  ][ Node 2  ]
+
+CPU 0-7 are in one NUMA node, but CPU 0-3 is closest to HBM 0 while
+CPU 4-7 is closest to HBM 1. Current HMAT/SLIT cannot support this
+case.
+
+In order to support this, openEuler has merged an HBM device driver
+to export the topology via sysfs[1]. The description of the above topology
+will be like:
+$ cat /sys/kernel/hbm_memory/memory_topo/memory_locality
+1 0-3
+1 4-7
+
+This patch cooperates with the HBM device driver to support initializing
+the HBW nodes from memory_locality for memkind. It will try to obtain
+the HBW nodes by parsing the memory_locality first; on failure, or if there
+is no memory_locality on the system, it will fall back to HMAT/SLIT. Users
+can disable this function by MEMKIND_DISABLE_MEMORY_LOCALITY=1 as well.
+ +[1] https://gitee.com/openeuler/kernel/pulls/451 +Signed-off-by: Yicong Yang +--- + include/memkind/internal/memkind_bitmask.h | 2 + + src/memkind_bitmask.c | 185 +++++++++++++++++++++ + src/memkind_hbw.c | 42 +++++ + 3 files changed, 229 insertions(+) + +diff --git a/include/memkind/internal/memkind_bitmask.h b/include/memkind/internal/memkind_bitmask.h +index 5c5b8434..6b0c3f64 100644 +--- a/include/memkind/internal/memkind_bitmask.h ++++ b/include/memkind/internal/memkind_bitmask.h +@@ -12,6 +12,8 @@ extern "C" { + + typedef int (*get_node_bitmask)(struct bitmask **); + ++int set_numanode_from_memory_locality(void **numanode, ++ memkind_node_variant_t node_variant); + int set_closest_numanode(get_node_bitmask get_bitmask, void **numanode, + memkind_node_variant_t node_variant); + int set_bitmask_for_current_numanode(unsigned long *nodemask, +diff --git a/src/memkind_bitmask.c b/src/memkind_bitmask.c +index 4f6d9f00..84300395 100644 +--- a/src/memkind_bitmask.c ++++ b/src/memkind_bitmask.c +@@ -1,9 +1,11 @@ + // SPDX-License-Identifier: BSD-2-Clause + /* Copyright (C) 2019 - 2021 Intel Corporation. */ + ++#include + #include + #include + #include ++#include + + #include + #include +@@ -12,6 +14,89 @@ + // Vector of CPUs with memory NUMA Node id(s) + VEC(vec_cpu_node, int); + ++void init_node_closet_cpu(cpu_set_t **cpunode_mask, int num_cpu, int num_nodes) ++{ ++ char *line = NULL; ++ size_t len = 0; ++ ssize_t n; ++ FILE *f; ++ ++ /* ++ * The content of /sys/kernel/hbm_memory/memory_topo/memory_locality should ++ * be like: ++ * 2 0-3 ++ * 3 4-7 ++ * 4 8-11 ++ * 5 12-15 ++ * 6 16-19 ++ * 7 20-23 ++ * 8 24-27 ++ * 9 28-31 ++ * ++ * The 1st column is the HBW node number and the 2nd column is the CPU list ++ * which is closet to the HBW node. 
++ */ ++ f = fopen("/sys/kernel/hbm_memory/memory_topo/memory_locality", "r"); ++ if (!f) ++ return; ++ ++ while ((n = getline(&line, &len, f)) != -1) { ++ long int node, begin_cpu, end_cpu; ++ char *begin, *end; ++ ++ /* Get the node number first */ ++ node = strtol(line, &end, 0); ++ ++ /* Either the node number is invalid or the whole line is invalid */ ++ if (line == end || node == LONG_MAX || node == LONG_MIN) ++ break; ++ ++ if (node >= num_nodes) { ++ log_err("Invalid node number provided by memory_locality."); ++ break; ++ } ++ ++ /* Try to find the beginning of the CPU list string */ ++ while (*end == ' ' && end != line + len) ++ end++; ++ ++ if (end == line + len || !isdigit(*end)) ++ break; ++ ++ begin = end; ++ do { ++ begin_cpu = strtol(begin, &end, 0); ++ if (begin == end || begin_cpu == LONG_MAX || begin_cpu == LONG_MIN) ++ break; ++ ++ /* End of the line */ ++ if (*end == '\0' || *end == '\n') { ++ CPU_SET_S(begin_cpu, CPU_ALLOC_SIZE(num_cpu), cpunode_mask[node]); ++ break; ++ } else if (*end == ',') { ++ CPU_SET_S(begin_cpu, CPU_ALLOC_SIZE(num_cpu), cpunode_mask[node]); ++ } else if (*end == '-' && isdigit(*(++end))) { ++ begin = end; ++ end_cpu = strtol(begin, &end, 0); ++ if (begin == end || end_cpu == LONG_MAX || end_cpu == LONG_MIN) ++ break; ++ ++ while (begin_cpu <= end_cpu) { ++ CPU_SET_S(begin_cpu, CPU_ALLOC_SIZE(num_cpu), cpunode_mask[node]); ++ ++begin_cpu; ++ } ++ } else { ++ break; ++ } ++ ++ begin = end + 1; ++ } while (begin < line + len); ++ } ++ ++ free(line); ++ fclose(f); ++} ++ + int memkind_env_get_nodemask(char *nodes_env, struct bitmask **bm) + { + *bm = numa_parse_nodestring(nodes_env); +@@ -22,6 +107,106 @@ int memkind_env_get_nodemask(char *nodes_env, struct bitmask **bm) + return MEMKIND_SUCCESS; + } + ++int set_numanode_from_memory_locality(void **numanode, ++ memkind_node_variant_t node_variant) ++{ ++ int num_cpu = numa_num_configured_cpus(); ++ int cpuset_size = CPU_ALLOC_SIZE(num_cpu); ++ int max_node_id = 
numa_max_node(); ++ cpu_set_t **cpunode_mask; ++ int init_node, cpu_id; ++ int status; ++ ++ cpunode_mask = calloc(max_node_id + 1, sizeof(*cpunode_mask)); ++ if (!cpunode_mask) { ++ status = MEMKIND_ERROR_MALLOC; ++ log_err("calloc() failed."); ++ goto out; ++ } ++ ++ for (init_node = 0; init_node <= max_node_id; init_node++) { ++ cpunode_mask[init_node] = CPU_ALLOC(num_cpu); ++ if (!cpunode_mask[init_node]) { ++ while (init_node >= 0) { ++ CPU_FREE(cpunode_mask[init_node]); ++ init_node--; ++ } ++ ++ status = MEMKIND_ERROR_MALLOC; ++ log_err("CPU_ALLOC_SIZE() failed."); ++ goto free_cpunode_mask; ++ } ++ ++ CPU_ZERO_S(cpuset_size, cpunode_mask[init_node]); ++ } ++ ++ init_node_closet_cpu(cpunode_mask, num_cpu, max_node_id + 1); ++ ++ struct vec_cpu_node *node_arr = ++ (struct vec_cpu_node *)calloc(num_cpu, sizeof(struct vec_cpu_node)); ++ if (!node_arr) { ++ status = MEMKIND_ERROR_MALLOC; ++ log_err("calloc() failed."); ++ goto free_cpunode_mask_array; ++ } ++ ++ /* Scan CPUs once. Assuming the CPU number are much more bigger than NUMA Nodes */ ++ for (cpu_id = 0; cpu_id < num_cpu; cpu_id++) { ++ for (init_node = 0; init_node <= max_node_id; init_node++) { ++ if (CPU_ISSET_S(cpu_id, cpuset_size, cpunode_mask[init_node])) { ++ VEC_PUSH_BACK(&node_arr[cpu_id], init_node); ++ ++ /* ++ * A cpu should always have one closet node, log error if ++ * violate this. 
++ */ ++ if (node_variant == NODE_VARIANT_SINGLE && ++ VEC_SIZE(&node_arr[cpu_id]) > 1) { ++ log_err("CPU%d has more than one closet node.", cpu_id); ++ status = MEMKIND_ERROR_RUNTIME; ++ for (cpu_id = 0; cpu_id < num_cpu; cpu_id++) { ++ if (VEC_CAPACITY(&node_arr[cpu_id])) ++ VEC_DELETE(&node_arr[cpu_id]); ++ } ++ ++ goto free_node_arr; ++ } ++ } ++ } ++ } ++ ++ /* Sanity Check each node_arr */ ++ for (cpu_id = 0; cpu_id < num_cpu; cpu_id++) { ++ if (VEC_SIZE(&node_arr[cpu_id]) == 0) { ++ log_err("CPU%d's nodemask is not initialized.", cpu_id); ++ status = MEMKIND_ERROR_RUNTIME; ++ for (cpu_id = 0; cpu_id < num_cpu; cpu_id++) { ++ if (VEC_CAPACITY(&node_arr[cpu_id])) ++ VEC_DELETE(&node_arr[cpu_id]); ++ } ++ ++ goto free_node_arr; ++ } ++ } ++ ++ *numanode = node_arr; ++ status = MEMKIND_SUCCESS; ++ goto free_cpunode_mask_array; ++ ++free_node_arr: ++ free(node_arr); ++ ++free_cpunode_mask_array: ++ for (init_node = 0; init_node <= max_node_id; init_node++) ++ CPU_FREE(cpunode_mask[init_node]); ++ ++free_cpunode_mask: ++ free(cpunode_mask); ++ ++out: ++ return status; ++} ++ + int set_closest_numanode(get_node_bitmask get_bitmask, void **numanode, + memkind_node_variant_t node_variant) + { +diff --git a/src/memkind_hbw.c b/src/memkind_hbw.c +index 077660ab..e9948593 100644 +--- a/src/memkind_hbw.c ++++ b/src/memkind_hbw.c +@@ -363,10 +363,36 @@ static bool is_hmat_supported(void) + return true; + } + ++/* ++ * OS may provide further information of HBW topology in ++ * /sys/kernel/hbm_memory/memory_topo/memory_locality. Use it unless user ++ * specified HBW nodes or disabled using of memory_locality. 
++ */ ++static bool use_memory_locality(void) ++{ ++ char *memory_locality_disable = memkind_get_env("MEMKIND_DISABLE_MEMORY_LOCALITY"); ++ ++ if (memory_locality_disable && !strncmp(memory_locality_disable, "1", 1)) ++ return false; ++ ++ if (memkind_get_env("MEMKIND_HBW_NODES")) ++ return false; ++ ++ return true; ++} ++ + static void memkind_hbw_closest_numanode_init(void) + { + struct hbw_numanode_t *g = &memkind_hbw_numanode_g[NODE_VARIANT_MULTIPLE]; + g->numanode = NULL; ++ ++ if (use_memory_locality()) { ++ g->init_err = set_numanode_from_memory_locality(&g->numanode, ++ NODE_VARIANT_MULTIPLE); ++ if (!g->init_err) ++ return; ++ } ++ + if (!is_hmat_supported()) { + g->init_err = set_closest_numanode(memkind_hbw_get_nodemask, + &g->numanode, NODE_VARIANT_MULTIPLE); +@@ -380,6 +406,14 @@ static void memkind_hbw_closest_preferred_numanode_init(void) + { + struct hbw_numanode_t *g = &memkind_hbw_numanode_g[NODE_VARIANT_SINGLE]; + g->numanode = NULL; ++ ++ if (use_memory_locality()) { ++ g->init_err = set_numanode_from_memory_locality(&g->numanode, ++ NODE_VARIANT_SINGLE); ++ if (!g->init_err) ++ return; ++ } ++ + if (!is_hmat_supported()) { + g->init_err = set_closest_numanode(memkind_hbw_get_nodemask, + &g->numanode, NODE_VARIANT_SINGLE); +@@ -393,6 +427,14 @@ static void memkind_hbw_all_numanode_init(void) + { + struct hbw_numanode_t *g = &memkind_hbw_numanode_g[NODE_VARIANT_ALL]; + g->numanode = NULL; ++ ++ if (use_memory_locality()) { ++ g->init_err = set_numanode_from_memory_locality(&g->numanode, ++ NODE_VARIANT_ALL); ++ if (!g->init_err) ++ return; ++ } ++ + if (!is_hmat_supported()) { + g->init_err = set_closest_numanode(memkind_hbw_get_nodemask, + &g->numanode, NODE_VARIANT_ALL); +-- +2.24.0 + diff --git a/memkind.spec b/memkind.spec index f00a6bd42a383aa9142732c2b500b4c558155569..f3749e266dc16833f94702662c3a248c1ce636d4 100644 --- a/memkind.spec +++ b/memkind.spec @@ -1,13 +1,14 @@ Name: memkind Summary: Extensible Heap Manager for User Version: 1.13.0 
-Release: 2 +Release: 3 License: BSD URL: http://memkind.github.io/memkind Source0: https://github.com/memkind/memkind/archive/v1.13.0/%{name}-%{version}.tar.gz Patch1000: 1000-add-loongarch-support-upstream.patch Patch1001: 1001-add-sw_64-support-not-upstream.patch +Patch9000: 9000-Support-initializing-HBW-nodes-from-memory_locality.patch BuildRequires: automake libtool numactl-devel systemd gcc gcc-c++ ExclusiveArch: x86_64 aarch64 loongarch64 sw_64 @@ -87,6 +88,9 @@ popd %{_mandir}/man7/* %changelog +* Wed Oct 18 2023 Yicong Yang - 1.13.0-3 +- Support initializing HBW nodes from memory_locality on openEuler + * Tue Aug 29 2023 herengui - 1.13.0-2 - add support for loongarch64 and sw_64