From b6e902b3d72febc2ddfeaa82c5b0ed3ff4f8340b Mon Sep 17 00:00:00 2001 From: Junhao He Date: Sat, 31 Aug 2024 18:42:46 +0800 Subject: [PATCH] Add HBM Memory ACLS support for HiSilicon When a hardware error occurs in a cell of the HBM memory, the internal SRAM of the memory controller is used to replace the faulty memory; this method is called ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS RAS, and the rasdaemon records it and runs the ACLS to replace the faulty memory. HBM ACLS can repair one cell (258-bit) memory at a time. The HBM can check which HBM cell the physical address belongs to and filter invalid HBM addresses. Multiple RAS errors are reported if memory errors occur in different HBM cells. The feature depends on the linux kernel CONFIG_HISI_MEM_RAS and CONFIG_PAGE_EJECT. Signed-off-by: Junhao He --- ...BM-Memory-ACLS-support-for-HiSilicon.patch | 298 ++++++++++++++++++ rasdaemon.spec | 9 +- 2 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch diff --git a/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch new file mode 100644 index 0000000..b66d92a --- /dev/null +++ b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch @@ -0,0 +1,298 @@ +From 6e5f83712ee2e7af1272b8064dd965d423b97ce2 Mon Sep 17 00:00:00 2001 +From: Junhao He +Date: Sat, 31 Aug 2024 17:52:02 +0800 +Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon + +When a hardware error occurs in a cell of the HBM memory, the internal +SRAM of the memory controller is used to replace the faulty memory; this +method is called ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS +RAS, and the rasdaemon records it and runs the ACLS to replace the faulty +memory. + +HBM ACLS can repair one cell (258-bit) memory at a time. 
The HBM can +check which HBM cell the physical address belongs to and filter invalid +HBM addresses. Multiple RAS errors are reported if memory errors occur +in different HBM cells. + +The feature depends on the linux kernel CONFIG_HISI_MEM_RAS and +CONFIG_PAGE_EJECT. + +Signed-off-by: Junhao He +--- + configure.ac | 11 +++ + misc/rasdaemon.env | 7 +- + non-standard-hisilicon.c | 196 +++++++++++++++++++++++++++++++++++++++ + 3 files changed, 213 insertions(+), 1 deletion(-) + +diff --git a/configure.ac b/configure.ac +index d098fcf..30c90d2 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -171,6 +171,16 @@ AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "x + AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) + ++AC_ARG_ENABLE([hisi_hbm_memory_acls], ++ AS_HELP_STRING([--enable-hisi-hbm-memory-acls], [enable HiSilicon HBM Memory ACLS])) ++ ++AS_IF([test "x$enable_hisi_hbm_memory_acls" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_HISI_HBM_MEMORY_ACLS,1,"have HiSilicon HBM Memory ACLS") ++ AC_SUBST([WITH_HISI_HBM_MEMORY_ACLS]) ++]) ++AM_CONDITIONAL([WITH_HISI_HBM_MEMORY_ACLS], [test x$enable_hisi_hbm_memory_acls = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_HISI_HBM_MEMORY_ACLS], [USE_HISI_HBM_MEMORY_ACLS="yes"], [USE_HISI_HBM_MEMORY_ACLS="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -212,4 +222,5 @@ compile time options summary + Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE + CPU fault isolation : $USE_CPU_FAULT_ISOLATION ++ HISI HBM Memory ACLS: $USE_HISI_HBM_MEMORY_ACLS + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index ca12a1a..516c4ac 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ 
-46,4 +46,9 @@ CPU_ISOLATION_CYCLE="24h" + CPU_ISOLATION_LIMIT="10" + + # Disable specified events by config +-DISABLE="block:block_rq_complete" +\ No newline at end of file ++DISABLE="block:block_rq_complete" ++ ++# Support the HBM Memory ACLS (Adaptive Cache Line Sparing) on HiSilicon platform (yes|no). ++HISI_HBM_MEMORY_ACLS="no" ++# Let rasdaemon isolate the error page that fails to be repaired by HiSilicon HBM ACLS (yes|no). ++HISI_HBM_ISOLATION_PAGE="no" +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 7296d28..2b176cd 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -19,6 +19,48 @@ + #define HISI_BUF_LEN 2048 + #define HISI_PCIE_INFO_BUF_LEN 256 + ++#ifdef HAVE_HISI_HBM_MEMORY_ACLS ++#include ++#include ++#include ++#include ++#include ++ ++#define HISI_HBM_MEM_RAS_NAME "HISI0521" ++#define HISI_HBM_UNKNOWN 0 ++#define HISI_HBM_HBM_MEMORY 1 ++#define HISI_HBM_DDR_MEMORY 2 ++ ++#define HISI_TYPE_UINT32_WIDTH 32 ++/* Specify the HiSilicon HBMC HBM repair request type */ ++#define HISI_HBM_REPAIR_REQ_TYPE 0 ++#define HISI_HBM_CE_ACLS BIT(0) ++#define HISI_HBM_ACLS_ADDL 1 ++#define HISI_HBM_ACLS_ADDH 2 ++#define HISI_HBM_ACLS_ARRAY_SIZE 12 ++#define HISI_HBMC_SUBMOD_HBM_REPAIR 6 ++ ++static bool hisi_hbm_acls_en; ++static bool hisi_hbm_isolation_page_en; ++ ++static void hisi_hbm_param_init(void) ++{ ++ char *env; ++ ++ env = getenv("HISI_HBM_MEMORY_ACLS"); ++ if (env && strcasecmp(env, "yes") == 0) { ++ log(TERM, LOG_INFO, "HiSilicon HBM Memory ACLS is enabled\n"); ++ hisi_hbm_acls_en = true; ++ } ++ ++ env = getenv("HISI_HBM_ISOLATION_PAGE"); ++ if (env && strcasecmp(env, "yes") == 0) { ++ log(TERM, LOG_INFO, "HiSilicon HBM ACLS page isolation is enabled\n"); ++ hisi_hbm_isolation_page_en = true; ++ } ++} ++#endif ++ + struct hisi_common_error_section { + uint32_t val_bits; + uint8_t version; +@@ -358,6 +400,151 @@ static int add_hisi_common_table(struct ras_events *ras, + return 0; + } + ++#ifdef 
HAVE_HISI_HBM_MEMORY_ACLS ++static int write_file(char *path, const char *name, unsigned long long value) ++{ ++ char fname[MAX_PATH]; ++ char buf[20]; ++ int ret; ++ int fd; ++ ++ snprintf(fname, MAX_PATH, "%s/%s", path, name); ++ ++ fd = open(fname, O_WRONLY); ++ if (fd < 0) { ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n", ++ fname, strerror(errno)); ++ return -errno; ++ } ++ ++ snprintf(buf, sizeof(buf), "0x%llx\n", value); ++ ret = write(fd, buf, strlen(buf)); ++ if (ret <= 0) ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to set %s (0x%llx): %s\n", ++ fname, value, strerror(errno)); ++ ++ close(fd); ++ return ret > 0 ? 0 : -errno; ++} ++ ++static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err, char *path) ++{ ++ unsigned long long paddr; ++ int ret; ++ ++ paddr = err->reg_array[HISI_HBM_ACLS_ADDH]; ++ paddr <<= HISI_TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[HISI_HBM_ACLS_ADDL]; ++ ++ ret = write_file(path, "acls_query", paddr); ++ if (ret < 0) ++ return ret; ++ ++ ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); ++ if (ret < 0) ++ return ret; ++ ++ ret = write_file(path, "acls_repair", paddr); ++ if (ret < 0 && hisi_hbm_isolation_page_en) { ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Keep page offline\n"); ++ /* not much we can do about errors here */ ++ (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); ++ return ret; ++ } ++ ++ ret = write_file("/sys/kernel/page_eject", "online_page", paddr); ++ return ret < 0 ? 
ret : 0; ++} ++ ++static int hisi_hbmc_get_memory_type(char *path) ++{ ++ int type = HISI_HBM_UNKNOWN; ++ char fname[MAX_PATH]; ++ char buf[128]; ++ FILE *file; ++ ++ snprintf(fname, MAX_PATH, "%s/%s", path, "memory_type"); ++ file = fopen(fname, "r"); ++ if (!file) { ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n", ++ fname, strerror(errno)); ++ return -errno; ++ } ++ ++ if (!fgets(buf, sizeof(buf), file)) { ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to read %s\n", fname); ++ goto err; ++ } ++ ++ /* Remove the last '\n' */ ++ buf[strlen(buf) - 1] = 0; ++ ++ if (strcmp(buf, "HBM") == 0) ++ type = HISI_HBM_HBM_MEMORY; ++ else if (strcmp(buf, "DDR") == 0) ++ type = HISI_HBM_DDR_MEMORY; ++ ++err: ++ fclose(file); ++ return type; ++} ++ ++static void hisi_hbm_acls_handler(const struct hisi_common_error_section *err) ++{ ++ char *sys_dev_path = "/sys/devices/platform"; ++ char path[MAX_PATH]; ++ struct dirent *dent; ++ DIR *dir; ++ int ret; ++ ++ dir = opendir(sys_dev_path); ++ if (!dir) { ++ log(TERM, LOG_WARNING, "HiSilicon Memory RAS: can't read '%s': %s\n", ++ sys_dev_path, strerror(errno)); ++ return; ++ } ++ ++ while ((dent = readdir(dir))) { ++ if (!strstr(dent->d_name, HISI_HBM_MEM_RAS_NAME)) ++ continue; ++ ++ snprintf(path, MAX_PATH, "%s/%s", sys_dev_path, dent->d_name); ++ ++ if (hisi_hbmc_get_memory_type(path) == HISI_HBM_HBM_MEMORY && ++ err->reg_array[HISI_HBM_REPAIR_REQ_TYPE] & HISI_HBM_CE_ACLS) { ++ /* ++ * ENXIO means the memory @paddr does not belong to ++ * the HBMC, try the next one. 
++ */ ++ ret = hisi_hbmc_hbm_acls(err, path); ++ if (ret != -ENXIO) ++ break; ++ } ++ } ++ ++ closedir(dir); ++} ++ ++static bool hisi_hbm_valid_acls_ras(const struct hisi_common_error_section *err) ++{ ++ if (err->module_id >= sizeof(module_name)/sizeof(char *)) ++ return false; ++ ++ if (strcmp(module_name[err->module_id], "HBMC") != 0 || ++ err->submodule_id != HISI_HBMC_SUBMOD_HBM_REPAIR) ++ return false; ++ ++ if (!(err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE)) || ++ err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) { ++ log(TERM, LOG_WARNING, "HiSilicon Memory RAS: No valid address array length (%u)\n", ++ err->reg_array_size); ++ return false; ++ } ++ ++ return true; ++} ++#endif ++ + static int decode_hisi_common_section(struct ras_events *ras, + struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, +@@ -393,6 +580,11 @@ static int decode_hisi_common_section(struct ras_events *ras, + step_vendor_data_tab(ev_decoder, "hisi_common_section_tab"); + } + ++#ifdef HAVE_HISI_HBM_MEMORY_ACLS ++ if (hisi_hbm_acls_en && hisi_hbm_valid_acls_ras(err)) ++ hisi_hbm_acls_handler(err); ++#endif ++ + return 0; + } + +@@ -410,4 +602,8 @@ static void __attribute__((constructor)) hisi_ns_init(void) + + for (i = 0; i < ARRAY_SIZE(hisi_section_ns_ev_decoder); i++) + register_ns_ev_decoder(&hisi_section_ns_ev_decoder[i]); ++ ++#ifdef HAVE_HISI_HBM_MEMORY_ACLS ++ hisi_hbm_param_init(); ++#endif + } +-- +2.33.0 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 4d4cf28..ae2be5e 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.7 -Release: 21 +Release: 22 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -60,6 +60,7 @@ Patch9008: 0001-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch Patch9009: add-dynamic-switch-of-ras-events-support-and-disable-block-rq-complete.patch Patch9010: 
fix-rasdaemon-print-loading-config-logs-multiple-times.patch Patch9011: 0001-rasdaemon-Fix-for-vendor-errors-are-not-recorded-in-.patch +Patch9012: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch %description The rasdaemon program is a daemon which monitors the platform @@ -113,6 +114,12 @@ if [ $1 -eq 0 ] ; then fi %changelog +* Sat Aug 31 2024 Junhao He - 0.6.7-22 +- Type:feature +- ID:NA +- SUG:NA +- DESC:Add HBM Memory ACLS support for HiSilicon + * Thu Apr 25 2024 yangjunshuo - 0.6.7-21 - Type:bugfix - ID:NA -- Gitee