diff --git a/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch new file mode 100644 index 0000000000000000000000000000000000000000..b66d92a458c78aa8712de15777b71233c0c7eb85 --- /dev/null +++ b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch @@ -0,0 +1,298 @@ +From 6e5f83712ee2e7af1272b8064dd965d423b97ce2 Mon Sep 17 00:00:00 2001 +From: Junhao He +Date: Sat, 31 Aug 2024 17:52:02 +0800 +Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon + +When a hardware error occurs in a cell of the HBM memory, the internal +SRAM of the memory controller is used to replace the faulty memory; this +method is called ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS +RAS, and rasdaemon records it and runs ACLS to replace the faulty +memory. + +HBM ACLS can repair one memory cell (258-bit) at a time. The HBM can +check which HBM cell the physical address belongs to and filter invalid +HBM addresses. Multiple RAS errors are reported if memory errors occur +in different HBM cells. + +The feature depends on the Linux kernel options CONFIG_HISI_MEM_RAS and +CONFIG_PAGE_EJECT.
+ +Signed-off-by: Junhao He +--- + configure.ac | 11 +++ + misc/rasdaemon.env | 7 +- + non-standard-hisilicon.c | 196 +++++++++++++++++++++++++++++++++++++++ + 3 files changed, 213 insertions(+), 1 deletion(-) + +diff --git a/configure.ac b/configure.ac +index d098fcf..30c90d2 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -171,6 +171,16 @@ AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "x + AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) + ++AC_ARG_ENABLE([hisi_hbm_memory_acls], ++ AS_HELP_STRING([--enable-hisi-hbm-memory-acls], [enable HiSilicon HBM Memory ACLS])) ++ ++AS_IF([test "x$enable_hisi_hbm_memory_acls" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_HISI_HBM_MEMORY_ACLS,1,"have HiSilicon HBM Memory ACLS") ++ AC_SUBST([WITH_HISI_HBM_MEMORY_ACLS]) ++]) ++AM_CONDITIONAL([WITH_HISI_HBM_MEMORY_ACLS], [test x$enable_hisi_hbm_memory_acls = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_HISI_HBM_MEMORY_ACLS], [USE_HISI_HBM_MEMORY_ACLS="yes"], [USE_HISI_HBM_MEMORY_ACLS="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -212,4 +222,5 @@ compile time options summary + Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE + CPU fault isolation : $USE_CPU_FAULT_ISOLATION ++ HISI HBM Memory ACLS: $USE_HISI_HBM_MEMORY_ACLS + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index ca12a1a..516c4ac 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -46,4 +46,9 @@ CPU_ISOLATION_CYCLE="24h" + CPU_ISOLATION_LIMIT="10" + + # Disable specified events by config +-DISABLE="block:block_rq_complete" +\ No newline at end of file ++DISABLE="block:block_rq_complete" ++ ++# Support the HBM Memory ACLS (Adaptive Cache Line 
Sparing) on HiSilicon platform (yes|no). ++HISI_HBM_MEMORY_ACLS="no" ++# Tell rasdaemon to isolate the error page when HiSilicon HBM ACLS fails to repair it (yes|no). ++HISI_HBM_ISOLATION_PAGE="no" +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 7296d28..2b176cd 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -19,6 +19,48 @@ + #define HISI_BUF_LEN 2048 + #define HISI_PCIE_INFO_BUF_LEN 256 + ++#ifdef HAVE_HISI_HBM_MEMORY_ACLS ++#include ++#include ++#include ++#include ++#include ++ ++#define HISI_HBM_MEM_RAS_NAME "HISI0521" ++#define HISI_HBM_UNKNOWN 0 ++#define HISI_HBM_HBM_MEMORY 1 ++#define HISI_HBM_DDR_MEMORY 2 ++ ++#define HISI_TYPE_UINT32_WIDTH 32 ++/* Specify the HiSilicon HBMC HBM repair request type */ ++#define HISI_HBM_REPAIR_REQ_TYPE 0 ++#define HISI_HBM_CE_ACLS BIT(0) ++#define HISI_HBM_ACLS_ADDL 1 ++#define HISI_HBM_ACLS_ADDH 2 ++#define HISI_HBM_ACLS_ARRAY_SIZE 12 ++#define HISI_HBMC_SUBMOD_HBM_REPAIR 6 ++ ++static bool hisi_hbm_acls_en; ++static bool hisi_hbm_isolation_page_en; ++ ++static void hisi_hbm_param_init(void) ++{ ++ char *env; ++ ++ env = getenv("HISI_HBM_MEMORY_ACLS"); ++ if (env && strcasecmp(env, "yes") == 0) { ++ log(TERM, LOG_INFO, "HiSilicon HBM Memory ACLS is enabled\n"); ++ hisi_hbm_acls_en = true; ++ } ++ ++ env = getenv("HISI_HBM_ISOLATION_PAGE"); ++ if (env && strcasecmp(env, "yes") == 0) { ++ log(TERM, LOG_INFO, "HiSilicon HBM ACLS page isolation is enabled\n"); ++ hisi_hbm_isolation_page_en = true; ++ } ++} ++#endif ++ + struct hisi_common_error_section { + uint32_t val_bits; + uint8_t version; +@@ -358,6 +400,151 @@ static int add_hisi_common_table(struct ras_events *ras, + return 0; + } + ++#ifdef HAVE_HISI_HBM_MEMORY_ACLS ++static int write_file(char *path, const char *name, unsigned long long value) ++{ ++ char fname[MAX_PATH]; ++ char buf[20]; ++ int ret; ++ int fd; ++ ++ snprintf(fname, MAX_PATH, "%s/%s", path, name); ++ ++ fd = open(fname, O_WRONLY); ++ if
(fd < 0) { ++ ret = -errno; /* save now: log() may overwrite errno */ ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n", fname, strerror(-ret)); ++ return ret; ++ } ++ ++ snprintf(buf, sizeof(buf), "0x%llx\n", value); ++ ret = write(fd, buf, strlen(buf)) > 0 ? 0 : -errno; /* capture errno before log()/close() can change it */ ++ if (ret < 0) ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to set %s (0x%llx): %s\n", ++ fname, value, strerror(-ret)); ++ ++ close(fd); ++ return ret; ++} ++ ++static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err, char *path) ++{ ++ unsigned long long paddr; ++ int ret; ++ ++ paddr = err->reg_array[HISI_HBM_ACLS_ADDH]; ++ paddr <<= HISI_TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[HISI_HBM_ACLS_ADDL]; ++ ++ ret = write_file(path, "acls_query", paddr); ++ if (ret < 0) ++ return ret; ++ ++ ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); ++ if (ret < 0) ++ return ret; ++ ++ ret = write_file(path, "acls_repair", paddr); ++ if (ret < 0 && hisi_hbm_isolation_page_en) { ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Keep page offline\n"); ++ /* not much we can do about errors here */ ++ (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); ++ return ret; ++ } ++ ++ ret = write_file("/sys/kernel/page_eject", "online_page", paddr); ++ return ret < 0 ?
ret : 0; ++} ++ ++static int hisi_hbmc_get_memory_type(char *path) ++{ ++ int type = HISI_HBM_UNKNOWN; ++ char fname[MAX_PATH]; ++ char buf[128]; ++ FILE *file; ++ ++ snprintf(fname, MAX_PATH, "%s/%s", path, "memory_type"); ++ file = fopen(fname, "r"); ++ if (!file) { ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Cannot to open '%s': %s\n", ++ fname, strerror(errno)); ++ return -errno; ++ } ++ ++ if (!fgets(buf, sizeof(buf), file)) { ++ log(TERM, LOG_WARNING, "HiSilicon HBM ACLS: Failed to read %s\n", fname); ++ goto err; ++ } ++ ++ /* Cut at the first '\n' (if any) so text without a newline stays intact */ ++ buf[strcspn(buf, "\n")] = 0; ++ ++ if (strcmp(buf, "HBM") == 0) ++ type = HISI_HBM_HBM_MEMORY; ++ else if (strcmp(buf, "DDR") == 0) ++ type = HISI_HBM_DDR_MEMORY; ++ ++err: ++ fclose(file); ++ return type; ++} ++ ++static void hisi_hbm_acls_handler(const struct hisi_common_error_section *err) ++{ ++ char *sys_dev_path = "/sys/devices/platform"; ++ char path[MAX_PATH]; ++ struct dirent *dent; ++ DIR *dir; ++ int ret; ++ ++ dir = opendir(sys_dev_path); ++ if (!dir) { ++ log(TERM, LOG_WARNING, "HiSilicon Memory RAS: can't read '%s': %s\n", ++ sys_dev_path, strerror(errno)); ++ return; ++ } ++ ++ while ((dent = readdir(dir))) { ++ if (!strstr(dent->d_name, HISI_HBM_MEM_RAS_NAME)) ++ continue; ++ ++ snprintf(path, MAX_PATH, "%s/%s", sys_dev_path, dent->d_name); ++ ++ if (hisi_hbmc_get_memory_type(path) == HISI_HBM_HBM_MEMORY && ++ err->reg_array[HISI_HBM_REPAIR_REQ_TYPE] & HISI_HBM_CE_ACLS) { ++ /* ++ * ENXIO means the memory @paddr does not belong to ++ * the HBMC, try the next one.
++ */ ++ ret = hisi_hbmc_hbm_acls(err, path); ++ if (ret != -ENXIO) ++ break; ++ } ++ } ++ ++ closedir(dir); ++} ++ ++static bool hisi_hbm_valid_acls_ras(const struct hisi_common_error_section *err) ++{ ++ if (err->module_id >= sizeof(module_name)/sizeof(char *)) ++ return false; ++ ++ if (strcmp(module_name[err->module_id], "HBMC") != 0 || ++ err->submodule_id != HISI_HBMC_SUBMOD_HBM_REPAIR) ++ return false; ++ ++ if (!(err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE)) || ++ err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) { ++ log(TERM, LOG_WARNING, "HiSilicon Memory RAS: No valid address array length (%u)\n", ++ err->reg_array_size); ++ return false; ++ } ++ ++ return true; ++} ++#endif ++ + static int decode_hisi_common_section(struct ras_events *ras, + struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, +@@ -393,6 +580,11 @@ static int decode_hisi_common_section(struct ras_events *ras, + step_vendor_data_tab(ev_decoder, "hisi_common_section_tab"); + } + ++#ifdef HAVE_HISI_HBM_MEMORY_ACLS ++ if (hisi_hbm_acls_en && hisi_hbm_valid_acls_ras(err)) ++ hisi_hbm_acls_handler(err); ++#endif ++ + return 0; + } + +@@ -410,4 +602,8 @@ static void __attribute__((constructor)) hisi_ns_init(void) + + for (i = 0; i < ARRAY_SIZE(hisi_section_ns_ev_decoder); i++) + register_ns_ev_decoder(&hisi_section_ns_ev_decoder[i]); ++ ++#ifdef HAVE_HISI_HBM_MEMORY_ACLS ++ hisi_hbm_param_init(); ++#endif + } +-- +2.33.0 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 4d4cf2847c0a93306f974cd1ea9412a23d7828b2..ae2be5e5710fe33277bcf5f593e5237917736aaa 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.7 -Release: 21 +Release: 22 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -60,6 +60,7 @@ Patch9008: 0001-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch Patch9009: 
add-dynamic-switch-of-ras-events-support-and-disable-block-rq-complete.patch Patch9010: fix-rasdaemon-print-loading-config-logs-multiple-times.patch Patch9011: 0001-rasdaemon-Fix-for-vendor-errors-are-not-recorded-in-.patch +Patch9012: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch %description The rasdaemon program is a daemon which monitors the platform @@ -113,6 +114,12 @@ if [ $1 -eq 0 ] ; then fi %changelog +* Sat Aug 31 2024 Junhao He - 0.6.7-22 +- Type:feature +- ID:NA +- SUG:NA +- DESC:Add HBM Memory ACLS support for HiSilicon + * Thu Apr 25 2024 yangjunshuo - 0.6.7-21 - Type:bugfix - ID:NA