From 0219db1aeebc87df8e2475ab9ee2d10a2b4ae260 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Thu, 9 Nov 2023 19:30:17 +0800 Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon When a hardware error occurs in a cell of the HBM memory, the internal SRAM of the memory controller is used to replace the faulty memory; this method is called ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS RAS event, and rasdaemon records it and runs ACLS to replace the faulty memory. HBM ACLS can repair one (258-bit) memory cell at a time. The HBM can check which HBM cell the physical address belongs to and filter invalid HBM addresses. Multiple RAS errors are reported if memory errors occur in different HBM cells. The feature depends on the Linux kernel options CONFIG_HISI_HBMDEV [1] and CONFIG_HWPOISON_INJECT [2]. [1]: https://gitee.com/openeuler/kernel/pulls/2757 [2]: https://gitee.com/openeuler/kernel/blob/OLK-5.10/mm/hwpoison-inject.c Signed-off-by: Junhao He --- ...BM-Memory-ACLS-support-for-HiSilicon.patch | 191 ++++++++++++++++++ rasdaemon.spec | 9 +- 2 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch diff --git a/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch new file mode 100644 index 0000000..da67f24 --- /dev/null +++ b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch @@ -0,0 +1,191 @@ +From ac4419c6dfb1b0b527c959f1d8e8f690126a8507 Mon Sep 17 00:00:00 2001 +From: Junhao He +Date: Fri, 13 Oct 2023 18:10:16 +0800 +Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon + +When a hardware error occurs in a cell of the HBM memory, the internal +SRAM of the memory controller is used to replace the faulty memory, this +method is ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS +RAS, and the rasdaemon record it and runs the ACLS to replace the faulty +memory. 
+ +HBM ACLS can repair one cell (258-bit) memory at a time. The HBM can +check which HBM cell the physical address belongs to and filter invalid +HBM addresses. Multiple RAS errors are reported if memory errors occur +in different HBM cells. + +The feature depends on the linux kernel CONFIG_HISI_HBMDEV [1] and +CONFIG_HWPOISON_INJECT [2]. + +[1]: https://gitee.com/openeuler/kernel/pulls/2757 +[2]: https://gitee.com/openeuler/kernel/blob/OLK-5.10/mm/hwpoison-inject.c + +Signed-off-by: Junhao He +--- + misc/rasdaemon.env | 5 +- + non-standard-hisilicon.c | 111 ++++++++++++++++++++++++++++++++++++++- + 2 files changed, 114 insertions(+), 2 deletions(-) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 7cb18e8..cdc5cd1 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -43,4 +43,7 @@ CPU_CE_THRESHOLD="18" + CPU_ISOLATION_CYCLE="24h" + + # Prevent excessive isolation from causing an avalanche effect +-CPU_ISOLATION_LIMIT="10" +\ No newline at end of file ++CPU_ISOLATION_LIMIT="10" ++ ++# Specify rasdaemon to isolation the error page which fails to be repaired by HiSilicon HBM ACLS ++HISI_HMB_ISOLATION_PAGE="no" +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 756adf8..6566e69 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -8,9 +8,11 @@ + * + */ + ++#include + #include + #include + #include ++#include + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-report.h" +@@ -19,6 +21,17 @@ + #define HISI_BUF_LEN 2048 + #define HISI_PCIE_INFO_BUF_LEN 256 + ++#define HISI_TYPE_UINT32_WIDTH 32 ++/* Specify the Hisilicon HBMC HBM error type */ ++#define HISI_HBM_ERR_TYPE 0 ++#define HISI_HBM_ERR_ACLS BIT(0) ++ ++#define HISI_HBM_ACLS_ADDL 1 ++#define HISI_HBM_ACLS_ADDH 2 ++#define HISI_HBM_ACLS_ARRAY_SIZE 12 ++ ++#define HISI_SUBMOD_HBMC_HBM 6 ++ + struct hisi_common_error_section { + uint32_t val_bits; + uint8_t version; +@@ -360,6 +373,102 @@ static int add_hisi_common_table(struct 
ras_events *ras,
+ 	return 0;
+ }
+ 
++static int write_file(const char *name, unsigned long long value)
++{
++	char fname[MAX_PATH + 1] = "/sys/kernel/";
++	FILE *file;
++	int ret;
++
++	strcat(fname, name);
++	if (access(fname, W_OK)) {
++		log(TERM, LOG_WARNING, "Cannot access '%s': %s\n",
++		    fname, strerror(errno));
++		return -errno;
++	}
++
++	file = fopen(fname, "w");
++	if (!file) {
++		log(TERM, LOG_WARNING, "Cannot to open '%s': %s\n",
++		    fname, strerror(errno));
++		return -errno;
++	}
++
++	ret = fprintf(file, "0x%llx\n", value);
++	if (fclose(file) || ret < 0) { /* buffered data is only written out at fclose() */
++		log(TERM, LOG_WARNING, "Failed to set %s (0x%llx): %s\n",
++		    fname, value, strerror(errno));
++		ret = -1;
++	}
++	return ret;
++}
++
++static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err,
++			      int page_size)
++{
++	const char *isolate = getenv("HISI_HMB_ISOLATION_PAGE"); /* see misc/rasdaemon.env */
++	unsigned long long paddr, pfn;
++	int ret;
++
++	if (err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) {
++		log(TERM, LOG_WARNING, "No valid address array length (%d)\n",
++		    err->reg_array_size);
++		return -1;
++	}
++
++	if (!page_size)
++		return -1;
++
++	paddr = err->reg_array[HISI_HBM_ACLS_ADDH];
++	paddr <<= HISI_TYPE_UINT32_WIDTH;
++	paddr += err->reg_array[HISI_HBM_ACLS_ADDL];
++	pfn = paddr / page_size;
++
++	ret = write_file("hbm_memory/acls/acls_query", paddr);
++	if (ret < 0)
++		return ret;
++
++	ret = write_file("debug/hwpoison/corrupt-pfn", pfn);
++	if (ret < 0)
++		return ret;
++
++	ret = write_file("hbm_memory/acls/acls_repair", paddr);
++	if (ret < 0 && isolate && strcmp(isolate, "yes") == 0)
++		return ret; /* Keep the memory offline */
++
++	ret = write_file("debug/hwpoison/unpoison-pfn", pfn);
++	if (ret < 0)
++		return ret;
++
++	return 0;
++}
++
++static int hisi_hbmc_hbm_handler(const struct hisi_common_error_section *err,
++				 int page_size)
++{
++	int ret = 0;
++
++	if (!(err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE)))
++		return -1;
++
++	/* Hisilicon HBM Memory ACLS */
++	if (err->reg_array[HISI_HBM_ERR_TYPE] & HISI_HBM_ERR_ACLS)
++		ret = hisi_hbmc_hbm_acls(err, page_size);
++
++	return ret;
++}
++
++static int hisi_common_event_handler(struct ras_events *ras,
++				     const struct hisi_common_error_section *err)
++{
++	int ret = 0;
++
++	if (!strcmp(module_name[err->module_id], "HBMC") &&
++	    err->submodule_id == HISI_SUBMOD_HBMC_HBM)
++		ret = hisi_hbmc_hbm_handler(err, ras->page_size);
++
++	return ret;
++}
++
+ static int decode_hisi_common_section(struct ras_events *ras,
+ 				      struct ras_ns_ev_decoder *ev_decoder,
+ 				      struct trace_seq *s,
+@@ -395,7 +504,7 @@ static int decode_hisi_common_section(struct ras_events *ras,
+ 		step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
+ 	}
+ 
+-	return 0;
++	return hisi_common_event_handler(ras, err);
+ }
+ 
+ static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = {
+-- 
+2.30.0
+
diff --git a/rasdaemon.spec b/rasdaemon.spec
index a7dbdc9..5c9d957 100644
--- a/rasdaemon.spec
+++ b/rasdaemon.spec
@@ -1,6 +1,6 @@
 Name: rasdaemon
 Version: 0.6.8
-Release: 5
+Release: 6
 License: GPLv2
 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
 URL: https://github.com/mchehab/rasdaemon.git
@@ -43,6 +43,7 @@ Patch9016: 0001-rasdaemon-fix-return-value-type-issue-of-read-write-.patch
 Patch9017: 0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch
 Patch9018: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch
 Patch9019: 0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch
+Patch9020: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch
 
 %description
 The rasdaemon program is a daemon which monitors the platform
@@ -88,6 +89,12 @@ rm INSTALL %{buildroot}/usr/include/*.h
 /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :
 
 %changelog
+* Thu Nov 09 2023 Junhao He - 0.6.8-6
+- Type:feature
+- ID:NA
+- SUG:NA
+- DESC:Add HBM Memory ACLS support for HiSilicon
+
 * Sat Jun 17 2023 yanglongkang - 0.6.8-5
 - Type:bugfix
 - ID:NA
-- 
Gitee