From ff202c8a3718b71a55110724269689c76091b9bc Mon Sep 17 00:00:00 2001 From: JiangShui <1175135535@qq.com> Date: Thu, 23 Nov 2023 17:18:38 +0800 Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon and fix an issue Patch#1: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch Add HBM Memory ACLS support for HiSilicon. Patch#2: 0001-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch Fix the "nimbus_id=0" field not being displayed when querying RAS error information. Signed-off-by: Cai Jian --- ...BM-Memory-ACLS-support-for-HiSilicon.patch | 191 ++++++++++++++++++ ...-ctl-Modify-check-for-HiSilicon-KunP.patch | 122 +++++++++++ rasdaemon.spec | 12 +- 3 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch create mode 100644 0001-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch diff --git a/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch new file mode 100644 index 0000000..da67f24 --- /dev/null +++ b/0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch @@ -0,0 +1,191 @@ +From ac4419c6dfb1b0b527c959f1d8e8f690126a8507 Mon Sep 17 00:00:00 2001 +From: Junhao He +Date: Fri, 13 Oct 2023 18:10:16 +0800 +Subject: [PATCH] rasdaemon: Add HBM Memory ACLS support for HiSilicon + +When a hardware error occurs in a cell of the HBM memory, the internal +SRAM of the memory controller is used to replace the faulty memory; this +method is called ACLS (Adaptive Cache Line Sparing). The IMU reports the ACLS +RAS, and the rasdaemon records it and runs the ACLS to replace the faulty +memory. + +HBM ACLS can repair one memory cell (258-bit) at a time. The HBM can +check which HBM cell the physical address belongs to and filter invalid +HBM addresses. Multiple RAS errors are reported if memory errors occur +in different HBM cells.
+ +The feature depends on the linux kernel CONFIG_HISI_HBMDEV [1] and +CONFIG_HWPOISON_INJECT [2]. + +[1]: https://gitee.com/openeuler/kernel/pulls/2757 +[2]: https://gitee.com/openeuler/kernel/blob/OLK-5.10/mm/hwpoison-inject.c + +Signed-off-by: Junhao He +--- + misc/rasdaemon.env | 5 +- + non-standard-hisilicon.c | 111 ++++++++++++++++++++++++++++++++++++++- + 2 files changed, 114 insertions(+), 2 deletions(-) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 7cb18e8..cdc5cd1 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -43,4 +43,7 @@ CPU_CE_THRESHOLD="18" + CPU_ISOLATION_CYCLE="24h" + + # Prevent excessive isolation from causing an avalanche effect +-CPU_ISOLATION_LIMIT="10" +\ No newline at end of file ++CPU_ISOLATION_LIMIT="10" ++ ++# Specify whether rasdaemon isolates the error page that HiSilicon HBM ACLS fails to repair ("yes" or "no") ++HISI_HMB_ISOLATION_PAGE="no" +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 756adf8..6566e69 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -8,9 +8,11 @@ + * + */ + ++#include + #include + #include + #include ++#include + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-report.h" +@@ -19,6 +21,17 @@ + #define HISI_BUF_LEN 2048 + #define HISI_PCIE_INFO_BUF_LEN 256 + ++#define HISI_TYPE_UINT32_WIDTH 32 ++/* Specify the Hisilicon HBMC HBM error type */ ++#define HISI_HBM_ERR_TYPE 0 ++#define HISI_HBM_ERR_ACLS BIT(0) ++ ++#define HISI_HBM_ACLS_ADDL 1 ++#define HISI_HBM_ACLS_ADDH 2 ++#define HISI_HBM_ACLS_ARRAY_SIZE 12 ++ ++#define HISI_SUBMOD_HBMC_HBM 6 ++ + struct hisi_common_error_section { + uint32_t val_bits; + uint8_t version; +@@ -360,6 +373,102 @@ static int add_hisi_common_table(struct ras_events *ras, + return 0; + } + ++static int write_file(const char *name, unsigned long long value) ++{ ++ char fname[MAX_PATH + 1] = "/sys/kernel/"; ++ FILE *file; ++ int ret; ++ ++ strcat(fname, name); ++ if (access(fname, W_OK)) { ++ log(TERM,
LOG_WARNING, "Cannot access '%s': %s\n", ++ fname, strerror(errno)); ++ return -errno; ++ } ++ ++ file = fopen(fname, "w"); ++ if (!file) { ++ log(TERM, LOG_WARNING, "Cannot to open '%s': %s\n", ++ fname, strerror(errno)); ++ return -errno; ++ } ++ ++ ret = fprintf(file, "0x%llx\n", value); ++ if (ret < 0) ++ log(TERM, LOG_WARNING, "Failed to set %s (0x%llx): %s\n", ++ fname, value, strerror(errno)); ++ ++ fclose(file); ++ return ret; ++} ++ ++static int hisi_hbmc_hbm_acls(const struct hisi_common_error_section *err, ++ int page_size) ++{ ++ unsigned long long paddr, pfn; ++ const char *isolate = getenv("HISI_HMB_ISOLATION_PAGE"); ++ int ret; ++ ++ if (err->reg_array_size < HISI_HBM_ACLS_ARRAY_SIZE) { ++ log(TERM, LOG_WARNING, "No valid address array length (%d)\n", ++ err->reg_array_size); ++ return -1; ++ } ++ ++ if (!page_size) ++ return -1; ++ ++ paddr = err->reg_array[HISI_HBM_ACLS_ADDH]; ++ paddr <<= HISI_TYPE_UINT32_WIDTH; ++ paddr += err->reg_array[HISI_HBM_ACLS_ADDL]; ++ pfn = paddr / page_size; ++ ++ ret = write_file("hbm_memory/acls/acls_query", paddr); ++ if (ret < 0) ++ return ret; ++ ++ ret = write_file("debug/hwpoison/corrupt-pfn", pfn); ++ if (ret < 0) ++ return ret; ++ ++ ret = write_file("hbm_memory/acls/acls_repair", paddr); ++ if (ret < 0 && isolate && strcmp(isolate, "yes") == 0) ++ return ret; /* Repair failed and HISI_HMB_ISOLATION_PAGE is "yes": skip the unpoison step so the page stays offline */ ++ ++ ret = write_file("debug/hwpoison/unpoison-pfn", pfn); ++ if (ret < 0) ++ return ret; ++ ++ return 0; ++} ++ ++static int hisi_hbmc_hbm_handler(const struct hisi_common_error_section *err, ++ int page_size) ++{ ++ int ret = 0; ++ ++ if (!(err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE))) ++ return -1; ++ ++ /* Hisilicon HBM Memory ACLS */ ++ if (err->reg_array[HISI_HBM_ERR_TYPE] & HISI_HBM_ERR_ACLS) ++ ret = hisi_hbmc_hbm_acls(err, page_size); ++ ++ return ret; ++} ++ ++static int hisi_common_event_handler(struct ras_events *ras, ++ const struct hisi_common_error_section *err) ++{ ++ int ret = 0; ++ ++ if (!strcmp(module_name[err->module_id],
"HBMC") && ++ err->submodule_id == HISI_SUBMOD_HBMC_HBM) ++ ret = hisi_hbmc_hbm_handler(err, ras->page_size); ++ ++ return ret; ++} ++ + static int decode_hisi_common_section(struct ras_events *ras, + struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, +@@ -395,7 +504,7 @@ static int decode_hisi_common_section(struct ras_events *ras, + step_vendor_data_tab(ev_decoder, "hisi_common_section_tab"); + } + +- return 0; ++ return hisi_common_event_handler(ras, err); + } + + static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { +-- +2.30.0 + diff --git a/0001-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch b/0001-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch new file mode 100644 index 0000000..55f106b --- /dev/null +++ b/0001-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch @@ -0,0 +1,122 @@ +From 3576ebb2e0e7badb475807058776de748bbc8c43 Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 24 Aug 2023 13:07:17 +0100 +Subject: [PATCH] rasdaemon: ras-mc-ctl: Modify check for HiSilicon KunPeng9xx + error fields + +Modify check for valid HiSilicon KunPeng9xx error fields. +Fixes an error data is not printed when it's value is 0. + +Signed-off-by: Shiju Jose +Signed-off-by: Mauro Carvalho Chehab +--- + util/ras-mc-ctl.in | 72 +++++++++++++++++++++++----------------------- + 1 file changed, 36 insertions(+), 36 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 4178dcf..07e6fca 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1672,13 +1672,13 @@ sub vendor_errors + if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { + $out .= "$id. 
$timestamp Error Info: "; + $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "module_id=$module_id, " if ($module_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "Error Registers: $regs " if ($regs); ++ $out .= "soc_id=$soc_id, " if (defined $soc_id && length $soc_id); ++ $out .= "socket_id=$socket_id, " if (defined $socket_id && length $socket_id); ++ $out .= "nimbus_id=$nimbus_id, " if (defined $nimbus_id && length $nimbus_id); ++ $out .= "module_id=$module_id, " if (defined $module_id && length $module_id); ++ $out .= "sub_module_id=$sub_module_id, " if (defined $sub_module_id && length $sub_module_id); ++ $out .= "err_severity=$err_severity, " if (defined $err_severity && length $err_severity); ++ $out .= "Error Registers: $regs " if (defined $regs && length $regs); + $out .= "\n\n"; + $found_module = 1; + } +@@ -1697,13 +1697,13 @@ sub vendor_errors + if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { + $out .= "$id. 
$timestamp Error Info: "; + $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "module_id=$module_id, " if ($module_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "Error Registers: $regs " if ($regs); ++ $out .= "soc_id=$soc_id, " if (defined $soc_id && length $soc_id); ++ $out .= "socket_id=$socket_id, " if (defined $socket_id && length $socket_id); ++ $out .= "nimbus_id=$nimbus_id, " if (defined $nimbus_id && length $nimbus_id); ++ $out .= "module_id=$module_id, " if (defined $module_id && length $module_id); ++ $out .= "sub_module_id=$sub_module_id, " if (defined $sub_module_id && length $sub_module_id); ++ $out .= "err_severity=$err_severity, " if (defined $err_severity && length $err_severity); ++ $out .= "Error Registers: $regs " if (defined $regs && length $regs); + $out .= "\n\n"; + $found_module = 1; + } +@@ -1722,15 +1722,15 @@ sub vendor_errors + if ($module eq 0 || ($sub_module_id && uc($module) eq uc($sub_module_id))) { + $out .= "$id. 
$timestamp Error Info: "; + $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "core_id=$core_id, " if ($core_id); +- $out .= "port_id=$port_id, " if ($port_id); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "err_type=$err_type, " if ($err_type); +- $out .= "Error Registers: $regs " if ($regs); ++ $out .= "soc_id=$soc_id, " if (defined $soc_id && length $soc_id); ++ $out .= "socket_id=$socket_id, " if (defined $socket_id && length $socket_id); ++ $out .= "nimbus_id=$nimbus_id, " if (defined $nimbus_id && length $nimbus_id); ++ $out .= "sub_module_id=$sub_module_id, " if (defined $sub_module_id && length $sub_module_id); ++ $out .= "core_id=$core_id, " if (defined $core_id && length $core_id); ++ $out .= "port_id=$port_id, " if (defined $port_id && length $port_id); ++ $out .= "err_severity=$err_severity, " if (defined $err_severity && length $err_severity); ++ $out .= "err_type=$err_type, " if (defined $err_type && length $err_type); ++ $out .= "Error Registers: $regs " if (defined $regs && length $regs); + $out .= "\n\n"; + $found_module = 1; + } +@@ -1749,19 +1749,19 @@ sub vendor_errors + if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { + $out .= "$id. 
$timestamp Error Info: "; + $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "totem_id=$totem_id, " if ($totem_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); +- $out .= "module_id=$module_id, " if ($module_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "core_id=$core_id, " if ($core_id); +- $out .= "port_id=$port_id, " if ($port_id); +- $out .= "err_type=$err_type, " if ($err_type); +- $out .= "pcie_info=$pcie_info, " if ($pcie_info); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "Error Registers: $regs" if ($regs); ++ $out .= "soc_id=$soc_id, " if (defined $soc_id && length $soc_id); ++ $out .= "socket_id=$socket_id, " if (defined $socket_id && length $socket_id); ++ $out .= "totem_id=$totem_id, " if (defined $totem_id && length $totem_id); ++ $out .= "nimbus_id=$nimbus_id, " if (defined $nimbus_id && length $nimbus_id); ++ $out .= "sub_system_id=$sub_system_id, " if (defined $sub_system_id && length $sub_system_id); ++ $out .= "module_id=$module_id, " if (defined $module_id && length $module_id); ++ $out .= "sub_module_id=$sub_module_id, " if (defined $sub_module_id && length $sub_module_id); ++ $out .= "core_id=$core_id, " if (defined $core_id && length $core_id ); ++ $out .= "port_id=$port_id, " if (defined $port_id && length $port_id); ++ $out .= "err_type=$err_type, " if (defined $err_type && length $err_type); ++ $out .= "pcie_info=$pcie_info, " if (defined $pcie_info && length $pcie_info); ++ $out .= "err_severity=$err_severity, " if (defined $err_severity && length $err_severity); ++ $out .= "Error Registers: $regs" if (defined $regs && length $regs); + $out .= "\n\n"; + $found_module = 1; + } +-- +2.25.1 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 4189592..ba2f5db 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 
@@ Name: rasdaemon Version: 0.6.7 -Release: 13 +Release: 14 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -53,6 +53,8 @@ Patch9004: 0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch Patch9005: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch Patch9006: 0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch Patch9007: fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch +Patch9008: 0001-rasdaemon-Add-HBM-Memory-ACLS-support-for-HiSilicon.patch +Patch9009: 0001-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch %description The rasdaemon program is a daemon which monitors the platform @@ -98,6 +100,14 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || : %changelog +* Thu Nov 23 2023 Cai Jian - 0.6.7-14 +- Type:feature & bugfix +- ID:NA +- SUG:NA +- DESC: + 1. Add HBM Memory ACLS support for HiSilicon. + 2. Fix the "nimbus_id=0" field not being displayed when querying RAS error information. + * Tue Jun 20 2023 zhangnan - 0.6.7-13 - Type:bugfix - ID:NA -- Gitee