From 9936cc4a0aaa049866f555868360aa0aedcb4460 Mon Sep 17 00:00:00 2001 From: Renbo Date: Thu, 5 Dec 2024 14:06:42 +0800 Subject: [PATCH 1/4] [BA] update to rasdaemon-0.6.7-15.src.rpm to #rasdaemon update to rasdaemon-0.6.7-15.src.rpm Signed-off-by: Renbo --- ...tification-support-when-page-goes-of.patch | 222 ------ ...a59ee33b7448b00d7ba13d5ecd4918b9853c.patch | 163 +++++ ...e.ac-fix-SYSCONFDEFDIR-default-value.patch | 37 - ...-non_standard_event-at-just-one-line.patch | 48 -- ...pport-for-THead-Yitian-non-standard-.patch | 409 ----------- ...-ctl-Add-support-to-display-the-THea.patch | 105 --- ...882a0cbfce0b905039bebc811ac8311cd739.patch | 105 +++ ...-add-mc_event-and-mce_record-trigger.patch | 660 ------------------ ...rocess-Ampere-specific-error-in-the-.patch | 104 --- ...e-issue-of-sprintf-data-type-mismatc.patch | 56 -- ...daemon-ensure-trace_clock-file-exist.patch | 54 -- ...ce_record-print-just-one-line-on-AMD.patch | 33 - ...rasdaemon-disable-ce-offline-default.patch | 48 -- ...8ef8d7aebc3e5201bf39b73ce7644f8e419e.patch | 524 ++++++++++++++ ...177ce0d2fcb7693cacee4778d0845ebd3788.patch | 93 +++ ...da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch | 34 + ...546add918457c453bd3f753ac7df90b39e36.patch | 22 + ...18b04a04104dfac6b8536419803f236e6118.patch | 411 +++++++++++ ...4aef87978b806178a73ed33c39d6c442fc1f.patch | 24 + ...f6255f67a8bae28cd46c54500fc16bfc7a30.patch | 117 ++++ ...c96cd52d775570dae989dd95a060f1149077.patch | 159 +++++ ...4416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch | 208 ++++++ ...d309dcbdeb7ecd219975244f3944a8d047e9.patch | 37 + ...15cf8146f51b5d6fe7a29107a2adc77407ca.patch | 94 +++ rasdaemon.spec | 118 ++-- 25 files changed, 2059 insertions(+), 1826 deletions(-) delete mode 100644 1001-rasdaemon-Add-notification-support-when-page-goes-of.patch create mode 100644 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch delete mode 100644 2001-configure.ac-fix-SYSCONFDEFDIR-default-value.patch delete mode 100644 
2002-rasdaemon-log-non_standard_event-at-just-one-line.patch delete mode 100644 2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch delete mode 100644 2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch create mode 100644 2d15882a0cbfce0b905039bebc811ac8311cd739.patch delete mode 100644 3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch delete mode 100644 3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch delete mode 100644 3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch delete mode 100644 3004-rasdaemon-ensure-trace_clock-file-exist.patch delete mode 100644 3005-rasdaemon-mce_record-print-just-one-line-on-AMD.patch delete mode 100644 3006-rasdaemon-disable-ce-offline-default.patch create mode 100644 30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch create mode 100644 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch create mode 100644 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch create mode 100644 885e546add918457c453bd3f753ac7df90b39e36.patch create mode 100644 932118b04a04104dfac6b8536419803f236e6118.patch create mode 100644 9bd84aef87978b806178a73ed33c39d6c442fc1f.patch create mode 100644 9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch create mode 100644 aa36c96cd52d775570dae989dd95a060f1149077.patch create mode 100644 b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch create mode 100644 c785d309dcbdeb7ecd219975244f3944a8d047e9.patch create mode 100644 ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch diff --git a/1001-rasdaemon-Add-notification-support-when-page-goes-of.patch b/1001-rasdaemon-Add-notification-support-when-page-goes-of.patch deleted file mode 100644 index 736fea3..0000000 --- a/1001-rasdaemon-Add-notification-support-when-page-goes-of.patch +++ /dev/null @@ -1,222 +0,0 @@ -diff -Nur rasdaemon-0.6.7/Makefile.am rasdaemon-0.6.7_new/Makefile.am ---- rasdaemon-0.6.7/Makefile.am 2023-06-02 15:14:06.995338446 +0800 -+++ rasdaemon-0.6.7_new/Makefile.am 2023-06-02 15:14:33.789545754 +0800 -@@ -2,7 +2,7 @@ 
- SUBDIRS = libtrace util man - SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in - SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) --EXTRA_DIST = $(SYSTEMD_SERVICES_IN) misc/rasdaemon.env -+EXTRA_DIST = $(SYSTEMD_SERVICES_IN) misc/rasdaemon.env misc/notices - - # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin - # during ./configure phase, therefore it is not possible to add .service.in -diff -Nur rasdaemon-0.6.7/misc/notices/page-ce-offline-post-notice rasdaemon-0.6.7_new/misc/notices/page-ce-offline-post-notice ---- rasdaemon-0.6.7/misc/notices/page-ce-offline-post-notice 1970-01-01 08:00:00.000000000 +0800 -+++ rasdaemon-0.6.7_new/misc/notices/page-ce-offline-post-notice 2023-06-02 15:16:14.456324620 +0800 -@@ -0,0 +1,17 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon after a page goes offline. -+ -+cd `dirname $0` -+ -+[ -x ./page-ce-offline-post-notice.local ] && . ./page-ce-offline-post-notice.local $1 -+ -+if [ -d page-ce-offline-post-notice.extern ] -+then -+ ls page-ce-offline-post-notice.extern | -+ while read item -+ do -+ [ -x ./page-ce-offline-post-notice.extern/$item ] && . ./page-ce-offline-post-notice.extern/$item $1 -+ done -+fi -+ -+exit 0 -diff -Nur rasdaemon-0.6.7/misc/notices/page-ce-offline-pre-notice rasdaemon-0.6.7_new/misc/notices/page-ce-offline-pre-notice ---- rasdaemon-0.6.7/misc/notices/page-ce-offline-pre-notice 1970-01-01 08:00:00.000000000 +0800 -+++ rasdaemon-0.6.7_new/misc/notices/page-ce-offline-pre-notice 2023-06-02 15:16:39.440517924 +0800 -@@ -0,0 +1,17 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon before a page goes offline. -+ -+cd `dirname $0` -+ -+[ -x ./page-ce-offline-pre-notice.local ] && . ./page-ce-offline-pre-notice.local $1 -+ -+if [ -d page-ce-offline-pre-notice.extern ] -+then -+ ls page-ce-offline-pre-notice.extern | -+ while read item -+ do -+ [ -x ./page-ce-offline-pre-notice.extern/$item ] && . 
./page-ce-offline-pre-notice.extern/$item $1 -+ done -+fi -+ -+exit 0 -diff -Nur rasdaemon-0.6.7/misc/rasdaemon.env rasdaemon-0.6.7_new/misc/rasdaemon.env ---- rasdaemon-0.6.7/misc/rasdaemon.env 2023-06-02 15:14:06.994338438 +0800 -+++ rasdaemon-0.6.7_new/misc/rasdaemon.env 2023-06-02 15:17:54.307097173 +0800 -@@ -27,3 +27,7 @@ - # soft-then-hard First try to soft offline, then try hard offlining. - # Note: default offline choice is "soft". - PAGE_CE_ACTION="soft" -+ -+# Notices script when doing memory offline -+PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" -+PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" -diff -Nur rasdaemon-0.6.7/misc/rasdaemon.spec.in rasdaemon-0.6.7_new/misc/rasdaemon.spec.in ---- rasdaemon-0.6.7/misc/rasdaemon.spec.in 2023-06-02 15:14:06.994338438 +0800 -+++ rasdaemon-0.6.7_new/misc/rasdaemon.spec.in 2023-06-02 15:19:03.105629470 +0800 -@@ -46,6 +46,8 @@ - make install DESTDIR=%{buildroot} - install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service - install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service -+install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ -+install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ - rm INSTALL %{buildroot}/usr/include/*.h - - %files -@@ -57,6 +59,7 @@ - %{_sysconfdir}/ras/dimm_labels.d - @SYSCONFDEFDIR@/%{name} - %config(noreplace) @SYSCONFDEFDIR@/%{name} -+%config(noreplace) %{_sysconfdir}/rasdaemon_notices/* - - %changelog - -diff -Nur rasdaemon-0.6.7/ras-page-isolation.c rasdaemon-0.6.7_new/ras-page-isolation.c ---- rasdaemon-0.6.7/ras-page-isolation.c 2023-06-02 15:14:06.995338446 +0800 -+++ rasdaemon-0.6.7_new/ras-page-isolation.c 2023-06-02 16:06:28.020663355 +0800 -@@ -17,12 +17,16 @@ - #include - #include - #include -+#include -+#include -+#include - #include - #include - #include - #include "ras-logger.h" - #include "ras-page-isolation.h" - -+#define MAX_PATH_LEN 64 - #define 
PARSED_ENV_LEN 50 - static const struct config threshold_units[] = { - { "m", 1000 }, -@@ -76,6 +80,8 @@ - - static enum otype offline = OFFLINE_SOFT; - static struct rb_root page_records; -+static char pre_notice[MAX_PATH_LEN]; -+static char post_notice[MAX_PATH_LEN]; - - static void page_offline_init(void) - { -@@ -205,16 +211,94 @@ - threshold_string, cycle_string); - } - -+static void page_notice_init(void) -+{ -+ char *notice_root = "/etc/rasdaemon_notices"; -+ char *pre_re = getenv("PAGE_CE_OFFLINE_PRE_NOTICE"); -+ char *post_re = getenv("PAGE_CE_OFFLINE_POST_NOTICE"); -+ -+ if (offline <= OFFLINE_ACCOUNT) -+ return; -+ -+ snprintf(pre_notice, sizeof(pre_notice), "%s/%s", notice_root, pre_re); -+ if (access(pre_notice, R_OK|X_OK) < 0) -+ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", pre_notice); -+ -+ snprintf(post_notice, sizeof(post_notice), "%s/%s", notice_root, post_re); -+ if (access(post_notice, R_OK|X_OK) < 0) -+ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", post_notice); -+} -+ - void ras_page_account_init(void) - { - page_offline_init(); - page_isolation_init(); -+ page_notice_init(); -+} -+ -+static void finish_child(pid_t child, int status) -+{ -+ if (WIFEXITED(status) && WEXITSTATUS(status)) { -+ log(TERM, LOG_INFO, "notice exited with status %d\n", WEXITSTATUS(status)); -+ } else if (WIFSIGNALED(status)) { -+ log(TERM, LOG_INFO,"notice died with signal %s\n", strsignal(WTERMSIG(status))); -+ } -+ -+ return; - } - -+static void __run_notice(char *argv[], char **env) -+{ -+ pid_t child; -+ int status; -+ -+ child = fork(); -+ if (child < 0) { -+ log(TERM, LOG_ERR, "Cannot create process for offline notice"); -+ return; -+ } -+ if (child == 0) { -+ execve(argv[0], argv, env); -+ _exit(127); -+ } -+ else { -+ waitpid(child, &status, 0); -+ finish_child(child, status); -+ } -+} -+ -+static void run_notice(char *argv[]) -+{ -+ int MAX_ENV = 20; -+ char *env[MAX_ENV]; -+ int ei = 0; -+ int i; -+ -+ asprintf(&env[ei++], "PATH=%s", 
getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin"); -+ env[ei] = NULL; -+ assert(ei < MAX_ENV); -+ -+ __run_notice(argv, env); -+ -+ for (i = 0; i < ei; i++) -+ free(env[i]); -+ } -+ - static int do_page_offline(unsigned long long addr, enum otype type) - { - int fd, rc; - char buf[20]; -+ char *args; -+ char *argv[] = { -+ NULL, -+ NULL, -+ NULL, -+ }; -+ -+ asprintf(&args, "%llu", addr); -+ argv[0] = (char*)&pre_notice; -+ argv[1] = args; -+ run_notice(argv); - - fd = open(kernel_offline[type], O_WRONLY); - if (fd == -1) { -@@ -228,6 +312,12 @@ - log(TERM, LOG_ERR, "page offline addr(%s) by %s failed, errno:%d\n", buf, kernel_offline[type], errno); - } - close(fd); -+ -+ argv[0] = (char*)&post_notice; -+ run_notice(argv); -+ -+ free(args); -+ - return rc; - } - diff --git a/1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch b/1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch new file mode 100644 index 0000000..e0cb4a2 --- /dev/null +++ b/1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch @@ -0,0 +1,163 @@ +commit 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c +Author: Muralidhara M K +Date: Fri Jun 30 10:36:53 2023 +0000 + + rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types. + + Add HWID and McaType values for new SMCA bank types + and error decoding for those new SMCA banks. + + Signed-off-by: Muralidhara M K + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 7c88a46..fc51b5a 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -61,6 +61,7 @@ enum smca_bank_types { + SMCA_PIE, /* Power, Interrupts, etc. 
*/ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_UMC_V2, ++ SMCA_MA_LLC, /* Memory Attached Last Level Cache */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_PSP_V2, +@@ -76,6 +77,8 @@ enum smca_bank_types { + SMCA_SHUB, /* System Hub Unit */ + SMCA_SATA, /* SATA Unit */ + SMCA_USB, /* USB Unit */ ++ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */ ++ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */ + SMCA_GMI_PCS, /* GMI PCS Unit */ + SMCA_XGMI_PHY, /* xGMI PHY Unit */ + SMCA_WAFL_PHY, /* WAFL PHY Unit */ +@@ -325,6 +328,16 @@ static const char * const smca_umc2_mce_desc[] = { + "LM32 MP errors", + }; + ++static const char * const smca_mall_mce_desc[] = { ++ "Counter overflow error", ++ "Counter underflow error", ++ "Write Data Parity Error", ++ "Read Response Parity Error", ++ "Cache Tag ECC Error Macro 0", ++ "Cache Tag ECC Error Macro 1", ++ "Cache Data ECC Error" ++}; ++ + static const char * const smca_pb_mce_desc[] = { + "An ECC error in the Parameter Block RAM array" + }; +@@ -524,6 +537,57 @@ static const char * const smca_usb_mce_desc[] = { + "AXI Slave Response error", + }; + ++static const char * const smca_usrdp_mce_desc[] = { ++ "Mst CMD Error", ++ "Mst Rx FIFO Error", ++ "Mst Deskew Error", ++ "Mst Detect Timeout Error", ++ "Mst FlowControl Error", ++ "Mst DataValid FIFO Error", ++ "Mac LinkState Error", ++ "Deskew Error", ++ "Init Timeout Error", ++ "Init Attempt Error", ++ "Recovery Timeout Error", ++ "Recovery Attempt Error", ++ "Eye Training Timeout Error", ++ "Data Startup Limit Error", ++ "LS0 Exit Error", ++ "PLL powerState Update Timeout Error", ++ "Rx FIFO Error", ++ "Lcu Error", ++ "Conv CECC Error", ++ "Conv UECC Error", ++ "Reserved", ++ "Rx DataLoss Error", ++ "Replay CECC Error", ++ "Replay UECC Error", ++ "CRC Error", ++ "BER Exceeded Error", ++ "FC Init Timeout Error", ++ "FC Init Attempt Error", ++ "Replay Timeout Error", ++ "Replay Attempt Error", ++ "Replay 
Underflow Error", ++ "Replay Overflow Error", ++}; ++ ++static const char * const smca_usrcp_mce_desc[] = { ++ "Packet Type Error", ++ "Rx FIFO Error", ++ "Deskew Error", ++ "Rx Detect Timeout Error", ++ "Data Parity Error", ++ "Data Loss Error", ++ "Lcu Error", ++ "HB1 Handshake Timeout Error", ++ "HB2 Handshake Timeout Error", ++ "Clk Sleep Rsp Timeout Error", ++ "Clk Wake Rsp Timeout Error", ++ "Reset Attack Error", ++ "Remote Link Fatal Error", ++}; ++ + static const char * const smca_gmipcs_mce_desc[] = { + "Data Loss Error", + "Training Error", +@@ -579,6 +643,7 @@ static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, + [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, + [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) }, ++ [SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) }, + [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, + [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, + [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)}, +@@ -595,6 +660,8 @@ static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) }, + [SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) }, + [SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) }, ++ [SMCA_USR_DP] = { smca_usrdp_mce_desc, ARRAY_SIZE(smca_usrdp_mce_desc) }, ++ [SMCA_USR_CP] = { smca_usrcp_mce_desc, ARRAY_SIZE(smca_usrcp_mce_desc) }, + [SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) }, + /* All the PHY bank types have the same error descriptions, for now. */ + [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, +@@ -631,6 +698,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + { SMCA_UMC, 0x00000096 }, + /* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. 
*/ + { SMCA_UMC_V2, 0x00010096 }, ++ /* Memory Attached Last Level Cache */ ++ { SMCA_MA_LLC, 0x0004002E }, + + /* Parameter Block MCA type */ + { SMCA_PB, 0x00000005 }, +@@ -664,6 +733,11 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + { SMCA_SHUB, 0x00000080 }, + { SMCA_SATA, 0x000000A8 }, + { SMCA_USB, 0x000000AA }, ++ ++ /* Ultra Short Reach Data and Control Plane Controller */ ++ { SMCA_USR_DP, 0x00000170 }, ++ { SMCA_USR_CP, 0x00000180 }, ++ + { SMCA_GMI_PCS, 0x00000241 }, + + /* Ext Global Memory Interconnect PHY MCA type */ +@@ -692,6 +766,7 @@ static struct smca_bank_name smca_names[] = { + [SMCA_PIE] = { "Power, Interrupts, etc." }, + [SMCA_UMC] = { "Unified Memory Controller" }, + [SMCA_UMC_V2] = { "Unified Memory Controller V2" }, ++ [SMCA_MA_LLC] = { "Memory Attached Last Level Cache" }, + [SMCA_PB] = { "Parameter Block" }, + [SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" }, + [SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" }, +@@ -704,6 +779,8 @@ static struct smca_bank_name smca_names[] = { + [SMCA_SHUB] = { "System Hub Unit" }, + [SMCA_SATA] = { "SATA Unit" }, + [SMCA_USB] = { "USB Unit" }, ++ [SMCA_USR_DP] = { "Ultra Short Reach Data Plane Controller" }, ++ [SMCA_USR_CP] = { "Ultra Short Reach Control Plane Controller" }, + [SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" }, + [SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" }, + [SMCA_WAFL_PHY] = { "WAFL PHY Unit" }, diff --git a/2001-configure.ac-fix-SYSCONFDEFDIR-default-value.patch b/2001-configure.ac-fix-SYSCONFDEFDIR-default-value.patch deleted file mode 100644 index 7d6ac4f..0000000 --- a/2001-configure.ac-fix-SYSCONFDEFDIR-default-value.patch +++ /dev/null @@ -1,37 +0,0 @@ -From 8f44bff597adcfb34c73e7477d1a867516e7fbfe Mon Sep 17 00:00:00 2001 -From: Matt Whitlock -Date: Wed, 9 Jun 2021 10:25:18 -0400 -Subject: [PATCH 1/4] configure.ac: fix SYSCONFDEFDIR default value - -configure.ac was using AC_ARG_WITH incorrectly, yielding a generated 
configure script like: - - # Check whether --with-sysconfdefdir was given. - if test "${with_sysconfdefdir+set}" = set; then : - withval=$with_sysconfdefdir; SYSCONFDEFDIR=$withval - else - "/etc/sysconfig" - fi - -This commit fixes the default case so that the SYSCONFDEFDIR variable is assigned the value "/etc/sysconfig" rather than trying to execute "/etc/sysconfig" as a command. - -Signed-off-by: Mauro Carvalho Chehab ---- - configure.ac | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/configure.ac b/configure.ac -index f7d1947..33b81fe 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -172,7 +172,7 @@ AC_SUBST([RASSTATEDIR]) - AC_ARG_WITH(sysconfdefdir, - AC_HELP_STRING([--with-sysconfdefdir=DIR], [rasdaemon environment file dir]), - [SYSCONFDEFDIR=$withval], -- ["/etc/sysconfig"]) -+ [SYSCONFDEFDIR=/etc/sysconfig]) - AC_SUBST([SYSCONFDEFDIR]) - - AC_DEFINE([RAS_DB_FNAME], ["ras-mc_event.db"], [ras events database]) --- -2.33.1 - diff --git a/2002-rasdaemon-log-non_standard_event-at-just-one-line.patch b/2002-rasdaemon-log-non_standard_event-at-just-one-line.patch deleted file mode 100644 index 1d85b12..0000000 --- a/2002-rasdaemon-log-non_standard_event-at-just-one-line.patch +++ /dev/null @@ -1,48 +0,0 @@ -From 9e407134b86f7a176970be70121e08cac6cad3ff Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Thu, 7 Sep 2023 18:19:40 +0800 -Subject: [PATCH 2/4] rasdaemon: log non_standard_event at just one line - -It is more reasonable log non_standard_event in one line exclude errors -dump. So you can easily to get decoded non_standard_event log in one -line if you implement a decoder like other event. 
- -Signed-off-by: Ruidong Tian ---- - ras-non-standard-handler.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 6ccf5bc..7818ed8 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -174,7 +174,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, - case GHES_SEV_PANIC: - ev.severity = "Fatal"; - } -- trace_seq_printf(s, "\n %s", ev.severity); -+ trace_seq_printf(s, " %s", ev.severity); - - ev.sec_type = pevent_get_field_raw(s, event, "sec_type", - record, &len, 1); -@@ -185,7 +185,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, - trace_seq_printf(s, "\n section type: %s", - "Ampere Specific Error\n"); - else -- trace_seq_printf(s, "\n section type: %s", -+ trace_seq_printf(s, " section type: %s", - uuid_le(ev.sec_type)); - ev.fru_text = pevent_get_field_raw(s, event, "fru_text", - record, &len, 1); -@@ -198,7 +198,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, - if (pevent_get_field_val(s, event, "len", record, &val, 1) < 0) - return -1; - ev.length = val; -- trace_seq_printf(s, "\n length: %d\n", ev.length); -+ trace_seq_printf(s, " length: %d", ev.length); - - ev.error = pevent_get_field_raw(s, event, "buf", record, &len, 1); - if(!ev.error) --- -2.33.1 - diff --git a/2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch b/2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch deleted file mode 100644 index 519f4d7..0000000 --- a/2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch +++ /dev/null @@ -1,409 +0,0 @@ -From dbc5d5a9ba57ef3f84eb09c9ca658c96219a1736 Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Thu, 7 Sep 2023 18:21:05 +0800 -Subject: [PATCH 3/4] rasdaemon: add support for THead Yitian non-standard - error decoder - -Add a new non-standard error decoder to decode THead YiTian error -section. Put all related code to a new source file. 
- -Signed-off-by: Ruidong Tian ---- - Makefile.am | 7 +- - configure.ac | 11 ++ - non-standard-yitian.c | 251 ++++++++++++++++++++++++++++++++++++++++++ - non-standard-yitian.h | 73 ++++++++++++ - 4 files changed, 341 insertions(+), 1 deletion(-) - create mode 100644 non-standard-yitian.c - create mode 100644 non-standard-yitian.h - -diff --git a/Makefile.am b/Makefile.am -index fabca78..7cbc81e 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -63,13 +63,18 @@ endif - if WITH_AMP_NS_DECODE - rasdaemon_SOURCES += non-standard-ampere.c - endif -+if WITH_YITIAN_NS_DECODE -+ rasdaemon_SOURCES += non-standard-yitian.c -+endif -+ - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ -- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h -+ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ -+ non-standard-yitian.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that -diff --git a/configure.ac b/configure.ac -index 33b81fe..a02cca3 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], - AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes]) - AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"]) - -+AC_ARG_ENABLE([yitian_ns_decode], -+ AS_HELP_STRING([--enable-yitian-ns-decode], [enable YITIAN_NS_DECODE events (currently experimental)])) -+ -+AS_IF([test "x$enable_yitian_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], [ -+ AC_DEFINE(HAVE_YITIAN_NS_DECODE,1,"have 
YITIAN UNKNOWN_SEC events decode") -+ AC_SUBST([WITH_YITIAN_NS_DECODE]) -+]) -+AM_CONDITIONAL([WITH_YITIAN_NS_DECODE], [test x$enable_yitian_ns_decode = xyes || test x$enable_all == xyes]) -+AM_COND_IF([WITH_YITIAN_NS_DECODE], [USE_YITIAN_NS_DECODE="yes"], [USE_YITIAN_NS_DECODE="no"]) -+ - test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc - - CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" -@@ -201,4 +211,5 @@ compile time options summary - Memory Failure : $USE_MEMORY_FAILURE - Memory CE PFA : $USE_MEMORY_CE_PFA - AMP RAS errors : $USE_AMP_NS_DECODE -+ YITIAN RAS errors : $USE_YITIAN_NS_DECODE - EOF -diff --git a/non-standard-yitian.c b/non-standard-yitian.c -new file mode 100644 -index 0000000..99cea47 ---- /dev/null -+++ b/non-standard-yitian.c -@@ -0,0 +1,251 @@ -+/* -+ * Copyright (C) 2023 Alibaba Inc -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. 
-+ * -+ */ -+ -+#include -+#include -+#include -+#include -+#include "ras-record.h" -+#include "ras-logger.h" -+#include "ras-report.h" -+#include "ras-non-standard-handler.h" -+#include "non-standard-yitian.h" -+ -+static const char * const yitian_ddr_payload_err_reg_name[] = { -+ "Error Type:", -+ "Error SubType:", -+ "Error Instance:", -+ "ECCCFG0:", -+ "ECCCFG1:", -+ "ECCSTAT:", -+ "ECCERRCNT:", -+ "ECCCADDR0:", -+ "ECCCADDR1:", -+ "ECCCSYN0:", -+ "ECCCSYN1:", -+ "ECCCSYN2:", -+ "ECCUADDR0:", -+ "ECCUADDR1:", -+ "ECCUSYN0:", -+ "ECCUSYN1:", -+ "ECCUSYN2:", -+ "ECCBITMASK0:", -+ "ECCBITMASK1:", -+ "ECCBITMASK2:", -+ "ADVECCSTAT:", -+ "ECCAPSTAT:", -+ "ECCCDATA0:", -+ "ECCCDATA1:", -+ "ECCUDATA0:", -+ "ECCUDATA1:", -+ "ECCSYMBOL:", -+ "ECCERRCNTCTL:", -+ "ECCERRCNTSTAT:", -+ "ECCERRCNT0:", -+ "ECCERRCNT1:", -+ "RESERVED0:", -+ "RESERVED1:", -+ "RESERVED2:", -+}; -+ -+struct yitian_ras_type_info { -+ int id; -+ const char *name; -+ const char * const *sub; -+ int sub_num; -+}; -+ -+static const struct yitian_ras_type_info yitian_payload_error_type[] = { -+ { -+ .id = YITIAN_RAS_TYPE_DDR, -+ .name = "DDR", -+ }, -+ { -+ } -+}; -+ -+#ifdef HAVE_SQLITE3 -+static const struct db_fields yitian_ddr_payload_fields[] = { -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "address", .type = "INTEGER" }, -+ { .name = "regs_dump", .type = "TEXT" }, -+}; -+ -+static const struct db_table_descriptor yitian_ddr_payload_section_tab = { -+ .name = "yitian_ddr_reg_dump_event", -+ .fields = yitian_ddr_payload_fields, -+ .num_fields = ARRAY_SIZE(yitian_ddr_payload_fields), -+}; -+ -+int record_yitian_ddr_reg_dump_event(struct ras_ns_ev_decoder *ev_decoder, -+ struct ras_yitian_ddr_payload_event *ev) -+{ -+ int rc; -+ struct sqlite3_stmt *stmt = ev_decoder->stmt_dec_record; -+ -+ log(TERM, LOG_INFO, "yitian_ddr_reg_dump_event store: %p\n", stmt); -+ -+ sqlite3_bind_text (stmt, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_int64 
(stmt, 2, ev->address); -+ sqlite3_bind_text (stmt, 3, ev->reg_msg, -1, NULL); -+ -+ rc = sqlite3_step(stmt); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to do yitian_ddr_reg_dump_event step on sqlite: error = %d\n", rc); -+ rc = sqlite3_reset(stmt); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed reset yitian_ddr_reg_dump_event on sqlite: error = %d\n", rc); -+ log(TERM, LOG_INFO, "register inserted at db\n"); -+ -+ return rc; -+} -+#endif -+ -+static const char *oem_type_name(const struct yitian_ras_type_info *info, -+ uint8_t type_id) -+{ -+ const struct yitian_ras_type_info *type = &info[0]; -+ -+ for (; type->name; type++) { -+ if (type->id != type_id) -+ continue; -+ return type->name; -+ } -+ return "unknown"; -+} -+ -+static const char *oem_subtype_name(const struct yitian_ras_type_info *info, -+ uint8_t type_id, uint8_t sub_type_id) -+{ -+ const struct yitian_ras_type_info *type = &info[0]; -+ -+ for (; type->name; type++) { -+ const char * const *submodule = type->sub; -+ -+ if (type->id != type_id) -+ continue; -+ if (type->sub == NULL) -+ return type->name; -+ if (sub_type_id >= type->sub_num) -+ return "unknown"; -+ return submodule[sub_type_id]; -+ } -+ return "unknown"; -+} -+ -+void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, -+ struct trace_seq *s, -+ const struct yitian_ddr_payload_type_sec *err, -+ struct ras_events *ras) -+{ -+ char buf[1024]; -+ char *p = buf; -+ char *end = buf + 1024; -+ int i = 0; -+ const struct yitian_payload_header *header = &err->header; -+ uint32_t *pstart; -+ time_t now; -+ struct tm *tm; -+ struct ras_yitian_ddr_payload_event ev; -+ -+ const char *type_str = oem_type_name(yitian_payload_error_type, -+ header->type); -+ -+ const char *subtype_str = oem_subtype_name(yitian_payload_error_type, -+ header->type, header->subtype); -+ -+#ifdef HAVE_SQLITE3 -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { -+ if 
(ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -+ &yitian_ddr_payload_section_tab) != SQLITE_OK) { -+ trace_seq_printf(s, "create sql fail\n"); -+ return; -+ } -+ } -+#endif -+ -+ now = time(NULL); -+ tm = localtime(&now); -+ if (tm) -+ strftime(ev.timestamp, sizeof(ev.timestamp), -+ "%Y-%m-%d %H:%M:%S %z", tm); -+ //display error type -+ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); -+ p += snprintf(p, end - p, " %s,", type_str); -+ -+ //display error subtype -+ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); -+ p += snprintf(p, end - p, " %s,", subtype_str); -+ -+ //display error instance -+ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); -+ p += snprintf(p, end - p, " 0x%x,", header->instance); -+ -+ //display reg dump -+ for (pstart = (uint32_t *)&err->ecccfg0; (void *)pstart < (void *)(err + 1); pstart += 1) { -+ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); -+ p += snprintf(p, end - p, " 0x%x ", *pstart); -+ } -+ -+ if (p > buf && p < end) { -+ p--; -+ *p = '\0'; -+ } -+ -+ ev.reg_msg = malloc(p - buf + 1); -+ memcpy(ev.reg_msg, buf, p - buf + 1); -+ ev.address = 0; -+ -+ i = 0; -+ p = NULL; -+ end = NULL; -+ trace_seq_printf(s, "%s\n", buf); -+ -+#ifdef HAVE_SQLITE3 -+ record_yitian_ddr_reg_dump_event(ev_decoder, &ev); -+#endif -+ -+} -+ -+/* error data decoding functions */ -+static int decode_yitian710_ns_error(struct ras_events *ras, -+ struct ras_ns_ev_decoder *ev_decoder, -+ struct trace_seq *s, -+ struct ras_non_standard_event *event) -+{ -+ int payload_type = event->error[0]; -+ -+ if (payload_type == YITIAN_RAS_TYPE_DDR) { -+ const struct yitian_ddr_payload_type_sec *err = -+ (struct yitian_ddr_payload_type_sec *)event->error; -+ decode_yitian_ddr_payload_err_regs(ev_decoder, s, err, ras); -+ } else { -+ trace_seq_printf(s, "%s: wrong payload type\n", __func__); -+ return -1; -+ } -+ return 0; -+} -+ -+struct ras_ns_ev_decoder 
yitian_ns_oem_decoder[] = { -+ { -+ .sec_type = "a698081116ea4e4db936fb00a23ff29c", -+ .decode = decode_yitian710_ns_error, -+ }, -+}; -+ -+static void __attribute__((constructor)) yitian_ns_init(void) -+{ -+ int i; -+ for (i = 0; i < ARRAY_SIZE(yitian_ns_oem_decoder); i++) -+ register_ns_ev_decoder(&yitian_ns_oem_decoder[i]); -+} -diff --git a/non-standard-yitian.h b/non-standard-yitian.h -new file mode 100644 -index 0000000..b7d6a2d ---- /dev/null -+++ b/non-standard-yitian.h -@@ -0,0 +1,73 @@ -+/* -+ * Copyright (C) 2023 Alibaba Inc -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ */ -+ -+ -+#ifndef __NON_STANDARD_YITIAN_H -+#define __NON_STANDARD_YITIAN_H -+ -+#include "ras-events.h" -+#include "libtrace/event-parse.h" -+ -+#define YITIAN_RAS_TYPE_DDR 0x50 -+ -+struct yitian_payload_header { -+ uint8_t type; -+ uint8_t subtype; -+ uint16_t instance; -+}; -+ -+struct yitian_ddr_payload_type_sec { -+ struct yitian_payload_header header; -+ uint32_t ecccfg0; -+ uint32_t ecccfg1; -+ uint32_t eccstat; -+ uint32_t eccerrcnt; -+ uint32_t ecccaddr0; -+ uint32_t ecccaddr1; -+ uint32_t ecccsyn0; -+ uint32_t ecccsyn1; -+ uint32_t ecccsyn2; -+ uint32_t eccuaddr0; -+ uint32_t eccuaddr1; -+ uint32_t eccusyn0; -+ uint32_t eccusyn1; -+ uint32_t eccusyn2; -+ uint32_t eccbitmask0; -+ uint32_t eccbitmask1; -+ uint32_t eccbitmask2; -+ uint32_t adveccstat; -+ uint32_t eccapstat; -+ uint32_t ecccdata0; -+ uint32_t ecccdata1; -+ uint32_t eccudata0; -+ uint32_t eccudata1; -+ uint32_t eccsymbol; -+ uint32_t eccerrcntctl; -+ uint32_t eccerrcntstat; -+ uint32_t eccerrcnt0; -+ uint32_t eccerrcnt1; -+ uint32_t reserved0; -+ uint32_t reserved1; -+ uint32_t reserved2; -+}; -+ -+struct ras_yitian_ddr_payload_event { -+ char timestamp[64]; -+ unsigned long long address; -+ 
char *reg_msg; -+}; -+ -+int record_yitian_ddr_reg_dump_event(struct ras_ns_ev_decoder *ev_decoder, -+ struct ras_yitian_ddr_payload_event *ev); -+void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, -+ struct trace_seq *s, -+ const struct yitian_ddr_payload_type_sec *err, -+ struct ras_events *ras); -+#endif --- -2.33.1 - diff --git a/2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch b/2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch deleted file mode 100644 index b508066..0000000 --- a/2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch +++ /dev/null @@ -1,105 +0,0 @@ -From 2e30517b9584ee8ae99553400168e07afce8ff9c Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Thu, 7 Sep 2023 18:22:06 +0800 -Subject: [PATCH 4/4] rasdaemon: ras-mc-ctl: Add support to display the THead - vendor errors - -Add support for the THead YiTian DDRC register dump event. - -Signed-off-by: Ruidong Tian ---- - util/ras-mc-ctl.in | 43 +++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 43 insertions(+) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 1e3aeb7..d30fca4 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1528,6 +1528,7 @@ sub errors - use constant { - HISILICON_KUNPENG_920 => "Kunpeng920", - HISILICON_KUNPENG_9XX => "Kunpeng9xx", -+ THEAD_YITIAN_7XX => "YiTian7XX", - }; - - sub vendor_errors_summary -@@ -1536,6 +1537,7 @@ sub vendor_errors_summary - my ($num_args, $platform_id); - my ($query, $query_handle, $count, $out); - my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info); -+ my ($address); - - $num_args = $#ARGV + 1; - $platform_id = 0; -@@ -1628,6 +1630,24 @@ sub vendor_errors_summary - $query_handle->finish; - } - -+ # THead Yitian710 DDR errors -+ if ($platform_id eq THEAD_YITIAN_7XX) { -+ $query = "select address, count(*) from yitian_ddr_reg_dump_event"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ 
$query_handle->bind_columns(\($address, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\terrors: $count"; -+ } -+ if ($out ne "") { -+ print "THead YiTian710 DDR error dump events summary:\n$out\n"; -+ } else { -+ print "No THead YiTian710 DDR error dump errors.\n\n"; -+ } -+ $query_handle->finish; -+ } -+ - undef($dbh); - } - -@@ -1638,6 +1658,7 @@ sub vendor_errors - my ($query, $query_handle, $id, $timestamp, $out); - my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id); - my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs); -+ my ($address, $regs_dump); - - $num_args = $#ARGV + 1; - $platform_id = 0; -@@ -1743,6 +1764,27 @@ sub vendor_errors - $query_handle->finish; - } - -+ # THead Yitian7xx ddr errors -+ if ($platform_id eq THEAD_YITIAN_7XX) { -+ $query = "select id, timestamp, address, regs_dump from yitian_ddr_reg_dump_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $address, $regs_dump)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id. 
$timestamp "; -+ $out .= "Error Address: $address "; -+ $out .= "Error Registers Dump: $regs_dump" if ($regs_dump); -+ $out .= "\n\n"; -+ } -+ if ($out ne "") { -+ print "THead Yitian710 DDRC error events:\n$out\n"; -+ } else { -+ print "No THead Yitian710 DDRC error events.\n"; -+ } -+ $query_handle->finish; -+ } -+ - undef($dbh); - } - -@@ -1751,6 +1793,7 @@ sub vendor_platforms - print "\nSupported platforms for the vendor-specific errors:\n"; - print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n"; - print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; -+ print "\tTHead Yitian7xx, platform-id=\"", THEAD_YITIAN_7XX, "\"\n"; - print "\n"; - } - --- -2.33.1 - diff --git a/2d15882a0cbfce0b905039bebc811ac8311cd739.patch b/2d15882a0cbfce0b905039bebc811ac8311cd739.patch new file mode 100644 index 0000000..1791705 --- /dev/null +++ b/2d15882a0cbfce0b905039bebc811ac8311cd739.patch @@ -0,0 +1,105 @@ +commit 2d15882a0cbfce0b905039bebc811ac8311cd739 +Author: Muralidhara M K +Date: Fri Jun 30 11:19:42 2023 +0000 + + rasdaemon: Handle reassigned bit definitions for UMC bank + + On some AMD systems some of the existing bit definitions in the + CTL register of SMCA bank type are reassigned without defining + new HWID and McaType. Consequently, the errors whose bit + definitions have been reassigned in the CTL register are being + erroneously decoded. + + Add new error description structure to compensate for the + reassigned bit definitions, by new software defined SMCA bank + type by utilizing the hardware-reserved values for HWID. + The new SMCA bank type will only be employed for UMC error + decoding on affected models and the existing error description + structure for UMC bank type is still valid. 
+ + Signed-off-by: Muralidhara M K + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index fc51b5a..54060ee 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -60,6 +60,7 @@ enum smca_bank_types { + SMCA_CS_V2_QUIRK, + SMCA_PIE, /* Power, Interrupts, etc. */ + SMCA_UMC, /* Unified Memory Controller */ ++ SMCA_UMC_QUIRK, + SMCA_UMC_V2, + SMCA_MA_LLC, /* Memory Attached Last Level Cache */ + SMCA_PB, /* Parameter Block */ +@@ -313,6 +314,25 @@ static const char * const smca_umc_mce_desc[] = { + "Read CRC Error", + }; + ++static const char * const smca_umc_quirk_mce_desc[] = { ++ "DRAM On Die ECC error", ++ "Data poison error", ++ "SDP parity error", ++ "Reserved", ++ "Address/Command parity error", ++ "HBM Write data parity error", ++ "Consolidated SRAM ECC error", ++ "Reserved", ++ "Reserved", ++ "Rdb SRAM ECC error", ++ "Thermal throttling", ++ "HBM Read Data Parity error", ++ "Reserved", ++ "UMC FW Error", ++ "SRAM Parity Error", ++ "HBM CRC Error", ++}; ++ + static const char * const smca_umc2_mce_desc[] = { + "DRAM ECC error", + "Data poison error", +@@ -642,6 +662,7 @@ static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)}, + [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, + [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, ++ [SMCA_UMC_QUIRK] = { smca_umc_quirk_mce_desc, ARRAY_SIZE(smca_umc_quirk_mce_desc) }, + [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) }, + [SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) }, + [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, +@@ -696,6 +717,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, 0x00000096 }, ++ { SMCA_UMC_QUIRK, 0x00020000 }, + /* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. 
*/ + { SMCA_UMC_V2, 0x00010096 }, + /* Memory Attached Last Level Cache */ +@@ -764,7 +786,7 @@ static struct smca_bank_name smca_names[] = { + [SMCA_L3_CACHE] = { "L3 Cache" }, + [SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" }, + [SMCA_PIE] = { "Power, Interrupts, etc." }, +- [SMCA_UMC] = { "Unified Memory Controller" }, ++ [SMCA_UMC ... SMCA_UMC_QUIRK] = { "Unified Memory Controller" }, + [SMCA_UMC_V2] = { "Unified Memory Controller V2" }, + [SMCA_MA_LLC] = { "Memory Attached Last Level Cache" }, + [SMCA_PB] = { "Parameter Block" }, +@@ -843,6 +865,10 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype) + if (*hwid_mcatype == 0x0002002E) + *hwid_mcatype = 0x00010000; + break; ++ case 0x90 ... 0x9F: ++ if ((*hwid_mcatype & 0xFF) == 0x00000096) ++ *hwid_mcatype = 0x00020000; ++ break; + default: + break; + } +@@ -908,7 +934,7 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m) + smca_mce_descs[bank_type].descs[xec], + xec); + +- if (bank_type == SMCA_UMC && xec == 0) { ++ if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) { + channel = find_umc_channel(e); + csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */ + mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", diff --git a/3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch b/3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch deleted file mode 100644 index 908333a..0000000 --- a/3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch +++ /dev/null @@ -1,660 +0,0 @@ -From 0fd49ba8f1af285c7f607b3c8a669942631fd259 Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Fri, 7 Jun 2024 11:26:06 +0800 -Subject: [PATCH 1/6] rasdaemon: add mc_event and mce_record trigger - -Allow users to run a trigger when mc_event and mce_record occurs, The -trigger is separated into CE trigger and UE trigger, this is because -CE is more frequent than UE, and the CE trigger will lead to more -performance hits. 
Users can choose different triggers for CE/UE to -reduce this effect. - -To prevent triggering hangs or consuming excessive time, there is a -default timeout of 1s, trigger will be killed if timeout, user can -modify timeout by setting environment *_TIMEOUT or delete timeout by -setting *_TIMEOUT to 0. - -Environment of trigger in /etc/sysconfig/rasdaemon: - -TRIGGER_DIR: The trigger diretory - -MC_CE_TRIGGER: The script executed when corrected mc_event occurs. -MC_CE_TRIGGER_TIMEOUT: Timeout(seconds) for MC_CE_TRIGGER, set 0 to -delete timeout. -MC_UE_TRIGGER: The script executed when uncorrected mc_event occurs. -MC_UE_TRIGGER_TIMEOUT: Timeout(seconds) for MC_UE_TRIGGER, set 0 to -delete timeout. - -MCE_CE_TRIGGER: The script executed when corrected mce_record occurs. -MCE_CE_TRIGGER_TIMEOUT: Timeout(seconds) for MCE_CE_TRIGGER, set 0 to -delete timeout. -MCE_UE_TRIGGER: The script executed when uncorrected mce_record occurs. -MCE_UE_TRIGGER_TIMEOUT: Timeout(seconds) for MCE_UE_TRIGGER, set 0 to -delete timeout. - -No script will be executed if *_CE_TRIGGER/*_UE_TRIGGER is null. 
- -Signed-off-by: Ruidong Tian ---- - Makefile.am | 8 +- - contrib/mc_event_trigger | 24 +++++ - contrib/mce_record_trigger | 36 ++++++++ - misc/rasdaemon.env | 31 +++++++ - ras-events.c | 3 + - ras-mc-handler.c | 63 +++++++++++++ - ras-mce-handler.c | 89 ++++++++++++++++++ - trigger.c | 184 +++++++++++++++++++++++++++++++++++++ - trigger.h | 29 ++++++ - 9 files changed, 463 insertions(+), 4 deletions(-) - create mode 100755 contrib/mc_event_trigger - create mode 100755 contrib/mce_record_trigger - create mode 100644 trigger.c - create mode 100644 trigger.h - -diff --git a/Makefile.am b/Makefile.am -index f410c6d..2e4fe39 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -17,7 +17,7 @@ all-local: $(SYSTEMD_SERVICES) - - sbin_PROGRAMS = rasdaemon - rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ -- bitfield.c -+ bitfield.c trigger.c - if WITH_SQLITE3 - rasdaemon_SOURCES += ras-record.c - endif -@@ -74,7 +74,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ - non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ -- non-standard-yitian.h -+ non-standard-yitian.h trigger.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that -@@ -101,6 +101,6 @@ upload: - # custom target - install-data-local: - $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" --if WITH_MEMORY_CE_PFA -+ $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers" - $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" --endif -+ $(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/" -diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger -new file mode 100755 -index 0000000..5c6ccfa ---- /dev/null -+++ b/contrib/mc_event_trigger -@@ -0,0 +1,24 @@ 
-+#!/bin/sh -+# This shell script can be executed by rasdaemon in daemon mode when a -+# mc_event is occured, environment variables include all information -+# reported by tracepoint. -+# -+# environment: -+# TIMESTAMP Timestamp when error occurred -+# COUNT Number of errors of the same type -+# TYPE Error type from Corrected/Uncorrected -+# MESSAGE Error message -+# LABEL Label of the affected DIMM(s) -+# MC_INDEX DIMM identifier from DMI/SMBIOS if available -+# TOP_LAYER Top layer of the error -+# MIDDLE_LAYER Middle layer of the error -+# LOWER_LAYER Low layer of the error -+# ADDRESS Error address -+# GRAIN Minimum granularity for an error report, in bytes -+# SYNDROME Syndrome of the error (or 0 if unknown or if the syndrome is not applicable) -+# DRIVER_DETAIL Other driver-specific detail about the error -+# -+ -+[ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local -+ -+exit 0 -diff --git a/contrib/mce_record_trigger b/contrib/mce_record_trigger -new file mode 100755 -index 0000000..06a52d9 ---- /dev/null -+++ b/contrib/mce_record_trigger -@@ -0,0 +1,36 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon in daemon mode when a -+# mc_event is occured, environment variables include all information -+# reported by tracepoint. 
-+# -+# environment: -+# MCGCAP MCGCAP MSR: machine check capabilities of CPU -+# MCGSTATUS Machine Check Global Status MSR -+# STATUS Bank's MCi_STATUS MSR -+# ADDR Bank's MCi_ADDR MSR -+# MISC Bank's MCi_MISC MSR -+# IP Instruction Pointer when the error happened -+# TSC CPU time stamp counter -+# WALLTIME Wall time_t when error was detected -+# CPU CPU number; obsoleted by extcpu -+# CPUID CPUID 1 EAX -+# APICID CPU initial APIC ID -+# SOCKETID CPU socket ID -+# CS Code segment -+# BANK Machine check bank reporting the error -+# CPUVENDOR Kernel's X86_VENDOR enum -+# SYND MCA_SYND MSR: only valid on SMCA systems -+# IPID MCA_IPID MSR: only valid on SMCA systems -+# TIMESTAMP Rasdaemon timestamp -+# BANK_NAME Decode ban name -+# ERROR_MSG Vendor define error message -+# MCGSTATUS_MSG Decode mcgstatus -+# MCISTATUS_MSG Decode mcistatus -+# MCASTATUS_MSG Decode mcastatus -+# USER_ACTION Recommendations for actions users should take -+# MC_LOCATION Error location in MC -+# -+ -+[ -x ./mce_record_trigger.local ] && . ./mce_record_trigger.local -+ -+exit 0 -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 713875a..9f8e606 100644 ---- a/misc/rasdaemon.env -+++ b/misc/rasdaemon.env -@@ -31,3 +31,34 @@ PAGE_CE_ACTION="soft" - # Notices script when doing memory offline - PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" - PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" -+ -+# Event Trigger -+ -+# Event trigger will be executed when the specified event occurs. -+# -+# Execute triggers path -+# For example: TRIGGER_DIR=/etc/ras/triggers -+TRIGGER_DIR= -+ -+# Execute these triggers when the mc_event occured, the triggers will not -+# be executed if the trigger is not specified. -+# You can set timeout for trigger, trigger thread will be killed if timeout. -+# The default timeout is 1, if you do not want any timeout, set it to 0. 
-+# For example: -+# MC_CE_TRIGGER=mc_event_trigger -+# MC_UE_TRIGGER=mc_event_trigger -+# MC_CE_TRIGGER_TIMEOUT=1 -+# MC_UE_TRIGGER_TIMEOUT=1 -+ -+# trigger for mc_event -+MC_CE_TRIGGER= -+MC_UE_TRIGGER= -+MC_CE_TRIGGER_TIMEOUT=0 -+MC_UE_TRIGGER_TIMEOUT=0 -+ -+# trigger for mce_record -+MCE_CE_TRIGGER= -+MCE_UE_TRIGGER= -+MCE_CE_TRIGGER_TIMEOUT=0 -+MCE_UE_TRIGGER_TIMEOUT=0 -+ -diff --git a/ras-events.c b/ras-events.c -index fe4bd26..016f531 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -41,6 +41,7 @@ - #include "ras-record.h" - #include "ras-logger.h" - #include "ras-page-isolation.h" -+#include "trigger.h" - - /* - * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -815,6 +816,8 @@ int handle_ras_events(int record_events) - ras_page_account_init(); - #endif - -+ trigger_setup(); -+ - rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event", - ras_mc_event_handler, NULL, MC_EVENT); - if (!rc) -diff --git a/ras-mc-handler.c b/ras-mc-handler.c -index 42b05cd..0081d95 100644 ---- a/ras-mc-handler.c -+++ b/ras-mc-handler.c -@@ -15,16 +15,73 @@ - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -+#define _GNU_SOURCE - #include - #include - #include - #include - #include "libtrace/kbuffer.h" -+#include - #include "ras-mc-handler.h" - #include "ras-record.h" - #include "ras-logger.h" - #include "ras-page-isolation.h" - #include "ras-report.h" -+#include "trigger.h" -+ -+struct event_trigger mc_ce_trigger = {"mc_event", "MC_CE_TRIGGER"}; -+struct event_trigger mc_ue_trigger = {"mc_event", "MC_UE_TRIGGER"}; -+ -+static void run_mc_trigger(struct ras_mc_event *ev, -+ struct event_trigger *trigger, -+ struct trace_seq *s) -+{ -+ char *env[MAX_ENV]; -+ int ei = 0, i; -+ char msg[4096]; -+ -+ if (!trigger->path || !strcmp(trigger->path, "")) -+ return; -+ -+ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: 
"/sbin:/usr/sbin:/bin:/usr/bin") < 0) -+ goto free; -+ if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "COUNT=%d", ev->error_count) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "TYPE=%s", ev->error_type) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MESSAGE=%s", ev->msg) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "LABEL=%s", ev->label) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MC_INDEX=%d", ev->mc_index) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "TOP_LAYER=%d", ev->top_layer) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MIDDLE_LAYER=%d", ev->middle_layer) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "LOWER_LAYER=%d", ev->lower_layer) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "ADDRESS=%llx", ev->address) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "GRAIN=%lld", ev->grain) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "SYNDROME=%llx", ev->syndrome) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "DRIVER_DETAIL=%s", ev->driver_detail) < 0) -+ goto free; -+ env[ei] = NULL; -+ assert(ei < MAX_ENV); -+ -+ run_trigger(trigger, NULL, env, "mc_event", msg); -+ -+ trace_seq_printf(s, " %s", msg); -+ -+free: -+ for (i = 0; i < ei; i++) -+ free(env[i]); -+} - - int ras_mc_event_handler(struct trace_seq *s, - struct pevent_record *record, -@@ -195,6 +252,12 @@ int ras_mc_event_handler(struct trace_seq *s, - ras_report_mc_event(ras, &ev); - #endif - -+ if (!strcmp(ev.error_type, "Corrected")) -+ run_mc_trigger(&ev, &mc_ce_trigger, s); -+ -+ if (!strcmp(ev.error_type, "Uncorrected")) -+ run_mc_trigger(&ev, &mc_ue_trigger, s); -+ - return 0; - - parse_error: -diff --git a/ras-mce-handler.c b/ras-mce-handler.c -index 805004a..ac2c4a1 100644 ---- a/ras-mce-handler.c -+++ b/ras-mce-handler.c -@@ -15,6 +15,7 @@ - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -+#define _GNU_SOURCE - #include - 
#include - #include -@@ -22,11 +23,13 @@ - #include - #include - #include -+#include - #include "libtrace/kbuffer.h" - #include "ras-mce-handler.h" - #include "ras-record.h" - #include "ras-logger.h" - #include "ras-report.h" -+#include "trigger.h" - - /* - * The code below were adapted from Andi Kleen/Intel/SuSe mcelog code, -@@ -233,6 +236,85 @@ ret: - return ret; - } - -+struct event_trigger mce_ce_trigger = {"mce_record", "MCE_CE_TRIGGER"}; -+struct event_trigger mce_de_trigger = {"mce_record", "MCE_DE_TRIGGER"}; -+struct event_trigger mce_ue_trigger = {"mce_record", "MCE_UE_TRIGGER"}; -+ -+static void run_mce_trigger(struct mce_event *e, -+ struct event_trigger *trigger, -+ struct trace_seq *s) -+{ -+ char *env[MAX_ENV]; -+ int ei = 0, i; -+ char msg[4096]; -+ -+ if (!trigger->path || !strcmp(trigger->path, "")) -+ return; -+ -+ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MCGCAP=%#lx", e->mcgcap) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MCGSTATUS=%#lx", e->mcgstatus) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "STATUS=%#lx", e->status) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "ADDR=%#lx", e->addr) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MISC=%#lx", e->misc) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "IP=%#lx", e->ip) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "TSC=%#lx", e->tsc) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "WALLTIME=%#lx", e->walltime) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "CPU=%#x", e->cpu) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "CPUID=%#x", e->cpuid) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "APICID=%#x", e->apicid) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "SOCKETID=%#x", e->socketid) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "CS=%#x", e->cs) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "BANK=%#x", e->bank) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "CPUVENDOR=%#x", e->cpuvendor) 
< 0) -+ goto free; -+ if (asprintf(&env[ei++], "SYND=%#lx", e->synd) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "IPID=%#lx", e->ipid) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "TIMESTAMP=%s", e->timestamp) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "BANK_NAME=%s", e->bank_name) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "ERROR_MSG=%s", e->error_msg) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MCGSTATUS_MSG=%s", e->mcgstatus_msg) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MCISTATUS_MSG=%s", e->mcistatus_msg) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MCASTATUS_MSG=%s", e->mcastatus_msg) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "USER_ACTION=%s", e->user_action) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MC_LOCATION=%s", e->mc_location) < 0) -+ goto free; -+ env[ei] = NULL; -+ assert(ei < MAX_ENV); -+ -+ run_trigger(trigger, NULL, env, "mce_record", msg); -+ -+ trace_seq_printf(s, " %s", msg); -+ -+free: -+ for (i = 0; i < ei; i++) -+ free(env[i]); -+} -+ - int register_mce_handler(struct ras_events *ras, unsigned ncpus) - { - int rc; -@@ -480,5 +562,12 @@ int ras_mce_event_handler(struct trace_seq *s, - ras_report_mce_event(ras, &e); - #endif - -+ if (e.status & MCI_STATUS_UC) -+ run_mce_trigger(&e, &mce_ue_trigger, s); -+ else if (e.status & MCI_STATUS_DEFERRED) -+ run_mce_trigger(&e, &mce_de_trigger, s); -+ else -+ run_mce_trigger(&e, &mce_ce_trigger, s); -+ - return 0; - } -diff --git a/trigger.c b/trigger.c -new file mode 100644 -index 0000000..8716f50 ---- /dev/null -+++ b/trigger.c -@@ -0,0 +1,184 @@ -+#define _GNU_SOURCE -+#include -+#include -+#include -+#include -+#include -+#include "ras-logger.h" -+#include "trigger.h" -+#include -+ -+#define READ 0 -+#define WRITE 1 -+ -+static int child_done, alarm_done; -+static char *trigger_dir; -+ -+static void child_handler(int sig) -+{ -+ child_done = 1; -+} -+ -+static void alarm_handler(int sig) -+{ -+ alarm_done = 1; -+} -+ -+void run_trigger(struct event_trigger *t, 
char *argv[], char **env, -+ const char* reporter, char *msg) -+{ -+ pid_t child; -+ char *path, err[256] = {0}, *trigger = t->path; -+ int status, pipe_stdout[2], pipe_stderr[2], timeout = t->timeout; -+ ssize_t byte = 0; -+ -+ log(TERM, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter); -+ -+ if (asprintf(&path, "%s/%s", trigger_dir, trigger) < 0) -+ return; -+ -+ if (pipe(pipe_stdout) == -1) -+ exit(EXIT_FAILURE); -+ -+ if (pipe(pipe_stderr) == -1) -+ exit(EXIT_FAILURE); -+ -+ child = fork(); -+ if (child < 0) { -+ log(TERM, LOG_ERR, "Cannot create process for trigger"); -+ return; -+ } else if (child == 0) { -+ close(pipe_stdout[READ]); -+ close(pipe_stderr[READ]); -+ dup2(pipe_stdout[WRITE], 1); -+ dup2(pipe_stderr[WRITE], 2); -+ close(pipe_stdout[WRITE]); -+ close(pipe_stderr[WRITE]); -+ -+ execve(path, argv, env); -+ exit(EXIT_FAILURE); -+ } -+ -+ signal(SIGCHLD, child_handler); -+ -+ close(pipe_stdout[WRITE]); -+ close(pipe_stderr[WRITE]); -+ -+ if (timeout) { -+ signal(SIGALRM, alarm_handler); -+ alarm(timeout); -+ } -+ -+ pause(); -+ -+ if (child_done) { -+ if (waitpid(child, &status, WNOHANG) == child){ -+ if (WIFEXITED(status) && WEXITSTATUS(status)) -+ log(TERM, LOG_INFO, "Trigger %s exited with status %d\n", -+ trigger, WEXITSTATUS(status)); -+ else if (WIFSIGNALED(status)) -+ log(TERM, LOG_INFO, "Trigger %s killed by signal %d\n", -+ trigger, WTERMSIG(status)); -+ } -+ alarm(0); -+ } else if (alarm_done) { -+ log(TERM, LOG_ERR, "Trigger timeout, kill it\n"); -+ kill(child, SIGKILL); -+ } -+ signal(SIGCHLD, SIG_DFL); -+ signal(SIGALRM, SIG_DFL); -+ -+ byte = read(pipe_stderr[READ], err, 256); -+ if (byte > 0) -+ log(TERM, LOG_ERR, "Trigger stderr: %s\n", err); -+ else if (byte < 0) -+ log(TERM, LOG_ERR, "Trigger error : %s\n", strerror(byte)); -+ -+ byte = read(pipe_stdout[READ], msg, 4096); -+ if (byte < 0) -+ log(TERM, LOG_ERR, "Trigger error : %s\n", strerror(byte)); -+ -+ close(pipe_stdout[READ]); -+ close(pipe_stderr[READ]); 
-+} -+ -+int trigger_check(char *s) -+{ -+ char *name; -+ int rc; -+ -+ if (trigger_dir) { -+ if (asprintf(&name, "%s/%s", trigger_dir, s) < 0) -+ return -1; -+ } else -+ name = s; -+ -+ rc = access(name, R_OK|X_OK); -+ -+ if (trigger_dir) -+ free(name); -+ -+ return rc; -+} -+ -+static struct event_trigger *event_triggers[] = { -+ &mc_ce_trigger, -+ &mc_ue_trigger, -+#ifdef HAVE_MCE -+ &mce_ce_trigger, -+ &mce_de_trigger, -+ &mce_ue_trigger, -+#endif -+}; -+ -+void trigger_setup(void) -+{ -+ int i, j; -+ struct event_trigger *trigger; -+ char *s, timeout_env[30]; -+ -+ trigger_dir = getenv("TRIGGER_DIR"); -+ -+ for (i = 0; i < ARRAY_SIZE(event_triggers); i++) { -+ trigger = event_triggers[i]; -+ -+ s = getenv(trigger->env); -+ if (!s || !strcmp(s, "")) -+ continue; -+ -+ trigger->path = s; -+ if (trigger_check(s) < 0) -+ log(SYSLOG, LOG_ERR, "Cannot access trigger `%s`\n", s); -+ else -+ log(SYSLOG, LOG_NOTICE, "Setup %s trigger `%s`\n", -+ trigger->event_name, s); -+ -+ sprintf(timeout_env, "%s_TIMEOUT", trigger->env); -+ -+ trigger->timeout = 1; -+ s = getenv(timeout_env); -+ if (!s || !strcmp(s, "")) { -+ log(SYSLOG, LOG_NOTICE, -+ "Setup %s trigger default timeout 1s", -+ trigger->event_name); -+ continue; -+ } -+ -+ j = atoi(s); -+ if (j < 0) -+ log(SYSLOG, LOG_ERR, -+ "Invalid %s trigger timeout `%d`" -+ "use default value: 1s\n", -+ trigger->event_name, j); -+ else if (j == 0) { -+ log(SYSLOG, LOG_NOTICE, -+ "%s trigger no timeout\n", trigger->event_name); -+ trigger->timeout = 0; -+ } else { -+ log(SYSLOG, LOG_NOTICE, -+ "Setup %s trigger timeout `%d`s\n", -+ trigger->event_name, j); -+ trigger->timeout = j; -+ } -+ } -+} -diff --git a/trigger.h b/trigger.h -new file mode 100644 -index 0000000..8a6e380 ---- /dev/null -+++ b/trigger.h -@@ -0,0 +1,29 @@ -+#ifndef __TRIGGER_H__ -+#define __TRIGGER_H__ -+ -+#include "config.h" -+ -+#define MAX_ENV 30 -+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) -+ -+struct event_trigger { -+ const char *event_name; -+ 
const char *env; -+ char *path; -+ int timeout; -+}; -+ -+int trigger_check(char *s); -+void run_trigger(struct event_trigger *t, char *argv[], char **env, -+ const char* reporter, char *msg); -+void trigger_setup(void); -+ -+extern struct event_trigger mc_ce_trigger; -+extern struct event_trigger mc_ue_trigger; -+#ifdef HAVE_MCE -+extern struct event_trigger mce_ce_trigger; -+extern struct event_trigger mce_de_trigger; -+extern struct event_trigger mce_ue_trigger; -+#endif -+ -+#endif --- -2.33.1 - diff --git a/3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch b/3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch deleted file mode 100644 index e2af5ee..0000000 --- a/3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch +++ /dev/null @@ -1,104 +0,0 @@ -From 248531d736be425ea1a767def8176e04bac3d819 Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Tue, 12 Dec 2023 10:46:11 +0800 -Subject: [PATCH 2/6] rasdaemon: Do't process Ampere specific error in the - public code - -Ampere specific error info and error handler need to included in -HAVE_AMP_NS_DECODE macro. 
- -Signed-off-by: Ruidong Tian ---- - ras-arm-handler.c | 7 +++---- - ras-record.c | 4 ++++ - ras-record.h | 2 ++ - 3 files changed, 9 insertions(+), 4 deletions(-) - -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 1149dc6..d81daec 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -51,7 +51,6 @@ int ras_arm_event_handler(struct trace_seq *s, - time_t now; - struct tm *tm; - struct ras_arm_event ev; -- int len = 0; - memset(&ev, 0, sizeof(ev)); - - /* -@@ -99,6 +98,9 @@ int ras_arm_event_handler(struct trace_seq *s, - ev.psci_state = val; - trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); - -+#ifdef HAVE_AMP_NS_DECODE -+ int len = 0; -+ - if (pevent_get_field_val(s, event, "pei_len", record, &val, 1) < 0) - return -1; - ev.pei_len = val; -@@ -131,12 +133,9 @@ int ras_arm_event_handler(struct trace_seq *s, - if (!ev.vsei_error) - return -1; - --#ifdef HAVE_AMP_NS_DECODE - //decode ampere specific error - decode_amp_payload0_err_regs(NULL, s, - (struct amp_payload0_type_sec *)ev.vsei_error); --#else -- display_raw_data(s, ev.vsei_error, ev.oem_len); - #endif - - /* Insert data into the SGBD */ -diff --git a/ras-record.c b/ras-record.c -index d845f81..04ad094 100644 ---- a/ras-record.c -+++ b/ras-record.c -@@ -212,9 +212,11 @@ static const struct db_fields arm_event_fields[] = { - { .name="mpidr", .type="INTEGER" }, - { .name="running_state", .type="INTEGER" }, - { .name="psci_state", .type="INTEGER" }, -+#ifdef HAVE_AMP_NS_DECODE - { .name="err_info", .type="BLOB" }, - { .name="context_info", .type="BLOB" }, - { .name="vendor_info", .type="BLOB" }, -+#endif - }; - - static const struct db_table_descriptor arm_event_tab = { -@@ -238,12 +240,14 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) - sqlite3_bind_int64 (priv->stmt_arm_record, 4, ev->mpidr); - sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state); - sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state); -+#ifdef HAVE_AMP_NS_DECODE - 
sqlite3_bind_blob (priv->stmt_arm_record, 7, - ev->pei_error, ev->pei_len, NULL); - sqlite3_bind_blob (priv->stmt_arm_record, 8, - ev->ctx_error, ev->ctx_len, NULL); - sqlite3_bind_blob (priv->stmt_arm_record, 9, - ev->vsei_error, ev->oem_len, NULL); -+#endif - - rc = sqlite3_step(priv->stmt_arm_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) -diff --git a/ras-record.h b/ras-record.h -index d9f7733..86678b2 100644 ---- a/ras-record.h -+++ b/ras-record.h -@@ -77,12 +77,14 @@ struct ras_arm_event { - int64_t midr; - int32_t running_state; - int32_t psci_state; -+#ifdef HAVE_AMP_NS_DECODE - const uint8_t *pei_error; - uint32_t pei_len; - const uint8_t *ctx_error; - uint32_t ctx_len; - const uint8_t *vsei_error; - uint32_t oem_len; -+#endif - }; - - struct devlink_event { --- -2.33.1 - diff --git a/3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch b/3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch deleted file mode 100644 index 229e0c4..0000000 --- a/3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch +++ /dev/null @@ -1,56 +0,0 @@ -From dce53f6809c4fdab967ecc78f80c8ec2ebd89aca Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Wed, 20 Oct 2021 14:33:37 +0800 -Subject: [PATCH 3/6] rasdaemon: Fix the issue of sprintf data type mismatch in - uuid_le() - -The data type of sprintf called in the function uuid_le() is mismatch. -Arm64 compiler force it to unsigned char by default, and can work normally. -But if someone compile it with the option -fsigned-char, the function -can't work correctly. 
- -Signed-off-by: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - ras-extlog-handler.c | 2 +- - ras-non-standard-handler.c | 4 ++-- - 2 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c -index 5fd3580..1834687 100644 ---- a/ras-extlog-handler.c -+++ b/ras-extlog-handler.c -@@ -152,7 +152,7 @@ static char *uuid_le(const char *uu) - static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; - - for (i = 0; i < 16; i++) { -- p += sprintf(p, "%.2x", uu[le[i]]); -+ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); - switch (i) { - case 3: - case 5: -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 7818ed8..86178bf 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -36,7 +36,7 @@ static char *uuid_le(const char *uu) - static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; - - for (i = 0; i < 16; i++) { -- p += sprintf(p, "%.2x", uu[le[i]]); -+ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); - switch (i) { - case 3: - case 5: -@@ -61,7 +61,7 @@ static int uuid_le_cmp(const char *sec_type, const char *uuid2) - 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; - - for (i = 0; i < 16; i++) -- p += sprintf(p, "%.2x", sec_type[le[i]]); -+ p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]); - *p = 0; - return strncmp(uuid1, uuid2, 32); - } --- -2.33.1 - diff --git a/3004-rasdaemon-ensure-trace_clock-file-exist.patch b/3004-rasdaemon-ensure-trace_clock-file-exist.patch deleted file mode 100644 index eed9c40..0000000 --- a/3004-rasdaemon-ensure-trace_clock-file-exist.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 5cfecb69e04d964d4f71f4ccd2a6ce1fc2690f78 Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Thu, 30 May 2024 19:13:21 +0800 -Subject: [PATCH 4/6] rasdaemon: ensure trace_clock file exist - -Fix https://github.com/mchehab/rasdaemon/issues/74 - -Signed-off-by: Ruidong Tian ---- - ras-events.c | 16 
+++++++++++++++- - 1 file changed, 15 insertions(+), 1 deletion(-) - -diff --git a/ras-events.c b/ras-events.c -index 016f531..544c418 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -98,6 +98,18 @@ static int get_debugfs_dir(char *tracing_dir, size_t len) - return ENOENT; - } - -+static int stat_trace(struct ras_events *ras, char *name) -+{ -+ char fname[MAX_PATH + 1]; -+ struct stat file_info; -+ -+ strcpy(fname, ras->tracing); -+ strcat(fname, "/"); -+ strcat(fname, name); -+ -+ return stat(fname, &file_info); -+} -+ - static int open_trace(struct ras_events *ras, char *name, int flags) - { - char fname[MAX_PATH + 1]; -@@ -619,12 +631,14 @@ static void *handle_ras_events_cpu(void *priv) - static int select_tracing_timestamp(struct ras_events *ras) - { - FILE *fp; -- int fd, rc; -+ int fd, rc, retry = 10; - time_t uptime, now; - size_t size; - unsigned j1; - char buf[4096]; - -+ while (stat_trace(ras, "trace_clock") && retry--); -+ - /* Check if uptime is supported (kernel 3.10-rc1 or upper) */ - fd = open_trace(ras, "trace_clock", O_RDONLY); - if (fd < 0) { --- -2.33.1 - diff --git a/3005-rasdaemon-mce_record-print-just-one-line-on-AMD.patch b/3005-rasdaemon-mce_record-print-just-one-line-on-AMD.patch deleted file mode 100644 index 6b9435d..0000000 --- a/3005-rasdaemon-mce_record-print-just-one-line-on-AMD.patch +++ /dev/null @@ -1,33 +0,0 @@ -From 5befe6d0f28971c8bde2302b535c80957718ef30 Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Thu, 6 Jun 2024 14:22:05 +0800 -Subject: [PATCH 5/6] rasdaemon: mce_record print just one line on AMD - -AMD SMCA will print 2 line for mce_record, like: - - <...>-1106 [002] 0.010002: mce_record: ... - Memory Error 'mem-tx... 
- -Delete '\n' in amd smca decoder to print just oneline - -Signed-off-by: Ruidong Tian ---- - mce-amd-smca.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 7c619fd..8291c3a 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -481,7 +481,7 @@ static void decode_smca_error(struct mce_event *e) - /* Only print the descriptor of valid extended error code */ - if (xec < smca_mce_descs[bank_type].num_descs) - mce_snprintf(e->mcastatus_msg, -- " %s.\n", smca_mce_descs[bank_type].descs[xec]); -+ " %s. ", smca_mce_descs[bank_type].descs[xec]); - - if (bank_type == SMCA_UMC && xec == 0) { - channel = find_umc_channel(e); --- -2.33.1 - diff --git a/3006-rasdaemon-disable-ce-offline-default.patch b/3006-rasdaemon-disable-ce-offline-default.patch deleted file mode 100644 index 6fec9fd..0000000 --- a/3006-rasdaemon-disable-ce-offline-default.patch +++ /dev/null @@ -1,48 +0,0 @@ -From ab8f363f4ffcbc49bf700ca0199ff2b8f9bba65a Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Fri, 28 Jun 2024 10:06:40 +0800 -Subject: [PATCH] rasdaemon: disable ce offline default - -Signed-off-by: Ruidong Tian ---- - misc/rasdaemon.env | 8 +++++--- - ras-page-isolation.c | 2 +- - 2 files changed, 6 insertions(+), 4 deletions(-) - -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 9f8e606..1b5403c 100644 ---- a/misc/rasdaemon.env -+++ b/misc/rasdaemon.env -@@ -26,11 +26,13 @@ PAGE_CE_THRESHOLD="50" - # Requires an uptodate kernel. Might not be successfull. - # soft-then-hard First try to soft offline, then try hard offlining. - # Note: default offline choice is "soft". 
--PAGE_CE_ACTION="soft" -+PAGE_CE_ACTION="off" - - # Notices script when doing memory offline --PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" --PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" -+# PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" -+# PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" -+PAGE_CE_OFFLINE_PRE_NOTICE="" -+PAGE_CE_OFFLINE_POST_NOTICE="" - - # Event Trigger - -diff --git a/ras-page-isolation.c b/ras-page-isolation.c -index 193d47c..3c777e6 100644 ---- a/ras-page-isolation.c -+++ b/ras-page-isolation.c -@@ -217,7 +217,7 @@ static void page_notice_init(void) - char *pre_re = getenv("PAGE_CE_OFFLINE_PRE_NOTICE"); - char *post_re = getenv("PAGE_CE_OFFLINE_POST_NOTICE"); - -- if (offline <= OFFLINE_ACCOUNT) -+ if (offline <= OFFLINE_ACCOUNT || !pre_re || !post_re) - return; - - snprintf(pre_notice, sizeof(pre_notice), "%s/%s", notice_root, pre_re); --- -2.33.1 - diff --git a/30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch b/30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch new file mode 100644 index 0000000..240baaf --- /dev/null +++ b/30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch @@ -0,0 +1,524 @@ +commit 30158ef8d7aebc3e5201bf39b73ce7644f8e419e +Author: Avadhut Naik +Date: Tue Apr 18 18:24:21 2023 +0000 + + rasdaemon: Update SMCA bank error descriptions + + Update, reword some existing SMCA bank type error descriptions to extend + SMCA error decoding functionality for modern AMD processors. Additionally, + also add new error descriptions for missing SMCA bank types. 
+ + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 27ca8aa..7ec787a 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -66,12 +66,19 @@ enum smca_bank_types { + SMCA_SMU, /* System Management Unit */ + SMCA_SMU_V2, + SMCA_MP5, /* Microprocessor 5 Unit */ ++ SMCA_MPDMA, /* MPDMA Unit */ + SMCA_NBIO, /* Northbridge IO Unit */ + SMCA_PCIE, /* PCI Express Unit */ + SMCA_PCIE_V2, + SMCA_XGMI_PCS, /* xGMI PCS Unit */ ++ SMCA_NBIF, /*NBIF Unit */ ++ SMCA_SHUB, /* System Hub Unit */ ++ SMCA_SATA, /* SATA Unit */ ++ SMCA_USB, /* USB Unit */ ++ SMCA_GMI_PCS, /* GMI PCS Unit */ + SMCA_XGMI_PHY, /* xGMI PHY Unit */ + SMCA_WAFL_PHY, /* WAFL PHY Unit */ ++ SMCA_GMI_PHY, /* GMI PHY Unit */ + N_SMCA_BANK_TYPES + }; + +@@ -85,7 +92,6 @@ enum smca_bank_types { + #define NONCPU_NODE_INDEX 8 + + /* SMCA Extended error strings */ +-/* Load Store */ + static const char * const smca_ls_mce_desc[] = { + "Load queue parity", + "Store queue parity", +@@ -109,6 +115,7 @@ static const char * const smca_ls_mce_desc[] = { + "DC tag error type 5", + "L2 fill data error", + }; ++ + static const char * const smca_ls2_mce_desc[] = { + "An ECC error was detected on a data cache read by a probe or victimization", + "An ECC error or L2 poison was detected on a data cache read by a load", +@@ -133,92 +140,104 @@ static const char * const smca_ls2_mce_desc[] = { + "A SystemReadDataError error was reported on read data returned from L2 for an SCB store", + "A SystemReadDataError error was reported on read data returned from L2 for a WCB store", + "A hardware assertion error was reported", +- "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access", ++ "A parity error was detected in an STLF, SCB EMEM entry, store data mask or SRB store data by any access", + }; +-/* Instruction Fetch */ ++ + static const char * const smca_if_mce_desc[] = { + "microtag probe port parity error", + "IC microtag or 
full tag multi-hit error", + "IC full tag parity", + "IC data array parity", +- "Decoupling queue phys addr parity error", ++ "PRQ Parity Error", + "L0 ITLB parity error", +- "L1 ITLB parity error", +- "L2 ITLB parity error", ++ "L1-TLB parity error", ++ "L2-TLB parity error", + "BPQ snoop parity on Thread 0", + "BPQ snoop parity on Thread 1", +- "L1 BTB multi-match error", +- "L2 BTB multi-match error", ++ "BP L1-BTB Multi-Hit Error", ++ "BP L2-BTB Multi-Hit Error", + "L2 Cache Response Poison error", +- "System Read Data error", ++ "L2 Cache Error Response", ++ "Hardware Assertion Error", ++ "L1-TLB Multi-Hit", ++ "L2-TLB Multi-Hit", ++ "BSR Parity Error", ++ "CT MCE", + }; +-/* L2 Cache */ ++ + static const char * const smca_l2_mce_desc[] = { +- "L2M tag multi-way-hit error", +- "L2M tag ECC error", +- "L2M data ECC error", +- "HW assert", ++ "L2M Tag Multiple-Way-Hit error", ++ "L2M Tag or State Array ECC Error", ++ "L2M Data Array ECC Error", ++ "Hardware Assert Error", ++ "SDP Read Response Parity Error", + }; +-/* Decoder Unit */ ++ + static const char * const smca_de_mce_desc[] = { +- "uop cache tag parity error", +- "uop cache data parity error", +- "Insn buffer parity error", +- "uop queue parity error", +- "Insn dispatch queue parity error", +- "Fetch address FIFO parity", +- "Patch RAM data parity", +- "Patch RAM sequencer parity", +- "uop buffer parity" +-}; +-/* Execution Unit */ ++ "Micro-op cache tag array parity error", ++ "Micro-op cache data array parity error", ++ "IBB Register File parity error", ++ "Micro-op queue parity error", ++ "Instruction dispatch queue parity error", ++ "Fetch address FIFO parity error", ++ "Patch RAM data parity error", ++ "Patch RAM sequencer parity error", ++ "Micro-op buffer parity error", ++ "Hardware Assertion MCA Error", ++}; ++ + static const char * const smca_ex_mce_desc[] = { + "Watchdog timeout error", +- "Phy register file parity", +- "Flag register file parity", +- "Immediate displacement register file 
parity", +- "Address generator payload parity", +- "EX payload parity", +- "Checkpoint queue parity", +- "Retire dispatch queue parity", ++ "Physical register file parity error", ++ "Flag register file parity error", ++ "Immediate displacement register file parity error", ++ "Address generator payload parity error", ++ "EX payload parity error", ++ "Checkpoint queue parity error", ++ "Retire dispatch queue parity error", + "Retire status queue parity error", +- "Scheduling queue parity error", ++ "Scheduler queue parity error", + "Branch buffer queue parity error", ++ "Hardware Assertion error", ++ "Spec Map parity error", ++ "Retire Map parity error", + }; +-/* Floating Point Unit */ ++ + static const char * const smca_fp_mce_desc[] = { +- "Physical register file parity", +- "Freelist parity error", +- "Schedule queue parity", ++ "Physical register file (PRF) parity error", ++ "Freelist (FL) parity error", ++ "Schedule queue parity error", + "NSQ parity error", +- "Retire queue parity", +- "Status register file parity", ++ "Retire queue (RQ) parity error", ++ "Status register file (SRF) parity error", + "Hardware assertion", ++ "Physical K mask register file (KRF) parity error", + }; +-/* L3 Cache */ ++ + static const char * const smca_l3_mce_desc[] = { + "Shadow tag macro ECC error", + "Shadow tag macro multi-way-hit error", + "L3M tag ECC error", + "L3M tag multi-way-hit error", + "L3M data ECC error", +- "XI parity, L3 fill done channel error", +- "L3 victim queue parity", +- "L3 HW assert", ++ "SDP Parity Error from XI", ++ "L3 victim queue Data Fabric error", ++ "L3 Hardware Assertion", ++ "XI WCB Parity Poison Creation event", + }; +-/* Coherent Slave Unit */ ++ + static const char * const smca_cs_mce_desc[] = { +- "Illegal request from transport layer", ++ "Illegal request", + "Address violation", + "Security violation", +- "Illegal response from transport layer", ++ "Illegal response", + "Unexpected response", +- "Parity error on incoming request or probe 
response data", +- "Parity error on incoming read response data", +- "Atomic request parity", +- "ECC error on probe filter access", ++ "Request or Probe Parity Error", ++ "Read Response Parity Error", ++ "Atomic request parity error", ++ "Probe Filter ECC Error", + }; +-/* Coherent Slave Unit V2 */ ++ + static const char * const smca_cs2_mce_desc[] = { + "Illegal Request", + "Address Violation", +@@ -234,15 +253,22 @@ static const char * const smca_cs2_mce_desc[] = { + "SDP read response had an unexpected RETRY error", + "Counter overflow error", + "Counter underflow error", ++ "Illegal Request on the no data channel", ++ "Address Violation on the no data channel", ++ "Security Violation on the no data channel", ++ "Hardware Assert Error", + }; +-/* Power, Interrupt, etc.. */ ++ + static const char * const smca_pie_mce_desc[] = { +- "HW assert", +- "Internal PIE register security violation", +- "Error on GMI link", +- "Poison data written to internal PIE register", ++ "Hardware assert", ++ "Register security violation", ++ "Link error", ++ "Poison data consumption", ++ "A deferred error was detected in the DF", ++ "Watch Dog Timer", ++ "An SRAM ECC error was detected in the CNLI block", + }; +-/* Unified Memory Controller */ ++ + static const char * const smca_umc_mce_desc[] = { + "DRAM ECC error", + "Data poison error on DRAM", +@@ -250,6 +276,12 @@ static const char * const smca_umc_mce_desc[] = { + "Advanced peripheral bus error", + "Command/address parity error", + "Write data CRC error", ++ "DCQ SRAM ECC error", ++ "AES SRAM ECC error", ++ "ECS Row Error", ++ "ECS Error", ++ "UMC Throttling Error", ++ "Read CRC Error", + }; + + static const char * const smca_umc2_mce_desc[] = { +@@ -267,15 +299,14 @@ static const char * const smca_umc2_mce_desc[] = { + "LM32 MP errors", + }; + +-/* Parameter Block */ + static const char * const smca_pb_mce_desc[] = { +- "Parameter Block RAM ECC error", ++ "An ECC error in the Parameter Block RAM array" + }; +-/* Platform 
Security Processor */ ++ + static const char * const smca_psp_mce_desc[] = { +- "PSP RAM ECC or parity error", ++ "An ECC or parity error in a PSP RAM instance", + }; +-/* Platform Security Processor V2 */ ++ + static const char * const smca_psp2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", +@@ -296,11 +327,11 @@ static const char * const smca_psp2_mce_desc[] = { + "TLB Bank 1 parity error", + "System Hub Read Buffer ECC or parity error", + }; +-/* System Management Unit */ ++ + static const char * const smca_smu_mce_desc[] = { +- "SMU RAM ECC or parity error", ++ "An ECC or parity error in an SMU RAM instance", + }; +-/* System Management Unit V2 */ ++ + static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", +@@ -314,7 +345,7 @@ static const char * const smca_smu2_mce_desc[] = { + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", + }; +-/* Microprocessor 5 Unit */ ++ + static const char * const smca_mp5_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", +@@ -327,15 +358,68 @@ static const char * const smca_mp5_mce_desc[] = { + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + }; +-/* Northbridge IO Unit */ ++ ++static const char * const smca_mpdma_mce_desc[] = { ++ "Main SRAM [31:0] bank ECC or parity error", ++ "Main SRAM [63:32] bank ECC or parity error", ++ "Main SRAM [95:64] bank ECC or parity error", ++ "Main SRAM [127:96] bank ECC or parity error", ++ "Data Cache Bank A ECC or parity error", ++ "Data Cache Bank B ECC or parity error", ++ "Data Tag Cache Bank A ECC or parity error", ++ "Data Tag Cache Bank B ECC or parity error", ++ "Instruction Cache Bank A ECC or parity error", ++ "Instruction Cache Bank B ECC or parity error", ++ "Instruction Tag Cache Bank A ECC or parity error", ++ "Instruction Tag Cache Bank B 
ECC or parity error", ++ "Data Cache Bank A ECC or parity error", ++ "Data Cache Bank B ECC or parity error", ++ "Data Tag Cache Bank A ECC or parity error", ++ "Data Tag Cache Bank B ECC or parity error", ++ "Instruction Cache Bank A ECC or parity error", ++ "Instruction Cache Bank B ECC or parity error", ++ "Instruction Tag Cache Bank A ECC or parity error", ++ "Instruction Tag Cache Bank B ECC or parity error", ++ "Data Cache Bank A ECC or parity error", ++ "Data Cache Bank B ECC or parity error", ++ "Data Tag Cache Bank A ECC or parity error", ++ "Data Tag Cache Bank B ECC or parity error", ++ "Instruction Cache Bank A ECC or parity error", ++ "Instruction Cache Bank B ECC or parity error", ++ "Instruction Tag Cache Bank A ECC or parity error", ++ "Instruction Tag Cache Bank B ECC or parity error", ++ "System Hub Read Buffer ECC or parity error", ++ "MPDMA TVF DVSEC Memory ECC or parity error", ++ "MPDMA TVF MMIO Mailbox0 ECC or parity error", ++ "MPDMA TVF MMIO Mailbox1 ECC or parity error", ++ "MPDMA TVF Doorbell Memory ECC or parity error", ++ "MPDMA TVF SDP Slave Memory 0 ECC or parity error", ++ "MPDMA TVF SDP Slave Memory 1 ECC or parity error", ++ "MPDMA TVF SDP Slave Memory 2 ECC or parity error", ++ "MPDMA TVF SDP Master Memory 0 ECC or parity error", ++ "MPDMA TVF SDP Master Memory 1 ECC or parity error", ++ "MPDMA TVF SDP Master Memory 2 ECC or parity error", ++ "MPDMA TVF SDP Master Memory 3 ECC or parity error", ++ "MPDMA TVF SDP Master Memory 4 ECC or parity error", ++ "MPDMA TVF SDP Master Memory 5 ECC or parity error", ++ "MPDMA TVF SDP Master Memory 6 ECC or parity error", ++ "SDP Watchdog Timer expired", ++ "MPDMA PTE Command FIFO ECC or parity error", ++ "MPDMA PTE Hub Data FIFO ECC or parity error", ++ "MPDMA PTE Internal Data FIFO ECC or parity error", ++ "MPDMA PTE Command Memory DMA ECC or parity error", ++ "MPDMA PTE Command Memory Internal ECC or parity error", ++}; ++ + static const char * const smca_nbio_mce_desc[] = { + "ECC or 
Parity error", + "PCIE error", +- "SDP ErrEvent error", +- "SDP Egress Poison Error", +- "IOHC Internal Poison Error", ++ "External SDP ErrEvent error", ++ "SDP Egress Poison error", ++ "Internal Poison error", ++ "Internal system fatal error event", + }; +-/* PCI Express Unit */ ++ + static const char * const smca_pcie_mce_desc[] = { + "CCIX PER Message logging", + "CCIX Read Response with Status: Non-Data Error", +@@ -345,7 +429,7 @@ static const char * const smca_pcie_mce_desc[] = { + }; + + static const char * const smca_pcie2_mce_desc[] = { +- "SDP Parity Error logging", ++ "SDP Data Parity Error logging", + }; + + static const char * const smca_xgmipcs_mce_desc[] = { +@@ -387,11 +471,66 @@ static const char * const smca_xgmiphy_mce_desc[] = { + "PHY APB error", + }; + +-static const char * const smca_waflphy_mce_desc[] = { +- "RAM ECC Error", +- "ARC instruction buffer parity error", +- "ARC data buffer parity error", +- "PHY APB error", ++static const char * const smca_nbif_mce_desc[] = { ++ "Timeout error from GMI", ++ "SRAM ECC error", ++ "NTB Error Event", ++ "SDP Parity error", ++}; ++ ++static const char * const smca_sata_mce_desc[] = { ++ "Parity error for port 0", ++ "Parity error for port 1", ++ "Parity error for port 2", ++ "Parity error for port 3", ++ "Parity error for port 4", ++ "Parity error for port 5", ++ "Parity error for port 6", ++ "Parity error for port 7", ++}; ++ ++static const char * const smca_usb_mce_desc[] = { ++ "Parity error or ECC error for S0 RAM0", ++ "Parity error or ECC error for S0 RAM1", ++ "Parity error or ECC error for S0 RAM2", ++ "Parity error for PHY RAM0", ++ "Parity error for PHY RAM1", ++ "AXI Slave Response error", ++}; ++ ++static const char * const smca_gmipcs_mce_desc[] = { ++ "Data Loss Error", ++ "Training Error", ++ "Replay Parity Error", ++ "Rx Fifo Underflow Error", ++ "Rx Fifo Overflow Error", ++ "CRC Error", ++ "BER Exceeded Error", ++ "Tx Fifo Underflow Error", ++ "Replay Buffer Parity Error", ++ "Tx 
Overflow Error", ++ "Replay Fifo Overflow Error", ++ "Replay Fifo Underflow Error", ++ "Elastic Fifo Overflow Error", ++ "Deskew Error", ++ "Offline Error", ++ "Data Startup Limit Error", ++ "FC Init Timeout Error", ++ "Recovery Timeout Error", ++ "Ready Serial Timeout Error", ++ "Ready Serial Attempt Error", ++ "Recovery Attempt Error", ++ "Recovery Relock Attempt Error", ++ "Deskew Abort Error", ++ "Rx Buffer Error", ++ "Rx LFDS Fifo Overflow Error", ++ "Rx LFDS Fifo Underflow Error", ++ "LinkSub Tx Timeout Error", ++ "LinkSub Rx Timeout Error", ++ "Rx CMD Packet Error", ++ "LFDS Training Timeout Error", ++ "LFDS FC Init Timeout Error", ++ "Data Loss Error", + }; + + struct smca_mce_desc { +@@ -419,12 +558,21 @@ static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, + [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)}, + [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, ++ [SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) }, + [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)}, + [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)}, + [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) }, + [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) }, ++ /* NBIF and SHUB have the same error descriptions, for now. */ ++ [SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) }, ++ [SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) }, ++ [SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) }, ++ [SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) }, ++ [SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) }, ++ /* All the PHY bank types have the same error descriptions, for now. 
*/ + [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, +- [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) }, ++ [SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, ++ [SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, + }; + + struct smca_hwid { +@@ -470,6 +618,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + /* Microprocessor 5 Unit MCA type */ + { SMCA_MP5, 0x00020001 }, + ++ /* MPDMA MCA Type */ ++ { SMCA_MPDMA, 0x00030001 }, ++ + /* Northbridge IO Unit MCA type */ + { SMCA_NBIO, 0x00000018 }, + +@@ -480,11 +631,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + /* Ext Global Memory Interconnect PCS MCA type */ + { SMCA_XGMI_PCS, 0x00000050 }, + ++ { SMCA_NBIF, 0x0000006C }, ++ ++ { SMCA_SHUB, 0x00000080 }, ++ { SMCA_SATA, 0x000000A8 }, ++ { SMCA_USB, 0x000000AA }, ++ { SMCA_GMI_PCS, 0x00000241 }, ++ + /* Ext Global Memory Interconnect PHY MCA type */ + { SMCA_XGMI_PHY, 0x00000259 }, + + /* WAFL PHY MCA type */ + { SMCA_WAFL_PHY, 0x00000267 }, ++ ++ { SMCA_GMI_PHY, 0x00000269 }, + }; + + struct smca_bank_name { +@@ -508,12 +668,18 @@ static struct smca_bank_name smca_names[] = { + [SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" }, + [SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" }, + [SMCA_MP5] = { "Microprocessor 5 Unit" }, ++ [SMCA_MPDMA] = { "MPDMA Unit" }, + [SMCA_NBIO] = { "Northbridge IO Unit" }, + [SMCA_PCIE ... 
SMCA_PCIE_V2] = { "PCI Express Unit" }, + [SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" }, ++ [SMCA_NBIF] = { "NBIF Unit" }, ++ [SMCA_SHUB] = { "System Hub Unit" }, ++ [SMCA_SATA] = { "SATA Unit" }, ++ [SMCA_USB] = { "USB Unit" }, ++ [SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" }, + [SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" }, + [SMCA_WAFL_PHY] = { "WAFL PHY Unit" }, +- ++ [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, + }; + + static void amd_decode_errcode(struct mce_event *e) diff --git a/73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch b/73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch new file mode 100644 index 0000000..2655de8 --- /dev/null +++ b/73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch @@ -0,0 +1,93 @@ +commit 73d8177ce0d2fcb7693cacee4778d0845ebd3788 +Author: sathya priya kumar +Date: Thu Jun 13 05:29:09 2024 +0000 + + rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits + + Optimize smca_smu2_mce_desc in better way from the commit ced615c. + + Update existing array with extended error descriptions instead + of creating new array, simplifying the code. 
+ + Signed-off-by: Sathya Priya Kumar + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 29 +++-------------------------- + ras-mce-handler.h | 1 - + 2 files changed, 3 insertions(+), 27 deletions(-) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-07-18 11:14:26.008582740 -0400 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2024-07-18 11:15:05.510270132 -0400 +@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d + "An ECC or parity error in an SMU RAM instance", + }; + +-static const char * smca_smu2_mce_desc[64] = { ++static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", +@@ -410,14 +410,13 @@ static const char * smca_smu2_mce_desc[6 + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", + "PHY RAS ECC Error", +-}; +- +-static const char * smca_smu2_ext_mce_desc[] = { ++ [12 ... 57] = "Reserved", + "A correctable error from a GFX Sub-IP", + "A fatal error from a GFX Sub-IP", + "Reserved", + "Reserved", + "A poison error from a GFX Sub-IP", ++ "Reserved", + }; + + static const char * const smca_mp5_mce_desc[] = { +@@ -824,27 +823,6 @@ static struct smca_bank_name smca_names[ + [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, + }; + +-void smca_smu2_ext_err_desc(void) +-{ +- int i, j; +- int smu2_bits = 62; +- +- /* +- * MCA_CTL_SMU error stings are defined for b'58:59 and b'62 +- * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU +- * +- * b'0:11 can be decoded from existing array smca_smu2_mce_desc. +- * b'12:57 are Reserved and b'58:62 are appended to the +- * smca_smu2_mce_desc. 
+- */ +- for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) { +- for ( ; i < 58; i++) +- smca_smu2_mce_desc[i] = "Reserved"; +- +- smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j]; +- } +-} +- + void amd_decode_errcode(struct mce_event *e) + { + +@@ -936,7 +914,6 @@ unsigned short xec = (e->status >> 16) & + mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID, + (ipid_high & MCI_IPID_MCATYPE) >> 16); + +- smca_smu2_ext_err_desc(); + fixup_hwid(m, &mcatype_hwid); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-07-18 11:14:26.008582740 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-07-18 11:14:28.987559165 -0400 +@@ -121,7 +121,6 @@ int set_intel_imc_log(enum cputype cputy + /* Undertake AMD SMCA Error Decoding */ + void decode_smca_error(struct mce_event *e, struct mce_priv *m); + void amd_decode_errcode(struct mce_event *e); +-void smca_smu2_ext_err_desc(void); + + /* Per-CPU-type decoders for Intel CPUs */ + void p4_decode_model(struct mce_event *e); diff --git a/7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch b/7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch new file mode 100644 index 0000000..b9615bd --- /dev/null +++ b/7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch @@ -0,0 +1,34 @@ +commit 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e +Author: Aristeu Rozanski +Date: Tue Apr 9 10:06:30 2024 -0400 + + mce-amd-smca: update smca_hwid to use smca_bank_types + + bank_type is used as smca_bank_types everywhere, there's no point in + declaring it as unsigned int. It also upsets covscan: + + 3. rasdaemon-0.6.7/mce-amd-smca.c:914: assignment: Assigning: "bank_type" = "s_hwid->bank_type". + 7. rasdaemon-0.6.7/mce-amd-smca.c:926: cond_at_most: Checking "bank_type >= 64U" implies that "bank_type" and "s_hwid->bank_type" may be up to 63 on the false branch. + 14. 
rasdaemon-0.6.7/mce-amd-smca.c:942: overrun-local: Overrunning array "smca_mce_descs" of 38 16-byte elements at element index 63 (byte offset 1023) using index "bank_type" (which evaluates to 63). + # 940| /* Only print the descriptor of valid extended error code */ + # 941| if (xec < smca_mce_descs[bank_type].num_descs) + # 942|-> mce_snprintf(e->mcastatus_msg, + # 943| "%s. Ext Err Code: %d", + # 944| smca_mce_descs[bank_type].descs[xec], + + Signed-off-by: Aristeu Rozanski + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 7521ff7..6632663 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -706,7 +706,7 @@ static struct smca_mce_desc smca_mce_descs[] = { + }; + + struct smca_hwid { +- unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/ ++ enum smca_bank_types bank_type; + uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/ + }; + diff --git a/885e546add918457c453bd3f753ac7df90b39e36.patch b/885e546add918457c453bd3f753ac7df90b39e36.patch new file mode 100644 index 0000000..e5a2e94 --- /dev/null +++ b/885e546add918457c453bd3f753ac7df90b39e36.patch @@ -0,0 +1,22 @@ +commit 885e546add918457c453bd3f753ac7df90b39e36 +Author: weidongkl +Date: Tue Sep 19 16:29:21 2023 +0800 + + Add a space between "diskerror_event" and "store" + + Signed-off-by: weidongkl + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-record.c b/ras-record.c +index a5f99ae..6b050bb 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -484,7 +484,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev + + if (!priv || !priv->stmt_diskerror_event) + return 0; +- log(TERM, LOG_INFO, "diskerror_eventstore: %p\n", priv->stmt_diskerror_event); ++ log(TERM, LOG_INFO, "diskerror_event store: %p\n", priv->stmt_diskerror_event); + + sqlite3_bind_text(priv->stmt_diskerror_event, 1, ev->timestamp, -1, NULL); + sqlite3_bind_text(priv->stmt_diskerror_event, 2, ev->dev, -1, NULL); diff --git 
a/932118b04a04104dfac6b8536419803f236e6118.patch b/932118b04a04104dfac6b8536419803f236e6118.patch new file mode 100644 index 0000000..b88923f --- /dev/null +++ b/932118b04a04104dfac6b8536419803f236e6118.patch @@ -0,0 +1,411 @@ +commit 932118b04a04104dfac6b8536419803f236e6118 +Author: Avadhut Naik +Date: Mon May 22 22:13:17 2023 +0000 + + rasdaemon: Add support for post-processing MCA errors + + Currently, the rasdaemon performs detailed error decoding of received + MCA errors on the system only whence it is running, either as a daemon + or in the foreground. + + As such, error decoding cannot be undertaken for any MCA errors received + whence the rasdaemon wasn't running. Additionally, if the error decoding + modules like edac_mce_amd too have not been loaded, error records in the + demsg buffer might correspond to raw values in associated MSRs, compelling + users to undertake decoding manually. The scenario seems more plausible on + AMD systems with Scalabale MCA (SMCA) with plans in place to remove SMCA + Extended Error Descriptions from the edac_mce_amd module in an effort to + offload SMCA Error Decoding to the rasdaemon. + + As such, add support to post-process and decode MCA Errors received on AMD + SMCA systems from raw MSR values. Support for post-processing and decoding + of MCA Errors received on CPUs of other vendors can be added in the future, + as needed. 
+ + Suggested-by: Yazen Ghannam + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 8 ++- + ras-events.h | 1 + ras-mce-handler.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++---- + ras-mce-handler.h | 4 + + ras-record.h | 10 ++++ + rasdaemon.c | 94 +++++++++++++++++++++++++++++++++++++++++++++- + 6 files changed, 216 insertions(+), 11 deletions(-) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2023-10-27 12:44:55.541077722 -0400 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2023-10-27 12:44:58.549049019 -0400 +@@ -710,7 +710,7 @@ static struct smca_bank_name smca_names[ + [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, + }; + +-static void amd_decode_errcode(struct mce_event *e) ++void amd_decode_errcode(struct mce_event *e) + { + + decode_amd_errcode(e); +@@ -782,7 +782,7 @@ *hwid_mcatype = 0x00010000; + } + + /* Decode extended errors according to Scalable MCA specification */ +-static void decode_smca_error(struct mce_event *e, struct mce_priv* m) ++void decode_smca_error(struct mce_event *e, struct mce_priv *m) + { + enum smca_bank_types bank_type; + const char *ip_name; +@@ -827,7 +827,9 @@ for (i = 0; i < ARRAY_SIZE(smca_hwid_mca + /* Only print the descriptor of valid extended error code */ + if (xec < smca_mce_descs[bank_type].num_descs) + mce_snprintf(e->mcastatus_msg, +- " %s.\n", smca_mce_descs[bank_type].descs[xec]); ++ "%s. 
Ext Err Code: %d", ++ smca_mce_descs[bank_type].descs[xec], ++ xec); + + if (bank_type == SMCA_UMC && xec == 0) { + channel = find_umc_channel(e); +--- rasdaemon-0.6.7.orig/ras-events.h 2023-10-27 12:44:55.541077722 -0400 ++++ rasdaemon-0.6.7/ras-events.h 2023-10-27 12:44:58.549049019 -0400 +@@ -100,6 +100,7 @@ enum ghes_severity { + + /* Function prototypes */ + int toggle_ras_mc_event(int enable); ++int ras_offline_mce_event(struct ras_mc_offline_event *event); + int handle_ras_events(int record_events); + + #endif +--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2023-10-27 12:44:55.541077722 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.c 2023-10-27 12:45:27.159776011 -0400 +@@ -63,10 +63,8 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series + [CPU_SAPPHIRERAPIDS] = "Sapphirerapids server", + }; + +-static enum cputype select_intel_cputype(struct ras_events *ras) ++static enum cputype select_intel_cputype(struct mce_priv *mce) + { +- struct mce_priv *mce = ras->mce_priv; +- + if (mce->family == 15) { + if (mce->model == 6) + return CPU_TULSA; +@@ -140,9 +138,8 @@ if (mce->model > 0x1a) { + return mce->family == 6 ? 
CPU_P6OLD : CPU_GENERIC; + } + +-static int detect_cpu(struct ras_events *ras) ++static int detect_cpu(struct mce_priv *mce) + { +- struct mce_priv *mce = ras->mce_priv; + FILE *f; + int ret = 0; + char *line = NULL; +@@ -221,7 +218,7 @@ ret = 0; + } + goto ret; + } else if (!strcmp(mce->vendor,"GenuineIntel")) { +- mce->cputype = select_intel_cputype(ras); ++ mce->cputype = select_intel_cputype(mce); + } else { + ret = EINVAL; + } +@@ -246,7 +243,7 @@ int register_mce_handler(struct ras_even + + mce = ras->mce_priv; + +- rc = detect_cpu(ras); ++ rc = detect_cpu(mce); + if (rc) { + if (mce->processor_flags) + free (mce->processor_flags); +@@ -383,6 +380,105 @@ #if 0 + */ + } + ++static int report_mce_offline(struct trace_seq *s, ++ struct mce_event *mce, ++ struct mce_priv *priv) ++{ ++ time_t now; ++ struct tm *tm; ++ ++ time(&now); ++ tm = localtime(&now); ++ ++ if (tm) ++ strftime(mce->timestamp, sizeof(mce->timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ trace_seq_printf(s, "%s,", mce->timestamp); ++ ++ if (*mce->bank_name) ++ trace_seq_printf(s, " %s,", mce->bank_name); ++ else ++ trace_seq_printf(s, " bank=%x,", mce->bank); ++ ++ if (*mce->mcastatus_msg) ++ trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg); ++ ++ if (*mce->mcistatus_msg) ++ trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg); ++ ++ if (*mce->mc_location) ++ trace_seq_printf(s, " Locn: %s,", mce->mc_location); ++ ++ if (*mce->error_msg) ++ trace_seq_printf(s, " Error Msg: %s\n", mce->error_msg); ++ ++ return 0; ++} ++ ++int ras_offline_mce_event(struct ras_mc_offline_event *event) ++{ ++ int rc = 0; ++ struct trace_seq s; ++ struct mce_event *mce = NULL; ++ struct mce_priv *priv = NULL; ++ ++ mce = (struct mce_event *)calloc(1, sizeof(struct mce_event)); ++ if (!mce) { ++ log(TERM, LOG_ERR, "Can't allocate memory for mce struct\n"); ++ return errno; ++ } ++ ++ priv = (struct mce_priv *)calloc(1, sizeof(struct mce_priv)); ++ if (!priv) { ++ log(TERM, LOG_ERR, "Can't allocate memory for 
mce_priv struct\n"); ++ free(mce); ++ return errno; ++ } ++ ++ if (event->smca) { ++ priv->cputype = CPU_AMD_SMCA; ++ priv->family = event->family; ++ priv->model = event->model; ++ } else { ++ rc = detect_cpu(priv); ++ if (rc) { ++ log(TERM, LOG_ERR, "Failed to detect CPU\n"); ++ goto free_mce; ++ } ++ } ++ ++ mce->status = event->status; ++ mce->bank = event->bank; ++ ++ switch (priv->cputype) { ++ case CPU_AMD_SMCA: ++ mce->synd = event->synd; ++ mce->ipid = event->ipid; ++ if (!mce->ipid || !mce->status) { ++ log(TERM, LOG_ERR, "%s MSR required.\n", ++ mce->ipid ? "Status" : "Ipid"); ++ rc = -EINVAL; ++ goto free_mce; ++ } ++ decode_smca_error(mce, priv); ++ amd_decode_errcode(mce); ++ break; ++ default: ++ break; ++ } ++ ++ trace_seq_init(&s); ++ report_mce_offline(&s, mce, priv); ++ trace_seq_do_printf(&s); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++ ++free_mce: ++ free(priv); ++ free(mce); ++ return rc; ++} ++ + int ras_mce_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context) +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2023-10-27 12:44:55.541077722 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2023-10-27 12:44:58.550049010 -0400 +@@ -118,6 +118,10 @@ int ras_mce_event_handler(struct trace_s + /* enables intel iMC logs */ + int set_intel_imc_log(enum cputype cputype, unsigned ncpus); + ++/* Undertake AMD SMCA Error Decoding */ ++void decode_smca_error(struct mce_event *e, struct mce_priv *m); ++void amd_decode_errcode(struct mce_event *e); ++ + /* Per-CPU-type decoders for Intel CPUs */ + void p4_decode_model(struct mce_event *e); + void core2_decode_model(struct mce_event *e); +--- rasdaemon-0.6.7.orig/ras-record.h 2023-10-27 12:44:55.541077722 -0400 ++++ rasdaemon-0.6.7/ras-record.h 2023-10-27 12:44:58.550049010 -0400 +@@ -21,6 +21,7 @@ * Foundation, Inc., 51 Franklin Street, + #define __RAS_RECORD_H + + #include ++#include + #include "config.h" + + #define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) 
+@@ -39,6 +40,15 @@ struct ras_mc_event { + const char *driver_detail; + }; + ++struct ras_mc_offline_event { ++ unsigned int family, model; ++ bool smca; ++ uint8_t bank; ++ uint64_t ipid; ++ uint64_t synd; ++ uint64_t status; ++}; ++ + struct ras_aer_event { + char timestamp[64]; + const char *error_type; +--- rasdaemon-0.6.7.orig/rasdaemon.c 2023-10-27 12:44:55.541077722 -0400 ++++ rasdaemon-0.6.7/rasdaemon.c 2023-10-27 12:44:58.550049010 -0400 +@@ -41,8 +41,21 @@ struct arguments { + int record_events; + int enable_ras; + int foreground; ++ int offline; + }; + ++enum OFFLINE_ARG_KEYS { ++ SMCA = 0x100, ++ MODEL, ++ FAMILY, ++ BANK_NUM, ++ IPID_REG, ++ STATUS_REG, ++ SYNDROME_REG ++}; ++ ++struct ras_mc_offline_event event; ++ + static error_t parse_opt(int k, char *arg, struct argp_state *state) + { + struct arguments *args = state->input; +@@ -62,18 +75,84 @@ static error_t parse_opt(int k, char *ar + case 'f': + args->foreground++; + break; ++#ifdef HAVE_MCE ++ case 'p': ++ if (state->argc < 4) ++ argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR); ++ args->offline++; ++ break; ++#endif + default: + return ARGP_ERR_UNKNOWN; + } + return 0; + } + ++#ifdef HAVE_MCE ++static error_t parse_opt_offline(int key, char *arg, ++ struct argp_state *state) ++{ ++ switch (key) { ++ case SMCA: ++ event.smca = true; ++ break; ++ case MODEL: ++ event.model = strtoul(state->argv[state->next], NULL, 0); ++ break; ++ case FAMILY: ++ event.family = strtoul(state->argv[state->next], NULL, 0); ++ break; ++ case BANK_NUM: ++ event.bank = atoi(state->argv[state->next]); ++ break; ++ case IPID_REG: ++ event.ipid = strtoull(state->argv[state->next], NULL, 0); ++ break; ++ case STATUS_REG: ++ event.status = strtoull(state->argv[state->next], NULL, 0); ++ break; ++ case SYNDROME_REG: ++ event.synd = strtoull(state->argv[state->next], NULL, 0); ++ break; ++ default: ++ return ARGP_ERR_UNKNOWN; ++ } ++ return 0; ++} ++#endif ++ + long user_hz; + + int main(int argc, char 
*argv[]) + { + struct arguments args; + int idx = -1; ++ ++#ifdef HAVE_MCE ++ const struct argp_option offline_options[] = { ++ {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, ++ {"model", MODEL, 0, 0, "CPU Model"}, ++ {"family", FAMILY, 0, 0, "CPU Family"}, ++ {"bank", BANK_NUM, 0, 0, "Bank Number"}, ++ {"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"}, ++ {"status", STATUS_REG, 0, 0, "Status Register"}, ++ {"synd", SYNDROME_REG, 0, 0, "Syndrome Register"}, ++ {0, 0, 0, 0, 0, 0}, ++ }; ++ ++ struct argp offline_argp = { ++ .options = offline_options, ++ .parser = parse_opt_offline, ++ .doc = TOOL_DESCRIPTION, ++ .args_doc = ARGS_DOC, ++ }; ++ ++ struct argp_child offline_parser[] = { ++ {&offline_argp, 0, "Post-Processing Options:", 0}, ++ {0, 0, 0, 0}, ++ }; ++#endif ++ + const struct argp_option options[] = { + {"enable", 'e', 0, 0, "enable RAS events and exit", 0}, + {"disable", 'd', 0, 0, "disable RAS events and exit", 0}, +@@ -81,6 +160,10 @@ {"disable", 'd', 0, 0, "disable RAS even + {"record", 'r', 0, 0, "record events via sqlite3", 0}, + #endif + {"foreground", 'f', 0, 0, "run foreground, not daemonize"}, ++#ifdef HAVE_MCE ++ {"post-processing", 'p', 0, 0, ++ "Post-processing MCE's with raw register values"}, ++#endif + + { 0, 0, 0, 0, 0, 0 } + }; +@@ -89,7 +172,9 @@ { 0, 0, 0, 0, 0, 0 } + .parser = parse_opt, + .doc = TOOL_DESCRIPTION, + .args_doc = ARGS_DOC, +- ++#ifdef HAVE_MCE ++ .children = offline_parser, ++#endif + }; + memset (&args, 0, sizeof(args)); + +@@ -111,6 +196,13 @@ enable = (args.enable_ras > 0) ? 
1 : 0; + return 0; + } + ++#ifdef HAVE_MCE ++ if (args.offline) { ++ ras_offline_mce_event(&event); ++ return 0; ++ } ++#endif ++ + openlog(TOOL_NAME, 0, LOG_DAEMON); + if (!args.foreground) + if (daemon(0,0)) diff --git a/9bd84aef87978b806178a73ed33c39d6c442fc1f.patch b/9bd84aef87978b806178a73ed33c39d6c442fc1f.patch new file mode 100644 index 0000000..adecd79 --- /dev/null +++ b/9bd84aef87978b806178a73ed33c39d6c442fc1f.patch @@ -0,0 +1,24 @@ +commit 9bd84aef87978b806178a73ed33c39d6c442fc1f +Author: weidong +Date: Tue Aug 8 08:59:12 2023 +0000 + + add ':' before error output + + All prints except disk are preceded by a colon + + Signed-off-by: weidong + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index dc326d3..13078c2 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1469,7 +1469,7 @@ sub errors + $out .= "\n"; + } + if ($out ne "") { +- print "Disk errors\n$out\n"; ++ print "Disk errors:\n$out\n"; + } else { + print "No disk errors.\n\n"; + } diff --git a/9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch b/9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch new file mode 100644 index 0000000..fe85c48 --- /dev/null +++ b/9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch @@ -0,0 +1,117 @@ +commit 9c86f6255f67a8bae28cd46c54500fc16bfc7a30 +Author: Yang Shi +Date: Mon Apr 4 16:34:05 2022 -0700 + + rasdaemon: use the new block_rq_error tracepoint + + Since Linux 5.18-rc1 a new block tracepoint called block_rq_error is + available for tracing disk error events dedicatedly. Currently + rasdaemon is using block_rq_complete which also traces successful cases. + It incurs excessive tracing logs and somehow overhead since the event is + triggered quite often. + + Use the new tracepoint for disk error reporting, and the new trace point + has the same format as block_rq_complete. 
+ + Signed-off-by: Yang Shi + Signed-off-by: Mauro Carvalho Chehab + +--- + ras-events.c | 53 ++++++++++------------------------------------------- + ras-record.c | 2 +- + 2 files changed, 11 insertions(+), 44 deletions(-) + +--- rasdaemon-0.6.7.orig/ras-events.c 2024-05-14 11:05:40.020599541 -0400 ++++ rasdaemon-0.6.7/ras-events.c 2024-05-14 11:06:38.831067957 -0400 +@@ -27,6 +27,7 @@ * Foundation, Inc., 51 Franklin Street, + #include + #include + #include ++#include + #include "libtrace/kbuffer.h" + #include "libtrace/event-parse.h" + #include "ras-mc-handler.h" +@@ -229,7 +230,7 @@ if (rc < 0) { + #endif + + #ifdef HAVE_DISKERROR +- rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable); ++ rc |= __toggle_ras_mc_event(ras, "block", "block_rq_error", enable); + #endif + + #ifdef HAVE_MEMORY_FAILURE +@@ -241,37 +242,6 @@ free_ras: + return rc; + } + +-/* +- * Set kernel filter. libtrace doesn't provide an API for setting filters +- * in kernel, we have to implement it here. 
+- */ +-static int filter_ras_mc_event(struct ras_events *ras, char *group, char *event, +- const char *filter_str) +-{ +- int fd, rc; +- char fname[MAX_PATH + 1]; +- +- snprintf(fname, sizeof(fname), "events/%s/%s/filter", group, event); +- fd = open_trace(ras, fname, O_RDWR | O_APPEND); +- if (fd < 0) { +- log(ALL, LOG_WARNING, "Can't open filter file\n"); +- return errno; +- } +- +- rc = write(fd, filter_str ,strlen(filter_str)); +- if (rc < 0) { +- log(ALL, LOG_WARNING, "Can't write to filter file\n"); +- close(fd); +- return rc; +- } +- close(fd); +- if (!rc) { +- log(ALL, LOG_WARNING, "Nothing was written on filter file\n"); +- return EIO; +- } +- +- return 0; +-} + + /* + * Tracing read code +@@ -901,17 +871,14 @@ (void)open("/sys/kernel/debug/ras/daemon + #endif + + #ifdef HAVE_DISKERROR +- rc = filter_ras_mc_event(ras, "block", "block_rq_complete", "error != 0"); +- if (!rc) { +- rc = add_event_handler(ras, pevent, page_size, "block", +- "block_rq_complete", ras_diskerror_event_handler, +- NULL, DISKERROR_EVENT); +- if (!rc) +- num_events++; +- else +- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", +- "block", "block_rq_complete"); +- } ++ rc = add_event_handler(ras, pevent, page_size, "block", ++ "block_rq_error", ras_diskerror_event_handler, ++ NULL, DISKERROR_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "block", "block_rq_error"); + #endif + + #ifdef HAVE_MEMORY_FAILURE +--- rasdaemon-0.6.7.orig/ras-record.c 2024-05-14 11:07:24.573654494 -0400 ++++ rasdaemon-0.6.7/ras-record.c 2024-05-14 11:07:07.626807674 -0400 +@@ -456,7 +456,7 @@ return 0; + #endif + + /* +- * Table and functions to handle block:block_rq_complete ++ * Table and functions to handle block:block_rq_error + */ + + #ifdef HAVE_DISKERROR diff --git a/aa36c96cd52d775570dae989dd95a060f1149077.patch b/aa36c96cd52d775570dae989dd95a060f1149077.patch new file mode 100644 index 0000000..5655bc1 --- /dev/null +++ 
b/aa36c96cd52d775570dae989dd95a060f1149077.patch @@ -0,0 +1,159 @@ +commit aa36c96cd52d775570dae989dd95a060f1149077 +Author: Avadhut Naik +Date: Mon Apr 24 20:35:56 2023 +0000 + + rasdaemon: Handle reassigned bit definitions for CS SMCA + + Currently, on AMD systems with Scalable MCA (SMCA), each machine check + error of a SMCA bank type has an associated bit position in the bank's + control (CTL) register used for enabling / disabling reporting of the + very error. An error's bit position in the CTL register is also used + during error decoding for offsetting into the corresponding bank's error + description structure. As new errors are being added in newer AMD systems + for existing SMCA bank types, the underlying SMCA architecture guarantees + that the bit positions of existing errors are not altered. + + However, on some AMD systems viz. Genoa, some of the existing bit + definitions in the CTL register of the Coherent Slave (CS) SMCA bank type + are reassigned without defining new HWID and McaType. Consequently, the + very errors whose bit definitions have been reassigned in the CTL register + are being erroneously decoded. + + As a solution, create a new software defined SMCA bank type by utilizing + one of the hardware-reserved values for HWID. The new SMCA bank type will + only be employed for CS error decoding on affected CPU models. + + Additionally, since the existing error description structure for the CS + SMCA bank type is still valid, add new error description structure to + compensate for the reassigned bit definitions. + + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 7ec787a..e81f732 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -57,6 +57,7 @@ enum smca_bank_types { + SMCA_L3_CACHE, /* L3 Cache */ + SMCA_CS, /* Coherent Slave */ + SMCA_CS_V2, ++ SMCA_CS_V2_QUIRK, + SMCA_PIE, /* Power, Interrupts, etc. 
*/ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_UMC_V2, +@@ -259,6 +260,31 @@ static const char * const smca_cs2_mce_desc[] = { + "Hardware Assert Error", + }; + ++/* ++ * Per Genoa's revision guide, erratum 1384, existing bit definitions ++ * are reassigned for SMCA CS bank type. ++ */ ++static const char * const smca_cs2_quirk_mce_desc[] = { ++ "Illegal Request", ++ "Address Violation", ++ "Security Violation", ++ "Illegal Response", ++ "Unexpected Response", ++ "Request or Probe Parity Error", ++ "Read Response Parity Error", ++ "Atomic Request Parity Error", ++ "SDP read response had no match in the CS queue", ++ "SDP read response had an unexpected RETRY error", ++ "Counter overflow error", ++ "Counter underflow error", ++ "Probe Filter Protocol Error", ++ "Probe Filter ECC Error", ++ "Illegal Request on the no data channel", ++ "Address Violation on the no data channel", ++ "Security Violation on the no data channel", ++ "Hardware Assert Error", ++}; ++ + static const char * const smca_pie_mce_desc[] = { + "Hardware assert", + "Register security violation", +@@ -549,6 +575,7 @@ static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, + [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, + [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, ++ [SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)}, + [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, + [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, + [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) }, +@@ -597,6 +624,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + /* Data Fabric MCA types */ + { SMCA_CS, 0x0000002E }, + { SMCA_CS_V2, 0x0002002E }, ++ {SMCA_CS_V2_QUIRK, 0x00010000 }, + { SMCA_PIE, 0x0001002E }, + + /* Unified Memory Controller MCA type */ +@@ -660,7 +688,7 @@ static struct smca_bank_name smca_names[] = { 
+ [SMCA_EX] = { "Execution Unit" }, + [SMCA_FP] = { "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "L3 Cache" }, +- [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" }, ++ [SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" }, + [SMCA_PIE] = { "Power, Interrupts, etc." }, + [SMCA_UMC] = { "Unified Memory Controller" }, + [SMCA_UMC_V2] = { "Unified Memory Controller V2" }, +@@ -723,8 +751,38 @@ static int find_hbm_channel(struct mce_event *e) + return (umc % 2) ? tmp + 4 : tmp; + } + ++static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype) ++{ ++ if (m->family == 0x19) { ++ switch (m->model) { ++ /* ++ * Per Genoa's revision guide, erratum 1384, some SMCA Extended ++ * Error Codes and SMCA Control bits are incorrect for SMCA CS ++ * bank type. ++ */ ++ case 0x10 ... 0x1F: ++ case 0x60 ... 0x7B: ++ case 0xA0 ... 0xAF: ++ if (*hwid_mcatype == 0x0002002E) ++ *hwid_mcatype = 0x00010000; ++ break; ++ default: ++ break; ++ } ++ } else if (m->family == 0x1A) { ++ switch (m->model) { ++ case 0x40 ... 
0x4F: ++ if (*hwid_mcatype == 0x0002002E) ++ *hwid_mcatype = 0x00010000; ++ break; ++ default: ++ break; ++ } ++ } ++} ++ + /* Decode extended errors according to Scalable MCA specification */ +-static void decode_smca_error(struct mce_event *e) ++static void decode_smca_error(struct mce_event *e, struct mce_priv* m) + { + enum smca_bank_types bank_type; + const char *ip_name; +@@ -735,6 +793,8 @@ static void decode_smca_error(struct mce_event *e) + unsigned int csrow = -1, channel = -1; + unsigned int i; + ++ fixup_hwid(m, &mcatype_hwid); ++ + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { + s_hwid = &smca_hwid_mcatypes[i]; + if (mcatype_hwid == s_hwid->mcatype_hwid) { +@@ -801,7 +861,7 @@ int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) + if (mcgstatus & MCG_STATUS_MCIP) + mce_snprintf(e->mcgstatus_msg, "MCIP"); + +- decode_smca_error(e); ++ decode_smca_error(e, ras->mce_priv); + amd_decode_errcode(e); + return 0; + } diff --git a/b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch b/b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch new file mode 100644 index 0000000..fa55654 --- /dev/null +++ b/b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch @@ -0,0 +1,208 @@ +commit b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87 +Author: Avadhut Naik +Date: Thu Aug 31 02:23:48 2023 -0500 + + rasdaemon: Fix SMCA bank type decoding + + On AMD systems with Scalable MCA (SMCA), the (HWID, MCATYPE) tuple from + the MCA_IPID MSR, bits 43:32 and 63:48 respectively, are used for SMCA + bank type decoding. On occurrence of an SMCA error, the cached tuples are + compared against the tuple read from the MCA_IPID MSR to determine the + SMCA bank type. + + Currently however, all high 32 bits of the MCA_IPID register are cached in + the rasdaemon for all SMCA bank types. Bits 47:44 which do not play a part + in bank type decoding are zeroed out. 
Likewise, when an SMCA error occurs, + all high 32 bits of the MCA_IPID register are read and compared against + the cached values in smca_hwid_mcatypes array. + + This can lead to erroneous bank type decoding since the bits 47:44 are + not guaranteed to be zero. They are either reserved or, on some modern + AMD systems viz. Genoa, denote the InstanceIdHi value. The bits therefore, + should not be associated with SMCA bank type decoding. + + Import the HWID_MCATYPE macro from the kernel to ensure that only the + relevant fields i.e. (HWID, MCATYPE) tuples are used for SMCA bank type + decoding on occurrence of an SMCA error. + + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index a20f03c..55620e2 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -90,6 +90,12 @@ enum smca_bank_types { + /* Maximum number of MCA banks per CPU. */ + #define MAX_NR_BANKS 64 + ++#define MCI_IPID_MCATYPE 0xFFFF0000 ++#define MCI_IPID_HWID 0xFFF ++ ++/* Obtain HWID_MCATYPE Tuple on SMCA Systems */ ++#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) ++ + /* + * On Newer heterogeneous systems from AMD with CPU and GPU nodes connected + * via xGMI links, the NON CPU Nodes are enumerated from index 8 +@@ -699,76 +705,76 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + /* { bank_type, mcatype_hwid } */ + + /* ZN Core (HWID=0xB0) MCA types */ +- { SMCA_LS, 0x000000B0 }, +- { SMCA_LS_V2, 0x001000B0 }, +- { SMCA_IF, 0x000100B0 }, +- { SMCA_L2_CACHE, 0x000200B0 }, +- { SMCA_DE, 0x000300B0 }, ++ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) }, ++ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) }, ++ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) }, ++ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) }, ++ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) }, + /* HWID 0xB0 MCATYPE 0x4 is Reserved */ +- { SMCA_EX, 0x000500B0 }, +- { SMCA_FP, 0x000600B0 }, +- { SMCA_L3_CACHE, 0x000700B0 }, ++ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) }, ++ { SMCA_FP, HWID_MCATYPE(0xB0, 
0x6) }, ++ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) }, + + /* Data Fabric MCA types */ +- { SMCA_CS, 0x0000002E }, +- { SMCA_CS_V2, 0x0002002E }, +- {SMCA_CS_V2_QUIRK, 0x00010000 }, +- { SMCA_PIE, 0x0001002E }, ++ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, ++ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, ++ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, ++ { SMCA_CS_V2_QUIRK, HWID_MCATYPE(0x0, 0x1) }, + + /* Unified Memory Controller MCA type */ +- { SMCA_UMC, 0x00000096 }, +- { SMCA_UMC_QUIRK, 0x00020000 }, ++ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, ++ { SMCA_UMC_QUIRK, HWID_MCATYPE(0x0, 0x2) }, + /* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */ +- { SMCA_UMC_V2, 0x00010096 }, ++ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) }, + /* Memory Attached Last Level Cache */ +- { SMCA_MA_LLC, 0x0004002E }, ++ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) }, + + /* Parameter Block MCA type */ +- { SMCA_PB, 0x00000005 }, ++ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, + + /* Platform Security Processor MCA type */ +- { SMCA_PSP, 0x000000FF }, +- { SMCA_PSP_V2, 0x000100FF }, ++ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) }, ++ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) }, + + /* System Management Unit MCA type */ +- { SMCA_SMU, 0x00000001 }, +- { SMCA_SMU_V2, 0x00010001 }, ++ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) }, ++ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) }, + + /* Microprocessor 5 Unit MCA type */ +- { SMCA_MP5, 0x00020001 }, ++ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, + + /* MPDMA MCA Type */ +- { SMCA_MPDMA, 0x00030001 }, ++ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) }, + + /* Northbridge IO Unit MCA type */ +- { SMCA_NBIO, 0x00000018 }, ++ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, + + /* PCI Express Unit MCA type */ +- { SMCA_PCIE, 0x00000046 }, +- { SMCA_PCIE_V2, 0x00010046 }, ++ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, ++ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) }, + + /* Ext Global Memory Interconnect PCS MCA type */ +- { SMCA_XGMI_PCS, 0x00000050 }, ++ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) }, 
+ +- { SMCA_NBIF, 0x0000006C }, ++ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) }, + +- { SMCA_SHUB, 0x00000080 }, +- { SMCA_SATA, 0x000000A8 }, +- { SMCA_USB, 0x000000AA }, ++ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) }, ++ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) }, ++ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) }, + + /* Ultra Short Reach Data and Control Plane Controller */ +- { SMCA_USR_DP, 0x00000170 }, +- { SMCA_USR_CP, 0x00000180 }, ++ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) }, ++ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) }, + +- { SMCA_GMI_PCS, 0x00000241 }, ++ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) }, + + /* Ext Global Memory Interconnect PHY MCA type */ +- { SMCA_XGMI_PHY, 0x00000259 }, ++ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, + + /* WAFL PHY MCA type */ +- { SMCA_WAFL_PHY, 0x00000267 }, ++ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, + +- { SMCA_GMI_PHY, 0x00000269 }, ++ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) }, + }; + + struct smca_bank_name { +@@ -862,12 +868,12 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype) + case 0x10 ... 0x1F: + case 0x60 ... 0x7B: + case 0xA0 ... 0xAF: +- if (*hwid_mcatype == 0x0002002E) +- *hwid_mcatype = 0x00010000; ++ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2)) ++ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1); + break; + case 0x90 ... 0x9F: +- if ((*hwid_mcatype & 0xFF) == 0x00000096) +- *hwid_mcatype = 0x00020000; ++ if (*hwid_mcatype == HWID_MCATYPE(0x96, 0x0)) ++ *hwid_mcatype = HWID_MCATYPE(0x0, 0x2); + break; + default: + break; +@@ -875,8 +881,8 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype) + } else if (m->family == 0x1A) { + switch (m->model) { + case 0x40 ... 
0x4F: +- if (*hwid_mcatype == 0x0002002E) +- *hwid_mcatype = 0x00010000; ++ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2)) ++ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1); + break; + default: + break; +@@ -889,13 +895,17 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m) + { + enum smca_bank_types bank_type; + const char *ip_name; ++ uint32_t mcatype_hwid = 0; + unsigned short xec = (e->status >> 16) & 0x3f; + const struct smca_hwid *s_hwid; +- uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63); ++ uint32_t ipid_high = EXTRACT(e->ipid, 32, 63); + uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47); + unsigned int csrow = -1, channel = -1; + unsigned int i; + ++ mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID, ++ (ipid_high & MCI_IPID_MCATYPE) >> 16); ++ + fixup_hwid(m, &mcatype_hwid); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { diff --git a/c785d309dcbdeb7ecd219975244f3944a8d047e9.patch b/c785d309dcbdeb7ecd219975244f3944a8d047e9.patch new file mode 100644 index 0000000..1d8d01e --- /dev/null +++ b/c785d309dcbdeb7ecd219975244f3944a8d047e9.patch @@ -0,0 +1,37 @@ +commit c785d309dcbdeb7ecd219975244f3944a8d047e9 +Author: Muralidhara M K +Date: Thu Jul 27 10:18:12 2023 +0000 + + rasdaemon: Identify the DIe Number in multidie system + + Some AMD systems have 4 dies in each socket and Die ID represents + whether the error occured on cpu die or gpu die. + Also, respective Die used for FRU identification. 
+ + Signed-off-by: Muralidhara M K + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 54060ee..a20f03c 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -935,10 +935,15 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m) + xec); + + if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) { +- channel = find_umc_channel(e); +- csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */ +- mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", +- channel, csrow); ++ if ((m->family == 0x19) && (m->model >= 0x90 && m->model <= 0x9f)) { ++ /* MCA_IPID[InstanceIdHi] give the AMD Node Die ID */ ++ mce_snprintf(e->mc_location, "memory_die_id=%d", mcatype_instancehi / 4); ++ } else { ++ channel = find_umc_channel(e); ++ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */ ++ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", ++ channel, csrow); ++ } + } + + if (bank_type == SMCA_UMC_V2 && xec == 0) { diff --git a/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch b/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch new file mode 100644 index 0000000..88356fb --- /dev/null +++ b/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch @@ -0,0 +1,94 @@ +commit ced615cf8146f51b5d6fe7a29107a2adc77407ca +Author: Sathya Priya Kumar +Date: Thu Jan 11 01:20:07 2024 -0600 + + rasdaemon: Add error decoding for MCA_CTL_SMU extended bits + + Enable error decoding support for the newly added extended + error bit descriptions from MCA_CTL_SMU. + b'0:11 can be decoded from existing array smca_smu2_mce_desc. + Define a function to append the newly defined b'58:62 to the + smca_smu2_mce_desc. This reduces the maintaining Reserved bits + from b'12:57 in the code. 
+ + Signed-off-by: Sathya Priya Kumar + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 33 ++++++++++++++++++++++++++++++++- + ras-mce-handler.h | 1 + + 2 files changed, 33 insertions(+), 1 deletion(-) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-06-28 10:34:16.453522865 -0400 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2024-06-28 10:34:46.049124270 -0400 +@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d + "An ECC or parity error in an SMU RAM instance", + }; + +-static const char * const smca_smu2_mce_desc[] = { ++static const char * smca_smu2_mce_desc[64] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", +@@ -409,6 +409,15 @@ static const char * const smca_smu2_mce_ + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", ++ "PHY RAS ECC Error", ++}; ++ ++static const char * smca_smu2_ext_mce_desc[] = { ++ "A correctable error from a GFX Sub-IP", ++ "A fatal error from a GFX Sub-IP", ++ "Reserved", ++ "Reserved", ++ "A poison error from a GFX Sub-IP", + }; + + static const char * const smca_mp5_mce_desc[] = { +@@ -815,6 +824,27 @@ static struct smca_bank_name smca_names[ + [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, + }; + ++void smca_smu2_ext_err_desc(void) ++{ ++ int i, j; ++ int smu2_bits = 62; ++ ++ /* ++ * MCA_CTL_SMU error stings are defined for b'58:59 and b'62 ++ * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU ++ * ++ * b'0:11 can be decoded from existing array smca_smu2_mce_desc. ++ * b'12:57 are Reserved and b'58:62 are appended to the ++ * smca_smu2_mce_desc. 
++ */ ++ for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) { ++ for ( ; i < 58; i++) ++ smca_smu2_mce_desc[i] = "Reserved"; ++ ++ smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j]; ++ } ++} ++ + void amd_decode_errcode(struct mce_event *e) + { + +@@ -906,6 +936,7 @@ unsigned short xec = (e->status >> 16) & + mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID, + (ipid_high & MCI_IPID_MCATYPE) >> 16); + ++ smca_smu2_ext_err_desc(); + fixup_hwid(m, &mcatype_hwid); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-06-28 10:34:16.453522865 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-06-28 10:34:17.795508302 -0400 +@@ -121,6 +121,7 @@ int set_intel_imc_log(enum cputype cputy + /* Undertake AMD SMCA Error Decoding */ + void decode_smca_error(struct mce_event *e, struct mce_priv *m); + void amd_decode_errcode(struct mce_event *e); ++void smca_smu2_ext_err_desc(void); + + /* Per-CPU-type decoders for Intel CPUs */ + void p4_decode_model(struct mce_event *e); diff --git a/rasdaemon.spec b/rasdaemon.spec index 8818fa1..07eecb5 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,9 +1,8 @@ -%define anolis_release .0.1 Name: rasdaemon Version: 0.6.7 -Release: 10%{?dist} +Release: 15%{?dist} Summary: Utility to receive RAS error tracings -License: GPLv2 +License: GPL-2.0-only URL: http://git.infradead.org/users/mchehab/rasdaemon.git Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2 Patch0: labels.patch @@ -27,21 +26,19 @@ Patch17: 2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch Patch18: 7ccf12f5ae26a055926d175d908c7930293438c4.patch Patch19: 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch Patch20: d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch - -Patch1001: 1001-rasdaemon-Add-notification-support-when-page-goes-of.patch - -Patch2001: 2001-configure.ac-fix-SYSCONFDEFDIR-default-value.patch -Patch2002: 2002-rasdaemon-log-non_standard_event-at-just-one-line.patch -Patch2003: 
2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch -Patch2004: 2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch - -Patch3001: 3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch -Patch3002: 3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch -Patch3003: 3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch -Patch3004: 3004-rasdaemon-ensure-trace_clock-file-exist.patch -Patch3005: 3005-rasdaemon-mce_record-print-just-one-line-on-AMD.patch -Patch3006: 3006-rasdaemon-disable-ce-offline-default.patch - +Patch21: 30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch +Patch22: aa36c96cd52d775570dae989dd95a060f1149077.patch +Patch23: 932118b04a04104dfac6b8536419803f236e6118.patch +Patch24: 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch +Patch25: 2d15882a0cbfce0b905039bebc811ac8311cd739.patch +Patch26: c785d309dcbdeb7ecd219975244f3944a8d047e9.patch +Patch27: b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch +Patch28: 9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch +Patch29: 9bd84aef87978b806178a73ed33c39d6c442fc1f.patch +Patch30: 885e546add918457c453bd3f753ac7df90b39e36.patch +Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch +Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch +Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch ExcludeArch: s390 s390x BuildRequires: make @@ -80,26 +77,46 @@ an utility for reporting current error counts from the EDAC sysfs files. 
%patch1 -p1 %patch2 -p1 %patch3 -p1 -%patch1001 -p1 -%patch2001 -p1 -%patch2002 -p1 -%patch2003 -p1 -%patch2004 -p1 - -%patch3001 -p1 -%patch3002 -p1 -%patch3003 -p1 -%patch3004 -p1 -%patch3005 -p1 -%patch3006 -p1 - +%patch4 -p1 +%patch5 -p1 +%patch6 -p1 +%patch7 -p1 +%patch8 -p1 +%patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 +%patch18 -p1 +%patch19 -p1 +%patch20 -p1 +%patch21 -p1 +%patch22 -p1 +%patch23 -p1 +%patch24 -p1 +%patch25 -p1 +%patch26 -p1 +%patch27 -p1 +%patch28 -p1 +%patch29 -p1 +%patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 + +# The tarball is locked in time the first time aclocal was ran and will keep +# requiring an older version of automake autoreconf -vfi %build %ifarch %{arm} aarch64 -%configure --enable-sqlite3 --enable-aer --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure --enable-arm --enable-hisi-ns-decode --enable-yitian-ns-decode --enable-memory-ce-pfa +%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-arm --enable-hisi-ns-decode %else -%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure --enable-memory-ce-pfa +%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report %endif make %{?_smp_mflags} @@ -112,10 +129,7 @@ mkdir -p %{buildroot}/%{_sharedstatedir}/rasdaemon install -d -p -m 0755 %{buildroot}/%{_sharedstatedir}/rasdaemon mkdir -p %{buildroot}/%{_sysconfdir}/sysconfig install -D -p -m 0644 misc/rasdaemon.env %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon -%ifarch %{arm} aarch64 -install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ -install -D -p -m 0755 misc/notices/* 
%{buildroot}%{_sysconfdir}/rasdaemon_notices/ -%endif +sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon %files %doc AUTHORS ChangeLog COPYING README TODO @@ -124,23 +138,27 @@ install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notice %{_mandir}/*/* %{_unitdir}/*.service %{_sharedstatedir}/rasdaemon -%{_sysconfdir}/ras/dimm_labels.d/ -%config(noreplace) %{_sysconfdir}/sysconfig/%{name} -%ifarch %{arm} aarch64 -%config(noreplace) %{_sysconfdir}/rasdaemon_notices/* -%endif -%config(noreplace) %{_sysconfdir}/ras/triggers/* +%{_sysconfdir}/ras/dimm_labels.d +%{_sysconfdir}/sysconfig/rasdaemon %changelog -* Thu Jul 02 2024 Ruidong Tian - 0.6.7-10 -- rasdaemon: add mce and mc trigger -- rasdaemon: AMD mce record just print one line +* Thu Jul 18 2024 Aristeu Rozanski 0.6.7-14 +- rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819] + +* Fri Jun 28 2024 Aristeu Rozanski 0.6.7-13 +- rasdaemon: Add error decoding for MCA_CTL_SMU extended bits [RHEL-35718] + +* Thu Jun 20 2024 Aristeu Rozanski 0.6.7-12 +- mce-amd-smca: update smca_hwid to use smca_bank_types [RHEL-24170] + +* Wed May 08 2024 Aristeu Rozanski 0.6.7-11 +- Fix excessive block messages [RHEL-8708] -* Thu Sep 02 2023 Ruidong Tian - 0.6.7-9 -- rasdaemon: add decoder to decode yitian ns error +* Wed Jan 10 2024 Aristeu Rozanski 0.6.7-10 +- Update License string to use SPDX [RHELMISC-1262] -* Fri Jun 02 2023 Bixuan Cui - 0.6.7-8.0.1 -- rasdaemon: add notification support when page goes offline for Memory Corrected Error +* Thu Oct 26 2023 Aristeu Rozanski 0.6.7-9 +- Update SMCA support for AMD processors [RHEL-11092] * Tue May 03 2022 Aristeu Rozanski 0.6.7-8 - Update ras-mc-ctl manpage to match current options [2079132] -- Gitee From 031d7cfec52f822d700a6888d37cff694c191583 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Thu, 12 Jan 2023 14:54:25 +0800 Subject: [PATCH 2/4] feat: rasdaemon: Add notification support 
when page goes offline for Memory Corrected Error ANBZ: #1782 When the page goes offline, it may affect the user's processes. The user needs to do some special actions (such as restarting the process) before or after going offline. So add page-ce-offline-pre-notice and page-ce-offline-post-notice to env file of rasdaemon for notifying the user when doing page offline. Signed-off-by: Bixuan Cui --- ...tification-support-when-page-goes-of.patch | 222 ++++++++++++++++++ rasdaemon.spec | 19 +- 2 files changed, 238 insertions(+), 3 deletions(-) create mode 100644 1001-rasdaemon-Add-notification-support-when-page-goes-of.patch diff --git a/1001-rasdaemon-Add-notification-support-when-page-goes-of.patch b/1001-rasdaemon-Add-notification-support-when-page-goes-of.patch new file mode 100644 index 0000000..736fea3 --- /dev/null +++ b/1001-rasdaemon-Add-notification-support-when-page-goes-of.patch @@ -0,0 +1,222 @@ +diff -Nur rasdaemon-0.6.7/Makefile.am rasdaemon-0.6.7_new/Makefile.am +--- rasdaemon-0.6.7/Makefile.am 2023-06-02 15:14:06.995338446 +0800 ++++ rasdaemon-0.6.7_new/Makefile.am 2023-06-02 15:14:33.789545754 +0800 +@@ -2,7 +2,7 @@ + SUBDIRS = libtrace util man + SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in + SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) +-EXTRA_DIST = $(SYSTEMD_SERVICES_IN) misc/rasdaemon.env ++EXTRA_DIST = $(SYSTEMD_SERVICES_IN) misc/rasdaemon.env misc/notices + + # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin + # during ./configure phase, therefore it is not possible to add .service.in +diff -Nur rasdaemon-0.6.7/misc/notices/page-ce-offline-post-notice rasdaemon-0.6.7_new/misc/notices/page-ce-offline-post-notice +--- rasdaemon-0.6.7/misc/notices/page-ce-offline-post-notice 1970-01-01 08:00:00.000000000 +0800 ++++ rasdaemon-0.6.7_new/misc/notices/page-ce-offline-post-notice 2023-06-02 15:16:14.456324620 +0800 +@@ -0,0 +1,17 @@ ++#!/bin/sh ++# This shell script can be 
executed by rasdaemon after a page goes offline. ++ ++cd `dirname $0` ++ ++[ -x ./page-ce-offline-post-notice.local ] && . ./page-ce-offline-post-notice.local $1 ++ ++if [ -d page-ce-offline-post-notice.extern ] ++then ++ ls page-ce-offline-post-notice.extern | ++ while read item ++ do ++ [ -x ./page-ce-offline-post-notice.extern/$item ] && . ./page-ce-offline-post-notice.extern/$item $1 ++ done ++fi ++ ++exit 0 +diff -Nur rasdaemon-0.6.7/misc/notices/page-ce-offline-pre-notice rasdaemon-0.6.7_new/misc/notices/page-ce-offline-pre-notice +--- rasdaemon-0.6.7/misc/notices/page-ce-offline-pre-notice 1970-01-01 08:00:00.000000000 +0800 ++++ rasdaemon-0.6.7_new/misc/notices/page-ce-offline-pre-notice 2023-06-02 15:16:39.440517924 +0800 +@@ -0,0 +1,17 @@ ++#!/bin/sh ++# This shell script can be executed by rasdaemon before a page goes offline. ++ ++cd `dirname $0` ++ ++[ -x ./page-ce-offline-pre-notice.local ] && . ./page-ce-offline-pre-notice.local $1 ++ ++if [ -d page-ce-offline-pre-notice.extern ] ++then ++ ls page-ce-offline-pre-notice.extern | ++ while read item ++ do ++ [ -x ./page-ce-offline-pre-notice.extern/$item ] && . ./page-ce-offline-pre-notice.extern/$item $1 ++ done ++fi ++ ++exit 0 +diff -Nur rasdaemon-0.6.7/misc/rasdaemon.env rasdaemon-0.6.7_new/misc/rasdaemon.env +--- rasdaemon-0.6.7/misc/rasdaemon.env 2023-06-02 15:14:06.994338438 +0800 ++++ rasdaemon-0.6.7_new/misc/rasdaemon.env 2023-06-02 15:17:54.307097173 +0800 +@@ -27,3 +27,7 @@ + # soft-then-hard First try to soft offline, then try hard offlining. + # Note: default offline choice is "soft". 
+ PAGE_CE_ACTION="soft" ++ ++# Notices script when doing memory offline ++PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" ++PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" +diff -Nur rasdaemon-0.6.7/misc/rasdaemon.spec.in rasdaemon-0.6.7_new/misc/rasdaemon.spec.in +--- rasdaemon-0.6.7/misc/rasdaemon.spec.in 2023-06-02 15:14:06.994338438 +0800 ++++ rasdaemon-0.6.7_new/misc/rasdaemon.spec.in 2023-06-02 15:19:03.105629470 +0800 +@@ -46,6 +46,8 @@ + make install DESTDIR=%{buildroot} + install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service + install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service ++install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ ++install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -57,6 +59,7 @@ + %{_sysconfdir}/ras/dimm_labels.d + @SYSCONFDEFDIR@/%{name} + %config(noreplace) @SYSCONFDEFDIR@/%{name} ++%config(noreplace) %{_sysconfdir}/rasdaemon_notices/* + + %changelog + +diff -Nur rasdaemon-0.6.7/ras-page-isolation.c rasdaemon-0.6.7_new/ras-page-isolation.c +--- rasdaemon-0.6.7/ras-page-isolation.c 2023-06-02 15:14:06.995338446 +0800 ++++ rasdaemon-0.6.7_new/ras-page-isolation.c 2023-06-02 16:06:28.020663355 +0800 +@@ -17,12 +17,16 @@ + #include + #include + #include ++#include ++#include ++#include + #include + #include + #include + #include "ras-logger.h" + #include "ras-page-isolation.h" + ++#define MAX_PATH_LEN 64 + #define PARSED_ENV_LEN 50 + static const struct config threshold_units[] = { + { "m", 1000 }, +@@ -76,6 +80,8 @@ + + static enum otype offline = OFFLINE_SOFT; + static struct rb_root page_records; ++static char pre_notice[MAX_PATH_LEN]; ++static char post_notice[MAX_PATH_LEN]; + + static void page_offline_init(void) + { +@@ -205,16 +211,94 @@ + threshold_string, cycle_string); + } + ++static void page_notice_init(void) ++{ ++ char *notice_root = 
"/etc/rasdaemon_notices"; ++ char *pre_re = getenv("PAGE_CE_OFFLINE_PRE_NOTICE"); ++ char *post_re = getenv("PAGE_CE_OFFLINE_POST_NOTICE"); ++ ++ if (offline <= OFFLINE_ACCOUNT) ++ return; ++ ++ snprintf(pre_notice, sizeof(pre_notice), "%s/%s", notice_root, pre_re); ++ if (access(pre_notice, R_OK|X_OK) < 0) ++ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", pre_notice); ++ ++ snprintf(post_notice, sizeof(post_notice), "%s/%s", notice_root, post_re); ++ if (access(post_notice, R_OK|X_OK) < 0) ++ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", post_notice); ++} ++ + void ras_page_account_init(void) + { + page_offline_init(); + page_isolation_init(); ++ page_notice_init(); ++} ++ ++static void finish_child(pid_t child, int status) ++{ ++ if (WIFEXITED(status) && WEXITSTATUS(status)) { ++ log(TERM, LOG_INFO, "notice exited with status %d\n", WEXITSTATUS(status)); ++ } else if (WIFSIGNALED(status)) { ++ log(TERM, LOG_INFO,"notice died with signal %s\n", strsignal(WTERMSIG(status))); ++ } ++ ++ return; + } + ++static void __run_notice(char *argv[], char **env) ++{ ++ pid_t child; ++ int status; ++ ++ child = fork(); ++ if (child < 0) { ++ log(TERM, LOG_ERR, "Cannot create process for offline notice"); ++ return; ++ } ++ if (child == 0) { ++ execve(argv[0], argv, env); ++ _exit(127); ++ } ++ else { ++ waitpid(child, &status, 0); ++ finish_child(child, status); ++ } ++} ++ ++static void run_notice(char *argv[]) ++{ ++ int MAX_ENV = 20; ++ char *env[MAX_ENV]; ++ int ei = 0; ++ int i; ++ ++ asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin"); ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ __run_notice(argv, env); ++ ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++ } ++ + static int do_page_offline(unsigned long long addr, enum otype type) + { + int fd, rc; + char buf[20]; ++ char *args; ++ char *argv[] = { ++ NULL, ++ NULL, ++ NULL, ++ }; ++ ++ asprintf(&args, "%llu", addr); ++ argv[0] = (char*)&pre_notice; ++ argv[1] = args; ++ 
run_notice(argv); + + fd = open(kernel_offline[type], O_WRONLY); + if (fd == -1) { +@@ -228,6 +312,12 @@ + log(TERM, LOG_ERR, "page offline addr(%s) by %s failed, errno:%d\n", buf, kernel_offline[type], errno); + } + close(fd); ++ ++ argv[0] = (char*)&post_notice; ++ run_notice(argv); ++ ++ free(args); ++ + return rc; + } + diff --git a/rasdaemon.spec b/rasdaemon.spec index 07eecb5..1a5e948 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,7 @@ +%define anolis_release .0.1 Name: rasdaemon Version: 0.6.7 -Release: 15%{?dist} +Release: 15%{anolis_release}%{?dist} Summary: Utility to receive RAS error tracings License: GPL-2.0-only URL: http://git.infradead.org/users/mchehab/rasdaemon.git @@ -40,6 +41,8 @@ Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch +Patch1001: 1001-rasdaemon-Add-notification-support-when-page-goes-of.patch + ExcludeArch: s390 s390x BuildRequires: make BuildRequires: gcc @@ -107,9 +110,8 @@ an utility for reporting current error counts from the EDAC sysfs files. 
%patch31 -p1 %patch32 -p1 %patch33 -p1 +%patch1001 -p1 -# The tarball is locked in time the first time aclocal was ran and will keep -# requiring an older version of automake autoreconf -vfi %build @@ -130,6 +132,10 @@ install -d -p -m 0755 %{buildroot}/%{_sharedstatedir}/rasdaemon mkdir -p %{buildroot}/%{_sysconfdir}/sysconfig install -D -p -m 0644 misc/rasdaemon.env %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon +%ifarch %{arm} aarch64 +install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ +install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ +%endif %files %doc AUTHORS ChangeLog COPYING README TODO @@ -140,8 +146,15 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir %{_sharedstatedir}/rasdaemon %{_sysconfdir}/ras/dimm_labels.d %{_sysconfdir}/sysconfig/rasdaemon +%ifarch %{arm} aarch64 +%config(noreplace) %{_sysconfdir}/sysconfig/%{name} +%config(noreplace) %{_sysconfdir}/rasdaemon_notices/* +%endif %changelog +* Thu Dec 05 2024 Bixuan Cui - 0.6.7-15.0.1 +- rasdaemon: add notification support when page goes offline for Memory Corrected Error + * Thu Jul 18 2024 Aristeu Rozanski 0.6.7-14 - rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819] -- Gitee From 7419fc541605d62b220c4caa3ff93d790c3f6fd4 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Wed, 13 Sep 2023 11:48:27 +0800 Subject: [PATCH 3/4] spec: add yitian ns decode feature Signed-off-by: Ruidong Tian --- ...-non_standard_event-at-just-one-line.patch | 48 ++ ...pport-for-THead-Yitian-non-standard-.patch | 409 ++++++++++++++++++ ...-ctl-Add-support-to-display-the-THea.patch | 105 +++++ rasdaemon.spec | 12 +- 4 files changed, 572 insertions(+), 2 deletions(-) create mode 100644 2002-rasdaemon-log-non_standard_event-at-just-one-line.patch create mode 100644 2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch 
create mode 100644 2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch diff --git a/2002-rasdaemon-log-non_standard_event-at-just-one-line.patch b/2002-rasdaemon-log-non_standard_event-at-just-one-line.patch new file mode 100644 index 0000000..1d85b12 --- /dev/null +++ b/2002-rasdaemon-log-non_standard_event-at-just-one-line.patch @@ -0,0 +1,48 @@ +From 9e407134b86f7a176970be70121e08cac6cad3ff Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 7 Sep 2023 18:19:40 +0800 +Subject: [PATCH 2/4] rasdaemon: log non_standard_event at just one line + +It is more reasonable log non_standard_event in one line exclude errors +dump. So you can easily to get decoded non_standard_event log in one +line if you implement a decoder like other event. + +Signed-off-by: Ruidong Tian +--- + ras-non-standard-handler.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +index 6ccf5bc..7818ed8 100644 +--- a/ras-non-standard-handler.c ++++ b/ras-non-standard-handler.c +@@ -174,7 +174,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, + case GHES_SEV_PANIC: + ev.severity = "Fatal"; + } +- trace_seq_printf(s, "\n %s", ev.severity); ++ trace_seq_printf(s, " %s", ev.severity); + + ev.sec_type = pevent_get_field_raw(s, event, "sec_type", + record, &len, 1); +@@ -185,7 +185,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, + trace_seq_printf(s, "\n section type: %s", + "Ampere Specific Error\n"); + else +- trace_seq_printf(s, "\n section type: %s", ++ trace_seq_printf(s, " section type: %s", + uuid_le(ev.sec_type)); + ev.fru_text = pevent_get_field_raw(s, event, "fru_text", + record, &len, 1); +@@ -198,7 +198,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, + if (pevent_get_field_val(s, event, "len", record, &val, 1) < 0) + return -1; + ev.length = val; +- trace_seq_printf(s, "\n length: %d\n", ev.length); ++ trace_seq_printf(s, " length: %d", ev.length); + + 
ev.error = pevent_get_field_raw(s, event, "buf", record, &len, 1); + if(!ev.error) +-- +2.33.1 + diff --git a/2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch b/2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch new file mode 100644 index 0000000..519f4d7 --- /dev/null +++ b/2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch @@ -0,0 +1,409 @@ +From dbc5d5a9ba57ef3f84eb09c9ca658c96219a1736 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 7 Sep 2023 18:21:05 +0800 +Subject: [PATCH 3/4] rasdaemon: add support for THead Yitian non-standard + error decoder + +Add a new non-standard error decoder to decode THead YiTian error +section. Put all related code to a new source file. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 7 +- + configure.ac | 11 ++ + non-standard-yitian.c | 251 ++++++++++++++++++++++++++++++++++++++++++ + non-standard-yitian.h | 73 ++++++++++++ + 4 files changed, 341 insertions(+), 1 deletion(-) + create mode 100644 non-standard-yitian.c + create mode 100644 non-standard-yitian.h + +diff --git a/Makefile.am b/Makefile.am +index fabca78..7cbc81e 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -63,13 +63,18 @@ endif + if WITH_AMP_NS_DECODE + rasdaemon_SOURCES += non-standard-ampere.c + endif ++if WITH_YITIAN_NS_DECODE ++ rasdaemon_SOURCES += non-standard-yitian.c ++endif ++ + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a + + include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ + ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ +- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h ++ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ ++ non-standard-yitian.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # 
I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 33b81fe..a02cca3 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], + AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"]) + ++AC_ARG_ENABLE([yitian_ns_decode], ++ AS_HELP_STRING([--enable-yitian-ns-decode], [enable YITIAN_NS_DECODE events (currently experimental)])) ++ ++AS_IF([test "x$enable_yitian_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_YITIAN_NS_DECODE,1,"have YITIAN UNKNOWN_SEC events decode") ++ AC_SUBST([WITH_YITIAN_NS_DECODE]) ++]) ++AM_CONDITIONAL([WITH_YITIAN_NS_DECODE], [test x$enable_yitian_ns_decode = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_YITIAN_NS_DECODE], [USE_YITIAN_NS_DECODE="yes"], [USE_YITIAN_NS_DECODE="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -201,4 +211,5 @@ compile time options summary + Memory Failure : $USE_MEMORY_FAILURE + Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE ++ YITIAN RAS errors : $USE_YITIAN_NS_DECODE + EOF +diff --git a/non-standard-yitian.c b/non-standard-yitian.c +new file mode 100644 +index 0000000..99cea47 +--- /dev/null ++++ b/non-standard-yitian.c +@@ -0,0 +1,251 @@ ++/* ++ * Copyright (C) 2023 Alibaba Inc ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include "ras-record.h" ++#include "ras-logger.h" ++#include "ras-report.h" ++#include "ras-non-standard-handler.h" ++#include "non-standard-yitian.h" ++ ++static const char * const yitian_ddr_payload_err_reg_name[] = { ++ "Error Type:", ++ "Error SubType:", ++ "Error Instance:", ++ "ECCCFG0:", ++ "ECCCFG1:", ++ "ECCSTAT:", ++ "ECCERRCNT:", ++ "ECCCADDR0:", ++ "ECCCADDR1:", ++ "ECCCSYN0:", ++ "ECCCSYN1:", ++ "ECCCSYN2:", ++ "ECCUADDR0:", ++ "ECCUADDR1:", ++ "ECCUSYN0:", ++ "ECCUSYN1:", ++ "ECCUSYN2:", ++ "ECCBITMASK0:", ++ "ECCBITMASK1:", ++ "ECCBITMASK2:", ++ "ADVECCSTAT:", ++ "ECCAPSTAT:", ++ "ECCCDATA0:", ++ "ECCCDATA1:", ++ "ECCUDATA0:", ++ "ECCUDATA1:", ++ "ECCSYMBOL:", ++ "ECCERRCNTCTL:", ++ "ECCERRCNTSTAT:", ++ "ECCERRCNT0:", ++ "ECCERRCNT1:", ++ "RESERVED0:", ++ "RESERVED1:", ++ "RESERVED2:", ++}; ++ ++struct yitian_ras_type_info { ++ int id; ++ const char *name; ++ const char * const *sub; ++ int sub_num; ++}; ++ ++static const struct yitian_ras_type_info yitian_payload_error_type[] = { ++ { ++ .id = YITIAN_RAS_TYPE_DDR, ++ .name = "DDR", ++ }, ++ { ++ } ++}; ++ ++#ifdef HAVE_SQLITE3 ++static const struct db_fields yitian_ddr_payload_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "address", .type = "INTEGER" }, ++ { .name = "regs_dump", .type = "TEXT" }, ++}; ++ ++static const struct db_table_descriptor yitian_ddr_payload_section_tab = { ++ .name = "yitian_ddr_reg_dump_event", ++ .fields = yitian_ddr_payload_fields, ++ .num_fields = ARRAY_SIZE(yitian_ddr_payload_fields), ++}; ++ ++int record_yitian_ddr_reg_dump_event(struct ras_ns_ev_decoder *ev_decoder, ++ struct ras_yitian_ddr_payload_event *ev) ++{ ++ int rc; ++ struct sqlite3_stmt *stmt = ev_decoder->stmt_dec_record; ++ ++ log(TERM, LOG_INFO, "yitian_ddr_reg_dump_event store: %p\n", stmt); ++ ++ sqlite3_bind_text (stmt, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_int64 
(stmt, 2, ev->address); ++ sqlite3_bind_text (stmt, 3, ev->reg_msg, -1, NULL); ++ ++ rc = sqlite3_step(stmt); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do yitian_ddr_reg_dump_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(stmt); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset yitian_ddr_reg_dump_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ ++static const char *oem_type_name(const struct yitian_ras_type_info *info, ++ uint8_t type_id) ++{ ++ const struct yitian_ras_type_info *type = &info[0]; ++ ++ for (; type->name; type++) { ++ if (type->id != type_id) ++ continue; ++ return type->name; ++ } ++ return "unknown"; ++} ++ ++static const char *oem_subtype_name(const struct yitian_ras_type_info *info, ++ uint8_t type_id, uint8_t sub_type_id) ++{ ++ const struct yitian_ras_type_info *type = &info[0]; ++ ++ for (; type->name; type++) { ++ const char * const *submodule = type->sub; ++ ++ if (type->id != type_id) ++ continue; ++ if (type->sub == NULL) ++ return type->name; ++ if (sub_type_id >= type->sub_num) ++ return "unknown"; ++ return submodule[sub_type_id]; ++ } ++ return "unknown"; ++} ++ ++void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, ++ struct trace_seq *s, ++ const struct yitian_ddr_payload_type_sec *err, ++ struct ras_events *ras) ++{ ++ char buf[1024]; ++ char *p = buf; ++ char *end = buf + 1024; ++ int i = 0; ++ const struct yitian_payload_header *header = &err->header; ++ uint32_t *pstart; ++ time_t now; ++ struct tm *tm; ++ struct ras_yitian_ddr_payload_event ev; ++ ++ const char *type_str = oem_type_name(yitian_payload_error_type, ++ header->type); ++ ++ const char *subtype_str = oem_subtype_name(yitian_payload_error_type, ++ header->type, header->subtype); ++ ++#ifdef HAVE_SQLITE3 ++ if (ras->record_events && !ev_decoder->stmt_dec_record) { ++ if 
(ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, ++ &yitian_ddr_payload_section_tab) != SQLITE_OK) { ++ trace_seq_printf(s, "create sql fail\n"); ++ return; ++ } ++ } ++#endif ++ ++ now = time(NULL); ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ //display error type ++ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); ++ p += snprintf(p, end - p, " %s,", type_str); ++ ++ //display error subtype ++ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); ++ p += snprintf(p, end - p, " %s,", subtype_str); ++ ++ //display error instance ++ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); ++ p += snprintf(p, end - p, " 0x%x,", header->instance); ++ ++ //display reg dump ++ for (pstart = (uint32_t *)&err->ecccfg0; (void *)pstart < (void *)(err + 1); pstart += 1) { ++ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); ++ p += snprintf(p, end - p, " 0x%x ", *pstart); ++ } ++ ++ if (p > buf && p < end) { ++ p--; ++ *p = '\0'; ++ } ++ ++ ev.reg_msg = malloc(p - buf + 1); ++ memcpy(ev.reg_msg, buf, p - buf + 1); ++ ev.address = 0; ++ ++ i = 0; ++ p = NULL; ++ end = NULL; ++ trace_seq_printf(s, "%s\n", buf); ++ ++#ifdef HAVE_SQLITE3 ++ record_yitian_ddr_reg_dump_event(ev_decoder, &ev); ++#endif ++ ++} ++ ++/* error data decoding functions */ ++static int decode_yitian710_ns_error(struct ras_events *ras, ++ struct ras_ns_ev_decoder *ev_decoder, ++ struct trace_seq *s, ++ struct ras_non_standard_event *event) ++{ ++ int payload_type = event->error[0]; ++ ++ if (payload_type == YITIAN_RAS_TYPE_DDR) { ++ const struct yitian_ddr_payload_type_sec *err = ++ (struct yitian_ddr_payload_type_sec *)event->error; ++ decode_yitian_ddr_payload_err_regs(ev_decoder, s, err, ras); ++ } else { ++ trace_seq_printf(s, "%s: wrong payload type\n", __func__); ++ return -1; ++ } ++ return 0; ++} ++ ++struct ras_ns_ev_decoder 
yitian_ns_oem_decoder[] = { ++ { ++ .sec_type = "a698081116ea4e4db936fb00a23ff29c", ++ .decode = decode_yitian710_ns_error, ++ }, ++}; ++ ++static void __attribute__((constructor)) yitian_ns_init(void) ++{ ++ int i; ++ for (i = 0; i < ARRAY_SIZE(yitian_ns_oem_decoder); i++) ++ register_ns_ev_decoder(&yitian_ns_oem_decoder[i]); ++} +diff --git a/non-standard-yitian.h b/non-standard-yitian.h +new file mode 100644 +index 0000000..b7d6a2d +--- /dev/null ++++ b/non-standard-yitian.h +@@ -0,0 +1,73 @@ ++/* ++ * Copyright (C) 2023 Alibaba Inc ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ */ ++ ++ ++#ifndef __NON_STANDARD_YITIAN_H ++#define __NON_STANDARD_YITIAN_H ++ ++#include "ras-events.h" ++#include "libtrace/event-parse.h" ++ ++#define YITIAN_RAS_TYPE_DDR 0x50 ++ ++struct yitian_payload_header { ++ uint8_t type; ++ uint8_t subtype; ++ uint16_t instance; ++}; ++ ++struct yitian_ddr_payload_type_sec { ++ struct yitian_payload_header header; ++ uint32_t ecccfg0; ++ uint32_t ecccfg1; ++ uint32_t eccstat; ++ uint32_t eccerrcnt; ++ uint32_t ecccaddr0; ++ uint32_t ecccaddr1; ++ uint32_t ecccsyn0; ++ uint32_t ecccsyn1; ++ uint32_t ecccsyn2; ++ uint32_t eccuaddr0; ++ uint32_t eccuaddr1; ++ uint32_t eccusyn0; ++ uint32_t eccusyn1; ++ uint32_t eccusyn2; ++ uint32_t eccbitmask0; ++ uint32_t eccbitmask1; ++ uint32_t eccbitmask2; ++ uint32_t adveccstat; ++ uint32_t eccapstat; ++ uint32_t ecccdata0; ++ uint32_t ecccdata1; ++ uint32_t eccudata0; ++ uint32_t eccudata1; ++ uint32_t eccsymbol; ++ uint32_t eccerrcntctl; ++ uint32_t eccerrcntstat; ++ uint32_t eccerrcnt0; ++ uint32_t eccerrcnt1; ++ uint32_t reserved0; ++ uint32_t reserved1; ++ uint32_t reserved2; ++}; ++ ++struct ras_yitian_ddr_payload_event { ++ char timestamp[64]; ++ unsigned long long address; ++ 
char *reg_msg; ++}; ++ ++int record_yitian_ddr_reg_dump_event(struct ras_ns_ev_decoder *ev_decoder, ++ struct ras_yitian_ddr_payload_event *ev); ++void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, ++ struct trace_seq *s, ++ const struct yitian_ddr_payload_type_sec *err, ++ struct ras_events *ras); ++#endif +-- +2.33.1 + diff --git a/2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch b/2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch new file mode 100644 index 0000000..b508066 --- /dev/null +++ b/2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch @@ -0,0 +1,105 @@ +From 2e30517b9584ee8ae99553400168e07afce8ff9c Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 7 Sep 2023 18:22:06 +0800 +Subject: [PATCH 4/4] rasdaemon: ras-mc-ctl: Add support to display the THead + vendor errors + +Add support for the THead YiTian DDRC register dump event. + +Signed-off-by: Ruidong Tian +--- + util/ras-mc-ctl.in | 43 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 1e3aeb7..d30fca4 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1528,6 +1528,7 @@ sub errors + use constant { + HISILICON_KUNPENG_920 => "Kunpeng920", + HISILICON_KUNPENG_9XX => "Kunpeng9xx", ++ THEAD_YITIAN_7XX => "YiTian7XX", + }; + + sub vendor_errors_summary +@@ -1536,6 +1537,7 @@ sub vendor_errors_summary + my ($num_args, $platform_id); + my ($query, $query_handle, $count, $out); + my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info); ++ my ($address); + + $num_args = $#ARGV + 1; + $platform_id = 0; +@@ -1628,6 +1630,24 @@ sub vendor_errors_summary + $query_handle->finish; + } + ++ # THead Yitian710 DDR errors ++ if ($platform_id eq THEAD_YITIAN_7XX) { ++ $query = "select address, count(*) from yitian_ddr_reg_dump_event"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ 
$query_handle->bind_columns(\($address, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\terrors: $count"; ++ } ++ if ($out ne "") { ++ print "THead YiTian710 DDR error dump events summary:\n$out\n"; ++ } else { ++ print "No THead YiTian710 DDR error dump errors.\n\n"; ++ } ++ $query_handle->finish; ++ } ++ + undef($dbh); + } + +@@ -1638,6 +1658,7 @@ sub vendor_errors + my ($query, $query_handle, $id, $timestamp, $out); + my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id); + my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs); ++ my ($address, $regs_dump); + + $num_args = $#ARGV + 1; + $platform_id = 0; +@@ -1743,6 +1764,27 @@ sub vendor_errors + $query_handle->finish; + } + ++ # THead Yitian7xx ddr errors ++ if ($platform_id eq THEAD_YITIAN_7XX) { ++ $query = "select id, timestamp, address, regs_dump from yitian_ddr_reg_dump_event order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $address, $regs_dump)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id. 
$timestamp "; ++ $out .= "Error Address: $address "; ++ $out .= "Error Registers Dump: $regs_dump" if ($regs_dump); ++ $out .= "\n\n"; ++ } ++ if ($out ne "") { ++ print "THead Yitian710 DDRC error events:\n$out\n"; ++ } else { ++ print "No THead Yitian710 DDRC error events.\n"; ++ } ++ $query_handle->finish; ++ } ++ + undef($dbh); + } + +@@ -1751,6 +1793,7 @@ sub vendor_platforms + print "\nSupported platforms for the vendor-specific errors:\n"; + print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n"; + print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; ++ print "\tTHead Yitian7xx, platform-id=\"", THEAD_YITIAN_7XX, "\"\n"; + print "\n"; + } + +-- +2.33.1 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 1a5e948..c21cc6b 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -43,6 +43,10 @@ Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch Patch1001: 1001-rasdaemon-Add-notification-support-when-page-goes-of.patch +Patch2002: 2002-rasdaemon-log-non_standard_event-at-just-one-line.patch +Patch2003: 2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch +Patch2004: 2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch + ExcludeArch: s390 s390x BuildRequires: make BuildRequires: gcc @@ -111,14 +115,17 @@ an utility for reporting current error counts from the EDAC sysfs files. 
%patch32 -p1 %patch33 -p1 %patch1001 -p1 +%patch2002 -p1 +%patch2003 -p1 +%patch2004 -p1 autoreconf -vfi %build %ifarch %{arm} aarch64 -%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-arm --enable-hisi-ns-decode +%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure --enable-arm --enable-hisi-ns-decode --enable-yitian-ns-decode %else -%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report +%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure %endif make %{?_smp_mflags} @@ -154,6 +161,7 @@ install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notice %changelog * Thu Dec 05 2024 Bixuan Cui - 0.6.7-15.0.1 - rasdaemon: add notification support when page goes offline for Memory Corrected Error +- rasdaemon: add decoder to decode yitian ns error (tianruidong@linux.alibaba.com) * Thu Jul 18 2024 Aristeu Rozanski 0.6.7-14 - rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819] -- Gitee From ca9d2a9f25173115a7afb71d29f44dbac003cfa6 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Tue, 2 Jul 2024 17:19:11 +0800 Subject: [PATCH 4/4] update rasdaemon - add mc and mce trigger - enable memory ce failure - AMD mce record print one line - disable amp record default Signed-off-by: Ruidong Tian --- ...-add-mc_event-and-mce_record-trigger.patch | 660 ++++++++++++++++++ ...rocess-Ampere-specific-error-in-the-.patch | 104 +++ ...e-issue-of-sprintf-data-type-mismatc.patch | 56 ++ ...daemon-ensure-trace_clock-file-exist.patch | 54 ++ ...rasdaemon-disable-ce-offline-default.patch | 48 ++ rasdaemon.spec | 26 +- 6 files changed, 942 
insertions(+), 6 deletions(-) create mode 100644 3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch create mode 100644 3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch create mode 100644 3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch create mode 100644 3004-rasdaemon-ensure-trace_clock-file-exist.patch create mode 100644 3006-rasdaemon-disable-ce-offline-default.patch diff --git a/3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch b/3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch new file mode 100644 index 0000000..908333a --- /dev/null +++ b/3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch @@ -0,0 +1,660 @@ +From 0fd49ba8f1af285c7f607b3c8a669942631fd259 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 7 Jun 2024 11:26:06 +0800 +Subject: [PATCH 1/6] rasdaemon: add mc_event and mce_record trigger + +Allow users to run a trigger when mc_event and mce_record occur. The +trigger is separated into CE trigger and UE trigger, this is because +CE is more frequent than UE, and the CE trigger will lead to more +performance hits. Users can choose different triggers for CE/UE to +reduce this effect. + +To prevent triggering hangs or consuming excessive time, there is a +default timeout of 1s, trigger will be killed if timeout, user can +modify timeout by setting environment *_TIMEOUT or delete timeout by +setting *_TIMEOUT to 0. + +Environment of trigger in /etc/sysconfig/rasdaemon: + +TRIGGER_DIR: The trigger directory + +MC_CE_TRIGGER: The script executed when corrected mc_event occurs. +MC_CE_TRIGGER_TIMEOUT: Timeout(seconds) for MC_CE_TRIGGER, set 0 to +delete timeout. +MC_UE_TRIGGER: The script executed when uncorrected mc_event occurs. +MC_UE_TRIGGER_TIMEOUT: Timeout(seconds) for MC_UE_TRIGGER, set 0 to +delete timeout. + +MCE_CE_TRIGGER: The script executed when corrected mce_record occurs. +MCE_CE_TRIGGER_TIMEOUT: Timeout(seconds) for MCE_CE_TRIGGER, set 0 to +delete timeout. 
+MCE_UE_TRIGGER: The script executed when uncorrected mce_record occurs. +MCE_UE_TRIGGER_TIMEOUT: Timeout(seconds) for MCE_UE_TRIGGER, set 0 to +delete timeout. + +No script will be executed if *_CE_TRIGGER/*_UE_TRIGGER is null. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 8 +- + contrib/mc_event_trigger | 24 +++++ + contrib/mce_record_trigger | 36 ++++++++ + misc/rasdaemon.env | 31 +++++++ + ras-events.c | 3 + + ras-mc-handler.c | 63 +++++++++++++ + ras-mce-handler.c | 89 ++++++++++++++++++ + trigger.c | 184 +++++++++++++++++++++++++++++++++++++ + trigger.h | 29 ++++++ + 9 files changed, 463 insertions(+), 4 deletions(-) + create mode 100755 contrib/mc_event_trigger + create mode 100755 contrib/mce_record_trigger + create mode 100644 trigger.c + create mode 100644 trigger.h + +diff --git a/Makefile.am b/Makefile.am +index f410c6d..2e4fe39 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -17,7 +17,7 @@ all-local: $(SYSTEMD_SERVICES) + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +- bitfield.c ++ bitfield.c trigger.c + if WITH_SQLITE3 + rasdaemon_SOURCES += ras-record.c + endif +@@ -74,7 +74,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ +- non-standard-yitian.h ++ non-standard-yitian.h trigger.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +@@ -101,6 +101,6 @@ upload: + # custom target + install-data-local: + $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" +-if WITH_MEMORY_CE_PFA ++ $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers" + $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" +-endif ++ $(install_sh) 
@abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/" +diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger +new file mode 100755 +index 0000000..5c6ccfa +--- /dev/null ++++ b/contrib/mc_event_trigger +@@ -0,0 +1,24 @@ ++#!/bin/sh ++# This shell script can be executed by rasdaemon in daemon mode when a ++# mc_event has occurred, environment variables include all information ++# reported by tracepoint. ++# ++# environment: ++# TIMESTAMP Timestamp when error occurred ++# COUNT Number of errors of the same type ++# TYPE Error type from Corrected/Uncorrected ++# MESSAGE Error message ++# LABEL Label of the affected DIMM(s) ++# MC_INDEX DIMM identifier from DMI/SMBIOS if available ++# TOP_LAYER Top layer of the error ++# MIDDLE_LAYER Middle layer of the error ++# LOWER_LAYER Low layer of the error ++# ADDRESS Error address ++# GRAIN Minimum granularity for an error report, in bytes ++# SYNDROME Syndrome of the error (or 0 if unknown or if the syndrome is not applicable) ++# DRIVER_DETAIL Other driver-specific detail about the error ++# ++ ++[ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local ++ ++exit 0 +diff --git a/contrib/mce_record_trigger b/contrib/mce_record_trigger +new file mode 100755 +index 0000000..06a52d9 +--- /dev/null ++++ b/contrib/mce_record_trigger +@@ -0,0 +1,36 @@ ++#!/bin/sh ++# This shell script can be executed by rasdaemon in daemon mode when a ++# mce_record has occurred, environment variables include all information ++# reported by tracepoint. 
++# ++# environment: ++# MCGCAP MCGCAP MSR: machine check capabilities of CPU ++# MCGSTATUS Machine Check Global Status MSR ++# STATUS Bank's MCi_STATUS MSR ++# ADDR Bank's MCi_ADDR MSR ++# MISC Bank's MCi_MISC MSR ++# IP Instruction Pointer when the error happened ++# TSC CPU time stamp counter ++# WALLTIME Wall time_t when error was detected ++# CPU CPU number; obsoleted by extcpu ++# CPUID CPUID 1 EAX ++# APICID CPU initial APIC ID ++# SOCKETID CPU socket ID ++# CS Code segment ++# BANK Machine check bank reporting the error ++# CPUVENDOR Kernel's X86_VENDOR enum ++# SYND MCA_SYND MSR: only valid on SMCA systems ++# IPID MCA_IPID MSR: only valid on SMCA systems ++# TIMESTAMP Rasdaemon timestamp ++# BANK_NAME Decode bank name ++# ERROR_MSG Vendor define error message ++# MCGSTATUS_MSG Decode mcgstatus ++# MCISTATUS_MSG Decode mcistatus ++# MCASTATUS_MSG Decode mcastatus ++# USER_ACTION Recommendations for actions users should take ++# MC_LOCATION Error location in MC ++# ++ ++[ -x ./mce_record_trigger.local ] && . ./mce_record_trigger.local ++ ++exit 0 +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 713875a..9f8e606 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -31,3 +31,34 @@ PAGE_CE_ACTION="soft" + # Notices script when doing memory offline + PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" + PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" ++ ++# Event Trigger ++ ++# Event trigger will be executed when the specified event occurs. ++# ++# Execute triggers path ++# For example: TRIGGER_DIR=/etc/ras/triggers ++TRIGGER_DIR= ++ ++# Execute these triggers when the mc_event occurred, the triggers will not ++# be executed if the trigger is not specified. ++# You can set timeout for trigger, trigger thread will be killed if timeout. ++# The default timeout is 1, if you do not want any timeout, set it to 0. 
++# For example: ++# MC_CE_TRIGGER=mc_event_trigger ++# MC_UE_TRIGGER=mc_event_trigger ++# MC_CE_TRIGGER_TIMEOUT=1 ++# MC_UE_TRIGGER_TIMEOUT=1 ++ ++# trigger for mc_event ++MC_CE_TRIGGER= ++MC_UE_TRIGGER= ++MC_CE_TRIGGER_TIMEOUT=0 ++MC_UE_TRIGGER_TIMEOUT=0 ++ ++# trigger for mce_record ++MCE_CE_TRIGGER= ++MCE_UE_TRIGGER= ++MCE_CE_TRIGGER_TIMEOUT=0 ++MCE_UE_TRIGGER_TIMEOUT=0 ++ +diff --git a/ras-events.c b/ras-events.c +index fe4bd26..016f531 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -41,6 +41,7 @@ + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-page-isolation.h" ++#include "trigger.h" + + /* + * Polling time, if read() doesn't block. Currently, trace_pipe_raw never +@@ -815,6 +816,8 @@ int handle_ras_events(int record_events) + ras_page_account_init(); + #endif + ++ trigger_setup(); ++ + rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event", + ras_mc_event_handler, NULL, MC_EVENT); + if (!rc) +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index 42b05cd..0081d95 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -15,16 +15,73 @@ + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ ++#define _GNU_SOURCE + #include + #include + #include + #include + #include "libtrace/kbuffer.h" ++#include + #include "ras-mc-handler.h" + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-page-isolation.h" + #include "ras-report.h" ++#include "trigger.h" ++ ++struct event_trigger mc_ce_trigger = {"mc_event", "MC_CE_TRIGGER"}; ++struct event_trigger mc_ue_trigger = {"mc_event", "MC_UE_TRIGGER"}; ++ ++static void run_mc_trigger(struct ras_mc_event *ev, ++ struct event_trigger *trigger, ++ struct trace_seq *s) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0, i; ++ char msg[4096]; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: 
"/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "COUNT=%d", ev->error_count) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TYPE=%s", ev->error_type) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MESSAGE=%s", ev->msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "LABEL=%s", ev->label) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MC_INDEX=%d", ev->mc_index) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TOP_LAYER=%d", ev->top_layer) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MIDDLE_LAYER=%d", ev->middle_layer) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "LOWER_LAYER=%d", ev->lower_layer) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ADDRESS=%llx", ev->address) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "GRAIN=%lld", ev->grain) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "SYNDROME=%llx", ev->syndrome) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DRIVER_DETAIL=%s", ev->driver_detail) < 0) ++ goto free; ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env, "mc_event", msg); ++ ++ trace_seq_printf(s, " %s", msg); ++ ++free: ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++} + + int ras_mc_event_handler(struct trace_seq *s, + struct pevent_record *record, +@@ -195,6 +252,12 @@ int ras_mc_event_handler(struct trace_seq *s, + ras_report_mc_event(ras, &ev); + #endif + ++ if (!strcmp(ev.error_type, "Corrected")) ++ run_mc_trigger(&ev, &mc_ce_trigger, s); ++ ++ if (!strcmp(ev.error_type, "Uncorrected")) ++ run_mc_trigger(&ev, &mc_ue_trigger, s); ++ + return 0; + + parse_error: +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 805004a..ac2c4a1 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -15,6 +15,7 @@ + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ ++#define _GNU_SOURCE + #include + 
#include + #include +@@ -22,11 +23,13 @@ + #include + #include + #include ++#include + #include "libtrace/kbuffer.h" + #include "ras-mce-handler.h" + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-report.h" ++#include "trigger.h" + + /* + * The code below were adapted from Andi Kleen/Intel/SuSe mcelog code, +@@ -233,6 +236,85 @@ ret: + return ret; + } + ++struct event_trigger mce_ce_trigger = {"mce_record", "MCE_CE_TRIGGER"}; ++struct event_trigger mce_de_trigger = {"mce_record", "MCE_DE_TRIGGER"}; ++struct event_trigger mce_ue_trigger = {"mce_record", "MCE_UE_TRIGGER"}; ++ ++static void run_mce_trigger(struct mce_event *e, ++ struct event_trigger *trigger, ++ struct trace_seq *s) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0, i; ++ char msg[4096]; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCGCAP=%#lx", e->mcgcap) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCGSTATUS=%#lx", e->mcgstatus) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "STATUS=%#lx", e->status) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ADDR=%#lx", e->addr) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MISC=%#lx", e->misc) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "IP=%#lx", e->ip) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TSC=%#lx", e->tsc) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "WALLTIME=%#lx", e->walltime) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CPU=%#x", e->cpu) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CPUID=%#x", e->cpuid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "APICID=%#x", e->apicid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "SOCKETID=%#x", e->socketid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CS=%#x", e->cs) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BANK=%#x", e->bank) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CPUVENDOR=%#x", e->cpuvendor) 
< 0) ++ goto free; ++ if (asprintf(&env[ei++], "SYND=%#lx", e->synd) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "IPID=%#lx", e->ipid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TIMESTAMP=%s", e->timestamp) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BANK_NAME=%s", e->bank_name) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ERROR_MSG=%s", e->error_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCGSTATUS_MSG=%s", e->mcgstatus_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCISTATUS_MSG=%s", e->mcistatus_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCASTATUS_MSG=%s", e->mcastatus_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "USER_ACTION=%s", e->user_action) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MC_LOCATION=%s", e->mc_location) < 0) ++ goto free; ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env, "mce_record", msg); ++ ++ trace_seq_printf(s, " %s", msg); ++ ++free: ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++} ++ + int register_mce_handler(struct ras_events *ras, unsigned ncpus) + { + int rc; +@@ -480,5 +562,12 @@ int ras_mce_event_handler(struct trace_seq *s, + ras_report_mce_event(ras, &e); + #endif + ++ if (e.status & MCI_STATUS_UC) ++ run_mce_trigger(&e, &mce_ue_trigger, s); ++ else if (e.status & MCI_STATUS_DEFERRED) ++ run_mce_trigger(&e, &mce_de_trigger, s); ++ else ++ run_mce_trigger(&e, &mce_ce_trigger, s); ++ + return 0; + } +diff --git a/trigger.c b/trigger.c +new file mode 100644 +index 0000000..8716f50 +--- /dev/null ++++ b/trigger.c +@@ -0,0 +1,184 @@ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++#include ++#include "ras-logger.h" ++#include "trigger.h" ++#include ++ ++#define READ 0 ++#define WRITE 1 ++ ++static int child_done, alarm_done; ++static char *trigger_dir; ++ ++static void child_handler(int sig) ++{ ++ child_done = 1; ++} ++ ++static void alarm_handler(int sig) ++{ ++ alarm_done = 1; ++} ++ ++void run_trigger(struct event_trigger *t, 
char *argv[], char **env, ++ const char* reporter, char *msg) ++{ ++ pid_t child; ++ char *path, err[256] = {0}, *trigger = t->path; ++ int status, pipe_stdout[2], pipe_stderr[2], timeout = t->timeout; ++ ssize_t byte = 0; ++ ++ log(TERM, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter); ++ ++ if (asprintf(&path, "%s/%s", trigger_dir, trigger) < 0) ++ return; ++ ++ if (pipe(pipe_stdout) == -1) ++ exit(EXIT_FAILURE); ++ ++ if (pipe(pipe_stderr) == -1) ++ exit(EXIT_FAILURE); ++ ++ child = fork(); ++ if (child < 0) { ++ log(TERM, LOG_ERR, "Cannot create process for trigger"); ++ return; ++ } else if (child == 0) { ++ close(pipe_stdout[READ]); ++ close(pipe_stderr[READ]); ++ dup2(pipe_stdout[WRITE], 1); ++ dup2(pipe_stderr[WRITE], 2); ++ close(pipe_stdout[WRITE]); ++ close(pipe_stderr[WRITE]); ++ ++ execve(path, argv, env); ++ exit(EXIT_FAILURE); ++ } ++ ++ signal(SIGCHLD, child_handler); ++ ++ close(pipe_stdout[WRITE]); ++ close(pipe_stderr[WRITE]); ++ ++ if (timeout) { ++ signal(SIGALRM, alarm_handler); ++ alarm(timeout); ++ } ++ ++ pause(); ++ ++ if (child_done) { ++ if (waitpid(child, &status, WNOHANG) == child){ ++ if (WIFEXITED(status) && WEXITSTATUS(status)) ++ log(TERM, LOG_INFO, "Trigger %s exited with status %d\n", ++ trigger, WEXITSTATUS(status)); ++ else if (WIFSIGNALED(status)) ++ log(TERM, LOG_INFO, "Trigger %s killed by signal %d\n", ++ trigger, WTERMSIG(status)); ++ } ++ alarm(0); ++ } else if (alarm_done) { ++ log(TERM, LOG_ERR, "Trigger timeout, kill it\n"); ++ kill(child, SIGKILL); ++ } ++ signal(SIGCHLD, SIG_DFL); ++ signal(SIGALRM, SIG_DFL); ++ ++ byte = read(pipe_stderr[READ], err, 256); ++ if (byte > 0) ++ log(TERM, LOG_ERR, "Trigger stderr: %s\n", err); ++ else if (byte < 0) ++ log(TERM, LOG_ERR, "Trigger error : %s\n", strerror(byte)); ++ ++ byte = read(pipe_stdout[READ], msg, 4096); ++ if (byte < 0) ++ log(TERM, LOG_ERR, "Trigger error : %s\n", strerror(byte)); ++ ++ close(pipe_stdout[READ]); ++ close(pipe_stderr[READ]); 
++} ++ ++int trigger_check(char *s) ++{ ++ char *name; ++ int rc; ++ ++ if (trigger_dir) { ++ if (asprintf(&name, "%s/%s", trigger_dir, s) < 0) ++ return -1; ++ } else ++ name = s; ++ ++ rc = access(name, R_OK|X_OK); ++ ++ if (trigger_dir) ++ free(name); ++ ++ return rc; ++} ++ ++static struct event_trigger *event_triggers[] = { ++ &mc_ce_trigger, ++ &mc_ue_trigger, ++#ifdef HAVE_MCE ++ &mce_ce_trigger, ++ &mce_de_trigger, ++ &mce_ue_trigger, ++#endif ++}; ++ ++void trigger_setup(void) ++{ ++ int i, j; ++ struct event_trigger *trigger; ++ char *s, timeout_env[30]; ++ ++ trigger_dir = getenv("TRIGGER_DIR"); ++ ++ for (i = 0; i < ARRAY_SIZE(event_triggers); i++) { ++ trigger = event_triggers[i]; ++ ++ s = getenv(trigger->env); ++ if (!s || !strcmp(s, "")) ++ continue; ++ ++ trigger->path = s; ++ if (trigger_check(s) < 0) ++ log(SYSLOG, LOG_ERR, "Cannot access trigger `%s`\n", s); ++ else ++ log(SYSLOG, LOG_NOTICE, "Setup %s trigger `%s`\n", ++ trigger->event_name, s); ++ ++ sprintf(timeout_env, "%s_TIMEOUT", trigger->env); ++ ++ trigger->timeout = 1; ++ s = getenv(timeout_env); ++ if (!s || !strcmp(s, "")) { ++ log(SYSLOG, LOG_NOTICE, ++ "Setup %s trigger default timeout 1s", ++ trigger->event_name); ++ continue; ++ } ++ ++ j = atoi(s); ++ if (j < 0) ++ log(SYSLOG, LOG_ERR, ++ "Invalid %s trigger timeout `%d`" ++ "use default value: 1s\n", ++ trigger->event_name, j); ++ else if (j == 0) { ++ log(SYSLOG, LOG_NOTICE, ++ "%s trigger no timeout\n", trigger->event_name); ++ trigger->timeout = 0; ++ } else { ++ log(SYSLOG, LOG_NOTICE, ++ "Setup %s trigger timeout `%d`s\n", ++ trigger->event_name, j); ++ trigger->timeout = j; ++ } ++ } ++} +diff --git a/trigger.h b/trigger.h +new file mode 100644 +index 0000000..8a6e380 +--- /dev/null ++++ b/trigger.h +@@ -0,0 +1,29 @@ ++#ifndef __TRIGGER_H__ ++#define __TRIGGER_H__ ++ ++#include "config.h" ++ ++#define MAX_ENV 30 ++#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) ++ ++struct event_trigger { ++ const char *event_name; ++ 
const char *env; ++ char *path; ++ int timeout; ++}; ++ ++int trigger_check(char *s); ++void run_trigger(struct event_trigger *t, char *argv[], char **env, ++ const char* reporter, char *msg); ++void trigger_setup(void); ++ ++extern struct event_trigger mc_ce_trigger; ++extern struct event_trigger mc_ue_trigger; ++#ifdef HAVE_MCE ++extern struct event_trigger mce_ce_trigger; ++extern struct event_trigger mce_de_trigger; ++extern struct event_trigger mce_ue_trigger; ++#endif ++ ++#endif +-- +2.33.1 + diff --git a/3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch b/3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch new file mode 100644 index 0000000..e2af5ee --- /dev/null +++ b/3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch @@ -0,0 +1,104 @@ +From 248531d736be425ea1a767def8176e04bac3d819 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 12 Dec 2023 10:46:11 +0800 +Subject: [PATCH 2/6] rasdaemon: Don't process Ampere specific error in the + public code + +Ampere specific error info and error handler need to be included in +HAVE_AMP_NS_DECODE macro. 
+ +Signed-off-by: Ruidong Tian +--- + ras-arm-handler.c | 7 +++---- + ras-record.c | 4 ++++ + ras-record.h | 2 ++ + 3 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index 1149dc6..d81daec 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -51,7 +51,6 @@ int ras_arm_event_handler(struct trace_seq *s, + time_t now; + struct tm *tm; + struct ras_arm_event ev; +- int len = 0; + memset(&ev, 0, sizeof(ev)); + + /* +@@ -99,6 +98,9 @@ int ras_arm_event_handler(struct trace_seq *s, + ev.psci_state = val; + trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); + ++#ifdef HAVE_AMP_NS_DECODE ++ int len = 0; ++ + if (pevent_get_field_val(s, event, "pei_len", record, &val, 1) < 0) + return -1; + ev.pei_len = val; +@@ -131,12 +133,9 @@ int ras_arm_event_handler(struct trace_seq *s, + if (!ev.vsei_error) + return -1; + +-#ifdef HAVE_AMP_NS_DECODE + //decode ampere specific error + decode_amp_payload0_err_regs(NULL, s, + (struct amp_payload0_type_sec *)ev.vsei_error); +-#else +- display_raw_data(s, ev.vsei_error, ev.oem_len); + #endif + + /* Insert data into the SGBD */ +diff --git a/ras-record.c b/ras-record.c +index d845f81..04ad094 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -212,9 +212,11 @@ static const struct db_fields arm_event_fields[] = { + { .name="mpidr", .type="INTEGER" }, + { .name="running_state", .type="INTEGER" }, + { .name="psci_state", .type="INTEGER" }, ++#ifdef HAVE_AMP_NS_DECODE + { .name="err_info", .type="BLOB" }, + { .name="context_info", .type="BLOB" }, + { .name="vendor_info", .type="BLOB" }, ++#endif + }; + + static const struct db_table_descriptor arm_event_tab = { +@@ -238,12 +240,14 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) + sqlite3_bind_int64 (priv->stmt_arm_record, 4, ev->mpidr); + sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state); + sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state); ++#ifdef HAVE_AMP_NS_DECODE + 
sqlite3_bind_blob (priv->stmt_arm_record, 7, + ev->pei_error, ev->pei_len, NULL); + sqlite3_bind_blob (priv->stmt_arm_record, 8, + ev->ctx_error, ev->ctx_len, NULL); + sqlite3_bind_blob (priv->stmt_arm_record, 9, + ev->vsei_error, ev->oem_len, NULL); ++#endif + + rc = sqlite3_step(priv->stmt_arm_record); + if (rc != SQLITE_OK && rc != SQLITE_DONE) +diff --git a/ras-record.h b/ras-record.h +index d9f7733..86678b2 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -77,12 +77,14 @@ struct ras_arm_event { + int64_t midr; + int32_t running_state; + int32_t psci_state; ++#ifdef HAVE_AMP_NS_DECODE + const uint8_t *pei_error; + uint32_t pei_len; + const uint8_t *ctx_error; + uint32_t ctx_len; + const uint8_t *vsei_error; + uint32_t oem_len; ++#endif + }; + + struct devlink_event { +-- +2.33.1 + diff --git a/3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch b/3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch new file mode 100644 index 0000000..229e0c4 --- /dev/null +++ b/3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch @@ -0,0 +1,56 @@ +From dce53f6809c4fdab967ecc78f80c8ec2ebd89aca Mon Sep 17 00:00:00 2001 +From: Xiaofei Tan +Date: Wed, 20 Oct 2021 14:33:37 +0800 +Subject: [PATCH 3/6] rasdaemon: Fix the issue of sprintf data type mismatch in + uuid_le() + +The data type of sprintf called in the function uuid_le() is mismatch. +Arm64 compiler force it to unsigned char by default, and can work normally. +But if someone compile it with the option -fsigned-char, the function +can't work correctly. 
+ +Signed-off-by: Xiaofei Tan +Signed-off-by: Mauro Carvalho Chehab +--- + ras-extlog-handler.c | 2 +- + ras-non-standard-handler.c | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c +index 5fd3580..1834687 100644 +--- a/ras-extlog-handler.c ++++ b/ras-extlog-handler.c +@@ -152,7 +152,7 @@ static char *uuid_le(const char *uu) + static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; + + for (i = 0; i < 16; i++) { +- p += sprintf(p, "%.2x", uu[le[i]]); ++ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); + switch (i) { + case 3: + case 5: +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +index 7818ed8..86178bf 100644 +--- a/ras-non-standard-handler.c ++++ b/ras-non-standard-handler.c +@@ -36,7 +36,7 @@ static char *uuid_le(const char *uu) + static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; + + for (i = 0; i < 16; i++) { +- p += sprintf(p, "%.2x", uu[le[i]]); ++ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); + switch (i) { + case 3: + case 5: +@@ -61,7 +61,7 @@ static int uuid_le_cmp(const char *sec_type, const char *uuid2) + 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; + + for (i = 0; i < 16; i++) +- p += sprintf(p, "%.2x", sec_type[le[i]]); ++ p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]); + *p = 0; + return strncmp(uuid1, uuid2, 32); + } +-- +2.33.1 + diff --git a/3004-rasdaemon-ensure-trace_clock-file-exist.patch b/3004-rasdaemon-ensure-trace_clock-file-exist.patch new file mode 100644 index 0000000..eed9c40 --- /dev/null +++ b/3004-rasdaemon-ensure-trace_clock-file-exist.patch @@ -0,0 +1,54 @@ +From 5cfecb69e04d964d4f71f4ccd2a6ce1fc2690f78 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 30 May 2024 19:13:21 +0800 +Subject: [PATCH 4/6] rasdaemon: ensure trace_clock file exist + +Fix https://github.com/mchehab/rasdaemon/issues/74 + +Signed-off-by: Ruidong Tian +--- + ras-events.c | 16 
+++++++++++++++- + 1 file changed, 15 insertions(+), 1 deletion(-) + +diff --git a/ras-events.c b/ras-events.c +index 016f531..544c418 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -98,6 +98,18 @@ static int get_debugfs_dir(char *tracing_dir, size_t len) + return ENOENT; + } + ++static int stat_trace(struct ras_events *ras, char *name) ++{ ++ char fname[MAX_PATH + 1]; ++ struct stat file_info; ++ ++ strcpy(fname, ras->tracing); ++ strcat(fname, "/"); ++ strcat(fname, name); ++ ++ return stat(fname, &file_info); ++} ++ + static int open_trace(struct ras_events *ras, char *name, int flags) + { + char fname[MAX_PATH + 1]; +@@ -619,12 +631,14 @@ static void *handle_ras_events_cpu(void *priv) + static int select_tracing_timestamp(struct ras_events *ras) + { + FILE *fp; +- int fd, rc; ++ int fd, rc, retry = 10; + time_t uptime, now; + size_t size; + unsigned j1; + char buf[4096]; + ++ while (stat_trace(ras, "trace_clock") && retry--); ++ + /* Check if uptime is supported (kernel 3.10-rc1 or upper) */ + fd = open_trace(ras, "trace_clock", O_RDONLY); + if (fd < 0) { +-- +2.33.1 + diff --git a/3006-rasdaemon-disable-ce-offline-default.patch b/3006-rasdaemon-disable-ce-offline-default.patch new file mode 100644 index 0000000..6fec9fd --- /dev/null +++ b/3006-rasdaemon-disable-ce-offline-default.patch @@ -0,0 +1,48 @@ +From ab8f363f4ffcbc49bf700ca0199ff2b8f9bba65a Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 28 Jun 2024 10:06:40 +0800 +Subject: [PATCH] rasdaemon: disable ce offline default + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.env | 8 +++++--- + ras-page-isolation.c | 2 +- + 2 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 9f8e606..1b5403c 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -26,11 +26,13 @@ PAGE_CE_THRESHOLD="50" + # Requires an uptodate kernel. Might not be successfull. + # soft-then-hard First try to soft offline, then try hard offlining. 
+ # Note: default offline choice is "soft". +-PAGE_CE_ACTION="soft" ++PAGE_CE_ACTION="off" + + # Notices script when doing memory offline +-PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" +-PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" ++# PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" ++# PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" ++PAGE_CE_OFFLINE_PRE_NOTICE="" ++PAGE_CE_OFFLINE_POST_NOTICE="" + + # Event Trigger + +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index 193d47c..3c777e6 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -217,7 +217,7 @@ static void page_notice_init(void) + char *pre_re = getenv("PAGE_CE_OFFLINE_PRE_NOTICE"); + char *post_re = getenv("PAGE_CE_OFFLINE_POST_NOTICE"); + +- if (offline <= OFFLINE_ACCOUNT) ++ if (offline <= OFFLINE_ACCOUNT || !pre_re || !post_re) + return; + + snprintf(pre_notice, sizeof(pre_notice), "%s/%s", notice_root, pre_re); +-- +2.33.1 + diff --git a/rasdaemon.spec b/rasdaemon.spec index c21cc6b..f4d29bc 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -47,6 +47,13 @@ Patch2002: 2002-rasdaemon-log-non_standard_event-at-just-one-line.patch Patch2003: 2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch Patch2004: 2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch +Patch3001: 3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch +Patch3002: 3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch +Patch3003: 3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch +Patch3004: 3004-rasdaemon-ensure-trace_clock-file-exist.patch +Patch3006: 3006-rasdaemon-disable-ce-offline-default.patch + + ExcludeArch: s390 s390x BuildRequires: make BuildRequires: gcc @@ -119,13 +126,19 @@ an utility for reporting current error counts from the EDAC sysfs files. 
%patch2003 -p1 %patch2004 -p1 +%patch3001 -p1 +%patch3002 -p1 +%patch3003 -p1 +%patch3004 -p1 +%patch3006 -p1 + autoreconf -vfi %build %ifarch %{arm} aarch64 -%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure --enable-arm --enable-hisi-ns-decode --enable-yitian-ns-decode +%configure --enable-sqlite3 --enable-aer --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure --enable-arm --enable-hisi-ns-decode --enable-yitian-ns-decode --enable-memory-ce-pfa %else -%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure +%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure --enable-memory-ce-pfa %endif make %{?_smp_mflags} @@ -138,7 +151,6 @@ mkdir -p %{buildroot}/%{_sharedstatedir}/rasdaemon install -d -p -m 0755 %{buildroot}/%{_sharedstatedir}/rasdaemon mkdir -p %{buildroot}/%{_sysconfdir}/sysconfig install -D -p -m 0644 misc/rasdaemon.env %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon -sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon %ifarch %{arm} aarch64 install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ @@ -151,17 +163,19 @@ install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notice %{_mandir}/*/* %{_unitdir}/*.service %{_sharedstatedir}/rasdaemon -%{_sysconfdir}/ras/dimm_labels.d -%{_sysconfdir}/sysconfig/rasdaemon -%ifarch %{arm} aarch64 +%{_sysconfdir}/ras/dimm_labels.d/ %config(noreplace) %{_sysconfdir}/sysconfig/%{name} +%ifarch %{arm} aarch64 %config(noreplace) %{_sysconfdir}/rasdaemon_notices/* 
%endif +%config(noreplace) %{_sysconfdir}/ras/triggers/* %changelog * Thu Dec 05 2024 Bixuan Cui - 0.6.7-15.0.1 - rasdaemon: add notification support when page goes offline for Memory Corrected Error - rasdaemon: add decoder to decode yitian ns error (tianruidong@linux.alibaba.com) +- rasdaemon: add mce and mc trigger (tianruidong@linux.alibaba.com) +- rasdaemon: AMD mce record just print one line (tianruidong@linux.alibaba.com) * Thu Jul 18 2024 Aristeu Rozanski 0.6.7-14 - rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819] -- Gitee