diff --git a/3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch b/3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch new file mode 100644 index 0000000000000000000000000000000000000000..908333a04dc67404fe1fa8b989f4165709a4e758 --- /dev/null +++ b/3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch @@ -0,0 +1,660 @@ +From 0fd49ba8f1af285c7f607b3c8a669942631fd259 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 7 Jun 2024 11:26:06 +0800 +Subject: [PATCH 1/6] rasdaemon: add mc_event and mce_record trigger + +Allow users to run a trigger when mc_event and mce_record occur. The +trigger is separated into CE trigger and UE trigger, this is because +CE is more frequent than UE, and the CE trigger will lead to more +performance hits. Users can choose different triggers for CE/UE to +reduce this effect. + +To prevent triggering hangs or consuming excessive time, there is a +default timeout of 1s, trigger will be killed if timeout, user can +modify timeout by setting environment *_TIMEOUT or delete timeout by +setting *_TIMEOUT to 0. + +Environment of trigger in /etc/sysconfig/rasdaemon: + +TRIGGER_DIR: The trigger directory + +MC_CE_TRIGGER: The script executed when corrected mc_event occurs. +MC_CE_TRIGGER_TIMEOUT: Timeout(seconds) for MC_CE_TRIGGER, set 0 to +delete timeout. +MC_UE_TRIGGER: The script executed when uncorrected mc_event occurs. +MC_UE_TRIGGER_TIMEOUT: Timeout(seconds) for MC_UE_TRIGGER, set 0 to +delete timeout. + +MCE_CE_TRIGGER: The script executed when corrected mce_record occurs. +MCE_CE_TRIGGER_TIMEOUT: Timeout(seconds) for MCE_CE_TRIGGER, set 0 to +delete timeout. +MCE_UE_TRIGGER: The script executed when uncorrected mce_record occurs. +MCE_UE_TRIGGER_TIMEOUT: Timeout(seconds) for MCE_UE_TRIGGER, set 0 to +delete timeout. + +No script will be executed if *_CE_TRIGGER/*_UE_TRIGGER is null. 
+ +Signed-off-by: Ruidong Tian +--- + Makefile.am | 8 +- + contrib/mc_event_trigger | 24 +++++ + contrib/mce_record_trigger | 36 ++++++++ + misc/rasdaemon.env | 31 +++++++ + ras-events.c | 3 + + ras-mc-handler.c | 63 +++++++++++++ + ras-mce-handler.c | 89 ++++++++++++++++++ + trigger.c | 184 +++++++++++++++++++++++++++++++++++++ + trigger.h | 29 ++++++ + 9 files changed, 463 insertions(+), 4 deletions(-) + create mode 100755 contrib/mc_event_trigger + create mode 100755 contrib/mce_record_trigger + create mode 100644 trigger.c + create mode 100644 trigger.h + +diff --git a/Makefile.am b/Makefile.am +index f410c6d..2e4fe39 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -17,7 +17,7 @@ all-local: $(SYSTEMD_SERVICES) + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +- bitfield.c ++ bitfield.c trigger.c + if WITH_SQLITE3 + rasdaemon_SOURCES += ras-record.c + endif +@@ -74,7 +74,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ +- non-standard-yitian.h ++ non-standard-yitian.h trigger.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +@@ -101,6 +101,6 @@ upload: + # custom target + install-data-local: + $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" +-if WITH_MEMORY_CE_PFA ++ $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers" + $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" +-endif ++ $(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/" +diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger +new file mode 100755 +index 0000000..5c6ccfa +--- /dev/null ++++ b/contrib/mc_event_trigger +@@ -0,0 +1,24 @@ 
++#!/bin/sh ++# This shell script can be executed by rasdaemon in daemon mode when an ++# mc_event occurs, environment variables include all information ++# reported by tracepoint. ++# ++# environment: ++# TIMESTAMP Timestamp when error occurred ++# COUNT Number of errors of the same type ++# TYPE Error type from Corrected/Uncorrected ++# MESSAGE Error message ++# LABEL Label of the affected DIMM(s) ++# MC_INDEX DIMM identifier from DMI/SMBIOS if available ++# TOP_LAYER Top layer of the error ++# MIDDLE_LAYER Middle layer of the error ++# LOWER_LAYER Low layer of the error ++# ADDRESS Error address ++# GRAIN Minimum granularity for an error report, in bytes ++# SYNDROME Syndrome of the error (or 0 if unknown or if the syndrome is not applicable) ++# DRIVER_DETAIL Other driver-specific detail about the error ++# ++ ++[ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local ++ ++exit 0 +diff --git a/contrib/mce_record_trigger b/contrib/mce_record_trigger +new file mode 100755 +index 0000000..06a52d9 +--- /dev/null ++++ b/contrib/mce_record_trigger +@@ -0,0 +1,36 @@ ++#!/bin/sh ++# This shell script can be executed by rasdaemon in daemon mode when an ++# mce_record event occurs, environment variables include all information ++# reported by tracepoint. 
++# ++# environment: ++# MCGCAP MCGCAP MSR: machine check capabilities of CPU ++# MCGSTATUS Machine Check Global Status MSR ++# STATUS Bank's MCi_STATUS MSR ++# ADDR Bank's MCi_ADDR MSR ++# MISC Bank's MCi_MISC MSR ++# IP Instruction Pointer when the error happened ++# TSC CPU time stamp counter ++# WALLTIME Wall time_t when error was detected ++# CPU CPU number; obsoleted by extcpu ++# CPUID CPUID 1 EAX ++# APICID CPU initial APIC ID ++# SOCKETID CPU socket ID ++# CS Code segment ++# BANK Machine check bank reporting the error ++# CPUVENDOR Kernel's X86_VENDOR enum ++# SYND MCA_SYND MSR: only valid on SMCA systems ++# IPID MCA_IPID MSR: only valid on SMCA systems ++# TIMESTAMP Rasdaemon timestamp ++# BANK_NAME Decoded bank name ++# ERROR_MSG Vendor-defined error message ++# MCGSTATUS_MSG Decode mcgstatus ++# MCISTATUS_MSG Decode mcistatus ++# MCASTATUS_MSG Decode mcastatus ++# USER_ACTION Recommendations for actions users should take ++# MC_LOCATION Error location in MC ++# ++ ++[ -x ./mce_record_trigger.local ] && . ./mce_record_trigger.local ++ ++exit 0 +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 713875a..9f8e606 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -31,3 +31,34 @@ PAGE_CE_ACTION="soft" + # Notices script when doing memory offline + PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" + PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" ++ ++# Event Trigger ++ ++# Event trigger will be executed when the specified event occurs. ++# ++# Execute triggers path ++# For example: TRIGGER_DIR=/etc/ras/triggers ++TRIGGER_DIR= ++ ++# Execute these triggers when an mc_event occurs, the triggers will not ++# be executed if the trigger is not specified. ++# You can set timeout for trigger, trigger thread will be killed if timeout. ++# The default timeout is 1, if you do not want any timeout, set it to 0. 
++# For example: ++# MC_CE_TRIGGER=mc_event_trigger ++# MC_UE_TRIGGER=mc_event_trigger ++# MC_CE_TRIGGER_TIMEOUT=1 ++# MC_UE_TRIGGER_TIMEOUT=1 ++ ++# trigger for mc_event ++MC_CE_TRIGGER= ++MC_UE_TRIGGER= ++MC_CE_TRIGGER_TIMEOUT=0 ++MC_UE_TRIGGER_TIMEOUT=0 ++ ++# trigger for mce_record ++MCE_CE_TRIGGER= ++MCE_UE_TRIGGER= ++MCE_CE_TRIGGER_TIMEOUT=0 ++MCE_UE_TRIGGER_TIMEOUT=0 ++ +diff --git a/ras-events.c b/ras-events.c +index fe4bd26..016f531 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -41,6 +41,7 @@ + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-page-isolation.h" ++#include "trigger.h" + + /* + * Polling time, if read() doesn't block. Currently, trace_pipe_raw never +@@ -815,6 +816,8 @@ int handle_ras_events(int record_events) + ras_page_account_init(); + #endif + ++ trigger_setup(); ++ + rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event", + ras_mc_event_handler, NULL, MC_EVENT); + if (!rc) +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index 42b05cd..0081d95 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -15,16 +15,73 @@ + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ ++#define _GNU_SOURCE + #include + #include + #include + #include + #include "libtrace/kbuffer.h" ++#include + #include "ras-mc-handler.h" + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-page-isolation.h" + #include "ras-report.h" ++#include "trigger.h" ++ ++struct event_trigger mc_ce_trigger = {"mc_event", "MC_CE_TRIGGER"}; ++struct event_trigger mc_ue_trigger = {"mc_event", "MC_UE_TRIGGER"}; ++ ++static void run_mc_trigger(struct ras_mc_event *ev, ++ struct event_trigger *trigger, ++ struct trace_seq *s) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0, i; ++ char msg[4096]; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: 
"/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "COUNT=%d", ev->error_count) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TYPE=%s", ev->error_type) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MESSAGE=%s", ev->msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "LABEL=%s", ev->label) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MC_INDEX=%d", ev->mc_index) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TOP_LAYER=%d", ev->top_layer) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MIDDLE_LAYER=%d", ev->middle_layer) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "LOWER_LAYER=%d", ev->lower_layer) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ADDRESS=%llx", ev->address) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "GRAIN=%lld", ev->grain) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "SYNDROME=%llx", ev->syndrome) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DRIVER_DETAIL=%s", ev->driver_detail) < 0) ++ goto free; ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env, "mc_event", msg); ++ ++ trace_seq_printf(s, " %s", msg); ++ ++free: ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++} + + int ras_mc_event_handler(struct trace_seq *s, + struct pevent_record *record, +@@ -195,6 +252,12 @@ int ras_mc_event_handler(struct trace_seq *s, + ras_report_mc_event(ras, &ev); + #endif + ++ if (!strcmp(ev.error_type, "Corrected")) ++ run_mc_trigger(&ev, &mc_ce_trigger, s); ++ ++ if (!strcmp(ev.error_type, "Uncorrected")) ++ run_mc_trigger(&ev, &mc_ue_trigger, s); ++ + return 0; + + parse_error: +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 805004a..ac2c4a1 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -15,6 +15,7 @@ + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ ++#define _GNU_SOURCE + #include + 
#include + #include +@@ -22,11 +23,13 @@ + #include + #include + #include ++#include + #include "libtrace/kbuffer.h" + #include "ras-mce-handler.h" + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-report.h" ++#include "trigger.h" + + /* + * The code below were adapted from Andi Kleen/Intel/SuSe mcelog code, +@@ -233,6 +236,85 @@ ret: + return ret; + } + ++struct event_trigger mce_ce_trigger = {"mce_record", "MCE_CE_TRIGGER"}; ++struct event_trigger mce_de_trigger = {"mce_record", "MCE_DE_TRIGGER"}; ++struct event_trigger mce_ue_trigger = {"mce_record", "MCE_UE_TRIGGER"}; ++ ++static void run_mce_trigger(struct mce_event *e, ++ struct event_trigger *trigger, ++ struct trace_seq *s) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0, i; ++ char msg[4096]; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCGCAP=%#lx", e->mcgcap) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCGSTATUS=%#lx", e->mcgstatus) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "STATUS=%#lx", e->status) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ADDR=%#lx", e->addr) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MISC=%#lx", e->misc) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "IP=%#lx", e->ip) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TSC=%#lx", e->tsc) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "WALLTIME=%#lx", e->walltime) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CPU=%#x", e->cpu) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CPUID=%#x", e->cpuid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "APICID=%#x", e->apicid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "SOCKETID=%#x", e->socketid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CS=%#x", e->cs) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BANK=%#x", e->bank) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CPUVENDOR=%#x", e->cpuvendor) 
< 0) ++ goto free; ++ if (asprintf(&env[ei++], "SYND=%#lx", e->synd) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "IPID=%#lx", e->ipid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TIMESTAMP=%s", e->timestamp) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BANK_NAME=%s", e->bank_name) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ERROR_MSG=%s", e->error_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCGSTATUS_MSG=%s", e->mcgstatus_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCISTATUS_MSG=%s", e->mcistatus_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCASTATUS_MSG=%s", e->mcastatus_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "USER_ACTION=%s", e->user_action) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MC_LOCATION=%s", e->mc_location) < 0) ++ goto free; ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env, "mce_record", msg); ++ ++ trace_seq_printf(s, " %s", msg); ++ ++free: ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++} ++ + int register_mce_handler(struct ras_events *ras, unsigned ncpus) + { + int rc; +@@ -480,5 +562,12 @@ int ras_mce_event_handler(struct trace_seq *s, + ras_report_mce_event(ras, &e); + #endif + ++ if (e.status & MCI_STATUS_UC) ++ run_mce_trigger(&e, &mce_ue_trigger, s); ++ else if (e.status & MCI_STATUS_DEFERRED) ++ run_mce_trigger(&e, &mce_de_trigger, s); ++ else ++ run_mce_trigger(&e, &mce_ce_trigger, s); ++ + return 0; + } +diff --git a/trigger.c b/trigger.c +new file mode 100644 +index 0000000..8716f50 +--- /dev/null ++++ b/trigger.c +@@ -0,0 +1,184 @@ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++#include ++#include "ras-logger.h" ++#include "trigger.h" ++#include ++ ++#define READ 0 ++#define WRITE 1 ++ ++static int child_done, alarm_done; ++static char *trigger_dir; ++ ++static void child_handler(int sig) ++{ ++ child_done = 1; ++} ++ ++static void alarm_handler(int sig) ++{ ++ alarm_done = 1; ++} ++ ++void run_trigger(struct event_trigger *t, 
char *argv[], char **env, ++ const char* reporter, char *msg) ++{ ++ pid_t child; ++ char *path, err[256] = {0}, *trigger = t->path; ++ int status, pipe_stdout[2], pipe_stderr[2], timeout = t->timeout; ++ ssize_t byte = 0; ++ ++ log(TERM, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter); ++ ++ if (asprintf(&path, "%s/%s", trigger_dir, trigger) < 0) ++ return; ++ ++ if (pipe(pipe_stdout) == -1) ++ exit(EXIT_FAILURE); ++ ++ if (pipe(pipe_stderr) == -1) ++ exit(EXIT_FAILURE); ++ ++ child = fork(); ++ if (child < 0) { ++ log(TERM, LOG_ERR, "Cannot create process for trigger"); ++ return; ++ } else if (child == 0) { ++ close(pipe_stdout[READ]); ++ close(pipe_stderr[READ]); ++ dup2(pipe_stdout[WRITE], 1); ++ dup2(pipe_stderr[WRITE], 2); ++ close(pipe_stdout[WRITE]); ++ close(pipe_stderr[WRITE]); ++ ++ execve(path, argv, env); ++ exit(EXIT_FAILURE); ++ } ++ ++ signal(SIGCHLD, child_handler); ++ ++ close(pipe_stdout[WRITE]); ++ close(pipe_stderr[WRITE]); ++ ++ if (timeout) { ++ signal(SIGALRM, alarm_handler); ++ alarm(timeout); ++ } ++ ++ pause(); ++ ++ if (child_done) { ++ if (waitpid(child, &status, WNOHANG) == child){ ++ if (WIFEXITED(status) && WEXITSTATUS(status)) ++ log(TERM, LOG_INFO, "Trigger %s exited with status %d\n", ++ trigger, WEXITSTATUS(status)); ++ else if (WIFSIGNALED(status)) ++ log(TERM, LOG_INFO, "Trigger %s killed by signal %d\n", ++ trigger, WTERMSIG(status)); ++ } ++ alarm(0); ++ } else if (alarm_done) { ++ log(TERM, LOG_ERR, "Trigger timeout, kill it\n"); ++ kill(child, SIGKILL); ++ } ++ signal(SIGCHLD, SIG_DFL); ++ signal(SIGALRM, SIG_DFL); ++ ++ byte = read(pipe_stderr[READ], err, 256); ++ if (byte > 0) ++ log(TERM, LOG_ERR, "Trigger stderr: %s\n", err); ++ else if (byte < 0) ++ log(TERM, LOG_ERR, "Trigger error : %s\n", strerror(byte)); ++ ++ byte = read(pipe_stdout[READ], msg, 4096); ++ if (byte < 0) ++ log(TERM, LOG_ERR, "Trigger error : %s\n", strerror(byte)); ++ ++ close(pipe_stdout[READ]); ++ close(pipe_stderr[READ]); 
++} ++ ++int trigger_check(char *s) ++{ ++ char *name; ++ int rc; ++ ++ if (trigger_dir) { ++ if (asprintf(&name, "%s/%s", trigger_dir, s) < 0) ++ return -1; ++ } else ++ name = s; ++ ++ rc = access(name, R_OK|X_OK); ++ ++ if (trigger_dir) ++ free(name); ++ ++ return rc; ++} ++ ++static struct event_trigger *event_triggers[] = { ++ &mc_ce_trigger, ++ &mc_ue_trigger, ++#ifdef HAVE_MCE ++ &mce_ce_trigger, ++ &mce_de_trigger, ++ &mce_ue_trigger, ++#endif ++}; ++ ++void trigger_setup(void) ++{ ++ int i, j; ++ struct event_trigger *trigger; ++ char *s, timeout_env[30]; ++ ++ trigger_dir = getenv("TRIGGER_DIR"); ++ ++ for (i = 0; i < ARRAY_SIZE(event_triggers); i++) { ++ trigger = event_triggers[i]; ++ ++ s = getenv(trigger->env); ++ if (!s || !strcmp(s, "")) ++ continue; ++ ++ trigger->path = s; ++ if (trigger_check(s) < 0) ++ log(SYSLOG, LOG_ERR, "Cannot access trigger `%s`\n", s); ++ else ++ log(SYSLOG, LOG_NOTICE, "Setup %s trigger `%s`\n", ++ trigger->event_name, s); ++ ++ sprintf(timeout_env, "%s_TIMEOUT", trigger->env); ++ ++ trigger->timeout = 1; ++ s = getenv(timeout_env); ++ if (!s || !strcmp(s, "")) { ++ log(SYSLOG, LOG_NOTICE, ++ "Setup %s trigger default timeout 1s", ++ trigger->event_name); ++ continue; ++ } ++ ++ j = atoi(s); ++ if (j < 0) ++ log(SYSLOG, LOG_ERR, ++ "Invalid %s trigger timeout `%d`" ++ "use default value: 1s\n", ++ trigger->event_name, j); ++ else if (j == 0) { ++ log(SYSLOG, LOG_NOTICE, ++ "%s trigger no timeout\n", trigger->event_name); ++ trigger->timeout = 0; ++ } else { ++ log(SYSLOG, LOG_NOTICE, ++ "Setup %s trigger timeout `%d`s\n", ++ trigger->event_name, j); ++ trigger->timeout = j; ++ } ++ } ++} +diff --git a/trigger.h b/trigger.h +new file mode 100644 +index 0000000..8a6e380 +--- /dev/null ++++ b/trigger.h +@@ -0,0 +1,29 @@ ++#ifndef __TRIGGER_H__ ++#define __TRIGGER_H__ ++ ++#include "config.h" ++ ++#define MAX_ENV 30 ++#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) ++ ++struct event_trigger { ++ const char *event_name; ++ 
const char *env; ++ char *path; ++ int timeout; ++}; ++ ++int trigger_check(char *s); ++void run_trigger(struct event_trigger *t, char *argv[], char **env, ++ const char* reporter, char *msg); ++void trigger_setup(void); ++ ++extern struct event_trigger mc_ce_trigger; ++extern struct event_trigger mc_ue_trigger; ++#ifdef HAVE_MCE ++extern struct event_trigger mce_ce_trigger; ++extern struct event_trigger mce_de_trigger; ++extern struct event_trigger mce_ue_trigger; ++#endif ++ ++#endif +-- +2.33.1 + diff --git a/3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch b/3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch new file mode 100644 index 0000000000000000000000000000000000000000..e2af5ee0fa147cae8fe383c2438d8664457ff78d --- /dev/null +++ b/3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch @@ -0,0 +1,104 @@ +From 248531d736be425ea1a767def8176e04bac3d819 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 12 Dec 2023 10:46:11 +0800 +Subject: [PATCH 2/6] rasdaemon: Do't process Ampere specific error in the + public code + +Ampere specific error info and error handler need to included in +HAVE_AMP_NS_DECODE macro. 
+ +Signed-off-by: Ruidong Tian +--- + ras-arm-handler.c | 7 +++---- + ras-record.c | 4 ++++ + ras-record.h | 2 ++ + 3 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index 1149dc6..d81daec 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -51,7 +51,6 @@ int ras_arm_event_handler(struct trace_seq *s, + time_t now; + struct tm *tm; + struct ras_arm_event ev; +- int len = 0; + memset(&ev, 0, sizeof(ev)); + + /* +@@ -99,6 +98,9 @@ int ras_arm_event_handler(struct trace_seq *s, + ev.psci_state = val; + trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); + ++#ifdef HAVE_AMP_NS_DECODE ++ int len = 0; ++ + if (pevent_get_field_val(s, event, "pei_len", record, &val, 1) < 0) + return -1; + ev.pei_len = val; +@@ -131,12 +133,9 @@ int ras_arm_event_handler(struct trace_seq *s, + if (!ev.vsei_error) + return -1; + +-#ifdef HAVE_AMP_NS_DECODE + //decode ampere specific error + decode_amp_payload0_err_regs(NULL, s, + (struct amp_payload0_type_sec *)ev.vsei_error); +-#else +- display_raw_data(s, ev.vsei_error, ev.oem_len); + #endif + + /* Insert data into the SGBD */ +diff --git a/ras-record.c b/ras-record.c +index d845f81..04ad094 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -212,9 +212,11 @@ static const struct db_fields arm_event_fields[] = { + { .name="mpidr", .type="INTEGER" }, + { .name="running_state", .type="INTEGER" }, + { .name="psci_state", .type="INTEGER" }, ++#ifdef HAVE_AMP_NS_DECODE + { .name="err_info", .type="BLOB" }, + { .name="context_info", .type="BLOB" }, + { .name="vendor_info", .type="BLOB" }, ++#endif + }; + + static const struct db_table_descriptor arm_event_tab = { +@@ -238,12 +240,14 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) + sqlite3_bind_int64 (priv->stmt_arm_record, 4, ev->mpidr); + sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state); + sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state); ++#ifdef HAVE_AMP_NS_DECODE + 
sqlite3_bind_blob (priv->stmt_arm_record, 7, + ev->pei_error, ev->pei_len, NULL); + sqlite3_bind_blob (priv->stmt_arm_record, 8, + ev->ctx_error, ev->ctx_len, NULL); + sqlite3_bind_blob (priv->stmt_arm_record, 9, + ev->vsei_error, ev->oem_len, NULL); ++#endif + + rc = sqlite3_step(priv->stmt_arm_record); + if (rc != SQLITE_OK && rc != SQLITE_DONE) +diff --git a/ras-record.h b/ras-record.h +index d9f7733..86678b2 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -77,12 +77,14 @@ struct ras_arm_event { + int64_t midr; + int32_t running_state; + int32_t psci_state; ++#ifdef HAVE_AMP_NS_DECODE + const uint8_t *pei_error; + uint32_t pei_len; + const uint8_t *ctx_error; + uint32_t ctx_len; + const uint8_t *vsei_error; + uint32_t oem_len; ++#endif + }; + + struct devlink_event { +-- +2.33.1 + diff --git a/3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch b/3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch new file mode 100644 index 0000000000000000000000000000000000000000..229e0c4582e36006edc170cbb106edca5588d33f --- /dev/null +++ b/3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch @@ -0,0 +1,56 @@ +From dce53f6809c4fdab967ecc78f80c8ec2ebd89aca Mon Sep 17 00:00:00 2001 +From: Xiaofei Tan +Date: Wed, 20 Oct 2021 14:33:37 +0800 +Subject: [PATCH 3/6] rasdaemon: Fix the issue of sprintf data type mismatch in + uuid_le() + +The data type of sprintf called in the function uuid_le() is mismatch. +Arm64 compiler force it to unsigned char by default, and can work normally. +But if someone compile it with the option -fsigned-char, the function +can't work correctly. 
+ +Signed-off-by: Xiaofei Tan +Signed-off-by: Mauro Carvalho Chehab +--- + ras-extlog-handler.c | 2 +- + ras-non-standard-handler.c | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c +index 5fd3580..1834687 100644 +--- a/ras-extlog-handler.c ++++ b/ras-extlog-handler.c +@@ -152,7 +152,7 @@ static char *uuid_le(const char *uu) + static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; + + for (i = 0; i < 16; i++) { +- p += sprintf(p, "%.2x", uu[le[i]]); ++ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); + switch (i) { + case 3: + case 5: +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +index 7818ed8..86178bf 100644 +--- a/ras-non-standard-handler.c ++++ b/ras-non-standard-handler.c +@@ -36,7 +36,7 @@ static char *uuid_le(const char *uu) + static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; + + for (i = 0; i < 16; i++) { +- p += sprintf(p, "%.2x", uu[le[i]]); ++ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); + switch (i) { + case 3: + case 5: +@@ -61,7 +61,7 @@ static int uuid_le_cmp(const char *sec_type, const char *uuid2) + 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; + + for (i = 0; i < 16; i++) +- p += sprintf(p, "%.2x", sec_type[le[i]]); ++ p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]); + *p = 0; + return strncmp(uuid1, uuid2, 32); + } +-- +2.33.1 + diff --git a/3004-rasdaemon-ensure-trace_clock-file-exist.patch b/3004-rasdaemon-ensure-trace_clock-file-exist.patch new file mode 100644 index 0000000000000000000000000000000000000000..eed9c40ee38186d0effe8d500aa9031312ad0f50 --- /dev/null +++ b/3004-rasdaemon-ensure-trace_clock-file-exist.patch @@ -0,0 +1,54 @@ +From 5cfecb69e04d964d4f71f4ccd2a6ce1fc2690f78 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 30 May 2024 19:13:21 +0800 +Subject: [PATCH 4/6] rasdaemon: ensure trace_clock file exist + +Fix 
https://github.com/mchehab/rasdaemon/issues/74 + +Signed-off-by: Ruidong Tian +--- + ras-events.c | 16 +++++++++++++++- + 1 file changed, 15 insertions(+), 1 deletion(-) + +diff --git a/ras-events.c b/ras-events.c +index 016f531..544c418 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -98,6 +98,18 @@ static int get_debugfs_dir(char *tracing_dir, size_t len) + return ENOENT; + } + ++static int stat_trace(struct ras_events *ras, char *name) ++{ ++ char fname[MAX_PATH + 1]; ++ struct stat file_info; ++ ++ strcpy(fname, ras->tracing); ++ strcat(fname, "/"); ++ strcat(fname, name); ++ ++ return stat(fname, &file_info); ++} ++ + static int open_trace(struct ras_events *ras, char *name, int flags) + { + char fname[MAX_PATH + 1]; +@@ -619,12 +631,14 @@ static void *handle_ras_events_cpu(void *priv) + static int select_tracing_timestamp(struct ras_events *ras) + { + FILE *fp; +- int fd, rc; ++ int fd, rc, retry = 10; + time_t uptime, now; + size_t size; + unsigned j1; + char buf[4096]; + ++ while (stat_trace(ras, "trace_clock") && retry--); ++ + /* Check if uptime is supported (kernel 3.10-rc1 or upper) */ + fd = open_trace(ras, "trace_clock", O_RDONLY); + if (fd < 0) { +-- +2.33.1 + diff --git a/3005-rasdaemon-mce_record-print-just-one-line-on-AMD.patch b/3005-rasdaemon-mce_record-print-just-one-line-on-AMD.patch new file mode 100644 index 0000000000000000000000000000000000000000..6b9435d3c2bf4c6ae42971c55e0b3b06e3c959b4 --- /dev/null +++ b/3005-rasdaemon-mce_record-print-just-one-line-on-AMD.patch @@ -0,0 +1,33 @@ +From 5befe6d0f28971c8bde2302b535c80957718ef30 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 6 Jun 2024 14:22:05 +0800 +Subject: [PATCH 5/6] rasdaemon: mce_record print just one line on AMD + +AMD SMCA will print 2 line for mce_record, like: + + <...>-1106 [002] 0.010002: mce_record: ... + Memory Error 'mem-tx... 
+ +Delete '\n' in amd smca decoder to print just oneline + +Signed-off-by: Ruidong Tian +--- + mce-amd-smca.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 7c619fd..8291c3a 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -481,7 +481,7 @@ static void decode_smca_error(struct mce_event *e) + /* Only print the descriptor of valid extended error code */ + if (xec < smca_mce_descs[bank_type].num_descs) + mce_snprintf(e->mcastatus_msg, +- " %s.\n", smca_mce_descs[bank_type].descs[xec]); ++ " %s. ", smca_mce_descs[bank_type].descs[xec]); + + if (bank_type == SMCA_UMC && xec == 0) { + channel = find_umc_channel(e); +-- +2.33.1 + diff --git a/3006-rasdaemon-disable-ce-offline-default.patch b/3006-rasdaemon-disable-ce-offline-default.patch new file mode 100644 index 0000000000000000000000000000000000000000..6fec9fd1a55e7a122cf88f19eb34da58e9d89952 --- /dev/null +++ b/3006-rasdaemon-disable-ce-offline-default.patch @@ -0,0 +1,48 @@ +From ab8f363f4ffcbc49bf700ca0199ff2b8f9bba65a Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 28 Jun 2024 10:06:40 +0800 +Subject: [PATCH] rasdaemon: disable ce offline default + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.env | 8 +++++--- + ras-page-isolation.c | 2 +- + 2 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 9f8e606..1b5403c 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -26,11 +26,13 @@ PAGE_CE_THRESHOLD="50" + # Requires an uptodate kernel. Might not be successfull. + # soft-then-hard First try to soft offline, then try hard offlining. + # Note: default offline choice is "soft". 
+-PAGE_CE_ACTION="soft" ++PAGE_CE_ACTION="off" + + # Notices script when doing memory offline +-PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" +-PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" ++# PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" ++# PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" ++PAGE_CE_OFFLINE_PRE_NOTICE="" ++PAGE_CE_OFFLINE_POST_NOTICE="" + + # Event Trigger + +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index 193d47c..3c777e6 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -217,7 +217,7 @@ static void page_notice_init(void) + char *pre_re = getenv("PAGE_CE_OFFLINE_PRE_NOTICE"); + char *post_re = getenv("PAGE_CE_OFFLINE_POST_NOTICE"); + +- if (offline <= OFFLINE_ACCOUNT) ++ if (offline <= OFFLINE_ACCOUNT || !pre_re || !post_re) + return; + + snprintf(pre_notice, sizeof(pre_notice), "%s/%s", notice_root, pre_re); +-- +2.33.1 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 9f6dca4f02ff88ea82e78d048406dfca2290ae02..8818fa1c05f42f0771debca7336bac090dc8cc72 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,7 +1,7 @@ %define anolis_release .0.1 Name: rasdaemon Version: 0.6.7 -Release: 9%{?dist} +Release: 10%{?dist} Summary: Utility to receive RAS error tracings License: GPLv2 URL: http://git.infradead.org/users/mchehab/rasdaemon.git @@ -35,6 +35,14 @@ Patch2002: 2002-rasdaemon-log-non_standard_event-at-just-one-line.patch Patch2003: 2003-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch Patch2004: 2004-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch +Patch3001: 3001-rasdaemon-add-mc_event-and-mce_record-trigger.patch +Patch3002: 3002-rasdaemon-Do-t-process-Ampere-specific-error-in-the-.patch +Patch3003: 3003-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch +Patch3004: 3004-rasdaemon-ensure-trace_clock-file-exist.patch +Patch3005: 3005-rasdaemon-mce_record-print-just-one-line-on-AMD.patch +Patch3006: 
3006-rasdaemon-disable-ce-offline-default.patch + + ExcludeArch: s390 s390x BuildRequires: make BuildRequires: gcc @@ -78,13 +86,20 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch2003 -p1 %patch2004 -p1 +%patch3001 -p1 +%patch3002 -p1 +%patch3003 -p1 +%patch3004 -p1 +%patch3005 -p1 +%patch3006 -p1 + autoreconf -vfi %build %ifarch %{arm} aarch64 -%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure --enable-arm --enable-hisi-ns-decode --enable-yitian-ns-decode +%configure --enable-sqlite3 --enable-aer --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure --enable-arm --enable-hisi-ns-decode --enable-yitian-ns-decode --enable-memory-ce-pfa %else -%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure +%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-memory-failure --enable-memory-ce-pfa %endif make %{?_smp_mflags} @@ -97,7 +112,6 @@ mkdir -p %{buildroot}/%{_sharedstatedir}/rasdaemon install -d -p -m 0755 %{buildroot}/%{_sharedstatedir}/rasdaemon mkdir -p %{buildroot}/%{_sysconfdir}/sysconfig install -D -p -m 0644 misc/rasdaemon.env %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon -sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon %ifarch %{arm} aarch64 install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ @@ -110,14 +124,18 @@ install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notice %{_mandir}/*/* %{_unitdir}/*.service %{_sharedstatedir}/rasdaemon -%{_sysconfdir}/ras/dimm_labels.d 
-%{_sysconfdir}/sysconfig/rasdaemon -%ifarch %{arm} aarch64 +%{_sysconfdir}/ras/dimm_labels.d/ %config(noreplace) %{_sysconfdir}/sysconfig/%{name} +%ifarch %{arm} aarch64 %config(noreplace) %{_sysconfdir}/rasdaemon_notices/* %endif +%config(noreplace) %{_sysconfdir}/ras/triggers/* %changelog +* Tue Jul 02 2024 Ruidong Tian - 0.6.7-10 +- rasdaemon: add mce and mc trigger +- rasdaemon: AMD mce record just print one line + * Thu Sep 02 2023 Ruidong Tian - 0.6.7-9 - rasdaemon: add decoder to decode yitian ns error