diff --git a/1086-ras_event-fix-used-for-uninitialize-variable.patch b/1086-ras_event-fix-used-for-uninitialize-variable.patch new file mode 100644 index 0000000000000000000000000000000000000000..d9981f248c69f518c259f6dc5d42db47b4c32588 --- /dev/null +++ b/1086-ras_event-fix-used-for-uninitialize-variable.patch @@ -0,0 +1,96 @@ +From c12de7d934986a8afd61b51ca86b1745ffa45682 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Mon, 10 Mar 2025 11:28:18 +0800 +Subject: [PATCH 1/5] ras_event: fix used for uninitialize variable + +Signed-off-by: Ruidong Tian +--- + ras-aer-handler.c | 2 +- + ras-arm-handler.c | 2 +- + ras-mc-handler.c | 2 +- + ras-mce-handler.c | 2 +- + ras-memory-failure-handler.c | 2 +- + ras-signal-handler.c | 2 +- + 6 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index d0eb4df..dd96201 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -69,7 +69,7 @@ int ras_aer_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_aer_event ev; ++ struct ras_aer_event ev = { 0 }; + char buf[BUF_LEN]; + char ipmi_add_sel[105]; + uint8_t sel_data[5]; +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index 34f29cd..6b763d1 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -158,7 +158,7 @@ int ras_arm_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_arm_event ev; ++ struct ras_arm_event ev = { 0 }; + int len = 0; + + memset(&ev, 0, sizeof(ev)); +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index bb93c9d..48fa744 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -41,7 +41,7 @@ int ras_mc_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_mc_event ev; ++ struct ras_mc_event ev = { 0 }; + int parsed_fields = 0; + + /* +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 
e53854d..2fe769a 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -495,7 +495,7 @@ int ras_mce_event_handler(struct trace_seq *s, + unsigned long long val; + struct ras_events *ras = context; + struct mce_priv *mce = ras->mce_priv; +- struct mce_event e; ++ struct mce_event e = { 0 }; + int rc = 0; + + memset(&e, 0, sizeof(e)); +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 9cd56b4..fc6b1a4 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -124,7 +124,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_mf_event ev; ++ struct ras_mf_event ev = { 0 }; + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. +diff --git a/ras-signal-handler.c b/ras-signal-handler.c +index 271f2c9..4e9dc18 100644 +--- a/ras-signal-handler.c ++++ b/ras-signal-handler.c +@@ -66,7 +66,7 @@ int ras_signal_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_signal_event ev; ++ struct ras_signal_event ev = { 0 }; + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
+-- +2.33.1 + diff --git a/1087-kmsg-use-clock_gettime-instead-of-clock.patch b/1087-kmsg-use-clock_gettime-instead-of-clock.patch new file mode 100644 index 0000000000000000000000000000000000000000..425458543cdc46e9a5026cb32af18f246d485f78 --- /dev/null +++ b/1087-kmsg-use-clock_gettime-instead-of-clock.patch @@ -0,0 +1,98 @@ +From 52d2b4a3f5953621b5700efbb03a75006de9d18d Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Mon, 10 Mar 2025 11:22:15 +0800 +Subject: [PATCH 2/5] kmsg: use clock_gettime instead of clock + +Signed-off-by: Ruidong Tian +--- + ras-events.c | 40 ++++++++++++++++++++++++++-------------- + 1 file changed, 26 insertions(+), 14 deletions(-) + +diff --git a/ras-events.c b/ras-events.c +index 845e879..bc6cd47 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -51,6 +52,21 @@ + #include "trigger.h" + #include "ras-kmsg.h" + ++#define NS_PER_SEC 1000000000L ++ ++static struct timespec ts_sub(struct timespec a, struct timespec b) { ++ struct timespec result = { ++ .tv_sec = a.tv_sec - b.tv_sec, ++ .tv_nsec = a.tv_nsec - b.tv_nsec ++ }; ++ ++ if (result.tv_nsec < 0) { ++ result.tv_sec -= 1; ++ result.tv_nsec += NS_PER_SEC; ++ } ++ return result; ++} ++ + /* + * Polling time, if read() doesn't block. Currently, trace_pipe_raw never + * blocks on read(). 
So, we need to sleep for a while, to avoid spending +@@ -479,7 +495,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + int fd_num = n_cpus + 2; + char kmsg_buf[PRINTK_MESSAGE_MAX]; + int limit = 0; +- clock_t limit_time = clock(); ++ struct timespec limit_time = { 0 }; + int need_sleep = 0; + #else + int fd_num = n_cpus + 1; +@@ -605,32 +621,28 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + if (kmsg_monitor && (fds[n_cpus + 1].revents & POLLIN)) { + size = read(fds[n_cpus + 1].fd, kmsg_buf, PRINTK_MESSAGE_MAX); + if (size < 0) { +- log(TERM, LOG_WARNING, "read kmsg\n"); +- goto error; ++ log(TERM, LOG_WARNING, "read kmsg %s\n", strerror(errno)); + } else if (size > 0) { ++ kmsg_buf[size] = '\0'; + kmsg_match(kmsg_buf); + amdgpu_tracer_match(kmsg_buf); +- memset(kmsg_buf, 0, PRINTK_MESSAGE_MAX); + } else { + count_nready++; + } + limit++; + if (kmsg_limit && limit >= kmsg_limit) { +- clock_t now = clock(); +- +- if ((double)(now - limit_time) / CLOCKS_PER_SEC <= 0.5) { +- need_sleep = 1; +- log(ALL, LOG_WARNING, "kmsg limit!\n"); ++ struct timespec tv, res; + ++ clock_gettime(CLOCK_MONOTONIC, &tv); + +- if (lseek(fds[n_cpus + 1].fd, 0, SEEK_END) == -1) { +- log(TERM, LOG_ERR, "Can not seek kmsg end\n"); +- goto error; +- } ++ res = ts_sub(tv, limit_time); ++ if (res.tv_sec == 0 && res.tv_nsec >= 0 && res.tv_nsec < (0.5 * NS_PER_SEC)) { ++ need_sleep = 1; ++ log(TERM, LOG_WARNING, "kmsg limit %lx!\n", res.tv_nsec); + } + + limit = 0; +- limit_time = now; ++ limit_time = tv; + } + + +-- +2.33.1 + diff --git a/1088-signal-just-foucs-Hardware-error-SUGBUS.patch b/1088-signal-just-foucs-Hardware-error-SUGBUS.patch new file mode 100644 index 0000000000000000000000000000000000000000..ef9f60fd34b333d0816e0ef93862c2091ddb8eda --- /dev/null +++ b/1088-signal-just-foucs-Hardware-error-SUGBUS.patch @@ -0,0 +1,28 @@ +From e72f12858c99def7bc39fdf0bb97ecda007b07c1 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Mon, 10 Mar 2025 11:24:22 
+0800 +Subject: [PATCH 3/5] signal: just foucs Hardware error SUGBUS + +Signed-off-by: Ruidong Tian +--- + ras-events.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/ras-events.c b/ras-events.c +index bc6cd47..f0dcac6 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -1178,7 +1178,9 @@ int handle_ras_events(int record_events) + #endif + + #ifdef HAVE_SIGNAL +- snprintf(signal_filter, sizeof(signal_filter), "sig == %d", SIGBUS); ++ snprintf(signal_filter, sizeof(signal_filter), "sig == %d && code >= %d", SIGBUS, BUS_OBJERR); ++ // ensure filter enabled ++ usleep(30000); + rc = filter_ras_mc_event(ras, "signal", "signal_generate", signal_filter); + if (!rc) { + rc = add_event_handler(ras, pevent, page_size, "signal", "signal_generate", +-- +2.33.1 + diff --git a/1089-anolis-config-kmsg-add-cmci-storm-event.patch b/1089-anolis-config-kmsg-add-cmci-storm-event.patch new file mode 100644 index 0000000000000000000000000000000000000000..ba9f238b370535ec23378f3cb99fb6de1093e4a2 --- /dev/null +++ b/1089-anolis-config-kmsg-add-cmci-storm-event.patch @@ -0,0 +1,35 @@ +From b44a373604515d00bb18c7ac342e16e8a67510bb Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Mon, 10 Mar 2025 11:37:12 +0800 +Subject: [PATCH 4/5] anolis: config: kmsg add cmci storm event + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.env | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 13f6cea..f6626a7 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -108,7 +108,7 @@ AMDGPU_MCA_ENABLED=1 + # KMSG MONITOR + KMSG_IGNORE_XID="" + KMSG_LIMIT=0 +-KMSG_TRACE_NUM=5 ++KMSG_TRACE_NUM=6 + KMSG_TRACE_END=1 + + KMSG_TRACER_NAME_0="xid" +@@ -135,3 +135,8 @@ KMSG_TRACER_NAME_4="pcihp" + KMSG_TRACER_REGEX_4="pcieport (.*): pciehp: Slot\\(([0-9]+)\\): (Link Up|Link Down|Card present|Card not present|Link Down/Up ignored \\(recovered by DPC\\))" + KMSG_TRACER_GROUP_COUNT_4=3 + 
KMSG_TRACER_GROUP_KEY_4="pci_port,slot,res" ++ ++KMSG_TRACER_NAME_5="cmci_storm" ++KMSG_TRACER_REGEX_5="CMCI storm (.*): switching to .* mode" ++KMSG_TRACER_GROUP_COUNT_5=1 ++KMSG_TRACER_GROUP_KEY_5="storm" +-- +2.33.1 + diff --git a/1090-anolis-config-kmsg-storm-migration-enabled-default.patch b/1090-anolis-config-kmsg-storm-migration-enabled-default.patch new file mode 100644 index 0000000000000000000000000000000000000000..4bd61f59347b0b67b269c567e3d69978ec64dc77 --- /dev/null +++ b/1090-anolis-config-kmsg-storm-migration-enabled-default.patch @@ -0,0 +1,26 @@ +From 6d5e8d13e1b50a7f5584635fc5ce168b32ec7efe Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Mon, 10 Mar 2025 11:37:40 +0800 +Subject: [PATCH 5/5] anolis: config: kmsg storm migration enabled default + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.env | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index f6626a7..e7c115f 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -107,7 +107,7 @@ AMDGPU_MCA_ENABLED=1 + + # KMSG MONITOR + KMSG_IGNORE_XID="" +-KMSG_LIMIT=0 ++KMSG_LIMIT=100 + KMSG_TRACE_NUM=6 + KMSG_TRACE_END=1 + +-- +2.33.1 + diff --git a/1091-anolis-disable-block-and-dev-error-default.patch b/1091-anolis-disable-block-and-dev-error-default.patch new file mode 100644 index 0000000000000000000000000000000000000000..f33b1bfe5aa7c576e046a177428320aae6fffc79 --- /dev/null +++ b/1091-anolis-disable-block-and-dev-error-default.patch @@ -0,0 +1,26 @@ +From e0e0866270b0db663aff2feecd255a082fe32c0c Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Wed, 12 Mar 2025 09:59:55 +0800 +Subject: [PATCH] anolis: disable block and dev error default + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.env | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index e7c115f..c674136 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -46,7 +46,7 @@ 
CPU_ISOLATION_CYCLE="24h" + CPU_ISOLATION_LIMIT="10" + + +-DISABLE="" ++DISABLE="block:block_rq_complete,devlink:devlink_health_report" + # Event Trigger + + # Event trigger will be executed when the specified event occurs. +-- +2.33.1 + diff --git a/rasdaemon.spec b/rasdaemon.spec index c572168f255c3cc1f3dc258d17fce4b239f349ea..9eff391065fcb04b31fc286fa57b4d457c75b200 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,4 +1,4 @@ -%define anolis_release .0.1 +%define anolis_release .0.2 Name: rasdaemon Version: 0.6.7 Release: 16%{anolis_release}%{?dist} @@ -126,6 +126,13 @@ Patch1083: 1083-rasdaemon-mce-decode-io-port-for-bus-error.patch Patch1084: 1084-anolis-rasdaemon-fix-arm-event-error-output.patch Patch1085: 1085-rasdaemon-disable-ce-offline-default.patch +Patch1086: 1086-ras_event-fix-used-for-uninitialize-variable.patch +Patch1087: 1087-kmsg-use-clock_gettime-instead-of-clock.patch +Patch1088: 1088-signal-just-foucs-Hardware-error-SUGBUS.patch +Patch1089: 1089-anolis-config-kmsg-add-cmci-storm-event.patch +Patch1090: 1090-anolis-config-kmsg-storm-migration-enabled-default.patch +Patch1091: 1091-anolis-disable-block-and-dev-error-default.patch + ExcludeArch: s390 s390x BuildRequires: make BuildRequires: gcc @@ -281,6 +288,13 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch1084 -p1 %patch1085 -p1 +%patch1086 -p1 +%patch1087 -p1 +%patch1088 -p1 +%patch1089 -p1 +%patch1090 -p1 +%patch1091 -p1 + autoreconf -vfi %build @@ -313,7 +327,11 @@ install -D -p -m 0644 misc/rasdaemon.env %{buildroot}/%{_sysconfdir}/sysconfig/r %config(noreplace) %{_sysconfdir}/ras/triggers/* %changelog -* Thu Dec 25 2024 Ruidong Tian - 0.6.7-16.0.1 +* Mon Mar 10 2025 Ruidong Tian - 0.6.7-16.0.2 +- fix signal bug +- refactor kmsg storm mitigation + +* Wed Dec 25 2024 Ruidong Tian - 0.6.7-16.0.1 - add erst, kmsg, amdgpu_ras * Thu Dec 05 2024 Bixuan Cui - 0.6.7-15.0.1