From f6781c2aa5228ac4ff319f30d56096c6d4308869 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Tue, 15 Jul 2025 14:25:58 +0800 Subject: [PATCH] update to 0.8.3-1.1 support bert decode some bugfix add some smoke test Signed-off-by: Ruidong Tian --- 1030-rasdaemon-split-report-function.patch | 414 +++++ 1031-rasdaemon-support-BERT-decode.patch | 1630 ++++++++++++++++++++ 1032-config-update-config.patch | 58 + 1033-test-add-testsuite.patch | 214 +++ 1034-ext-fix-pcihp-filter.patch | 26 + rasdaemon.spec | 22 +- 6 files changed, 2361 insertions(+), 3 deletions(-) create mode 100644 1030-rasdaemon-split-report-function.patch create mode 100644 1031-rasdaemon-support-BERT-decode.patch create mode 100644 1032-config-update-config.patch create mode 100644 1033-test-add-testsuite.patch create mode 100644 1034-ext-fix-pcihp-filter.patch diff --git a/1030-rasdaemon-split-report-function.patch b/1030-rasdaemon-split-report-function.patch new file mode 100644 index 0000000..fbf89bc --- /dev/null +++ b/1030-rasdaemon-split-report-function.patch @@ -0,0 +1,414 @@ +From 90b03c1dd98539900a770f983c450fdbb09597c2 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 15 Jul 2025 10:51:19 +0800 +Subject: [PATCH] rasdaemon: split report function + +Signed-off-by: Ruidong Tian +--- + ras-aer-handler.c | 95 ++++++++++++++++++++-------------- + ras-aer-handler.h | 2 + + ras-mc-handler.c | 129 +++++++++++++++++++++++++--------------------- + ras-mc-handler.h | 2 + + 4 files changed, 128 insertions(+), 100 deletions(-) + +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index 023dd4d..89e531a 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -104,6 +104,52 @@ free: + pci_cleanup(pacc); + } + ++void report_aer_event(struct trace_seq *s, struct ras_aer_event *ev) ++{ ++ char buf[BUF_LEN] = { 0 }; ++ uint16_t vendor_id = 0, device_id = 0; ++ const char *level; ++ ++ switch (ev->severity) { ++ case GHES_SEV_RECOVERABLE: ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ break; ++ case 
GHES_SEV_PANIC: ++ level = loglevel_str[LOGLEVEL_EMERG]; ++ break; ++ case GHES_SEV_CORRECTED: ++ level = loglevel_str[LOGLEVEL_ERR]; ++ break; ++ default: ++ level = loglevel_str[LOGLEVEL_DEBUG]; ++ break; ++ } ++ ++ trace_seq_printf(s, "%s ", level); ++ trace_seq_printf(s, "%s ", ev->timestamp); ++ trace_seq_printf(s, "%s ", ev->dev_name); ++ ++ get_pci_dev_name(ev->dev_name, buf, sizeof(buf), &vendor_id, &device_id); ++ trace_seq_printf(s, "(%s - vendor_id: %#x device_id: %#x) ", buf, vendor_id, device_id); ++ ++ /* Fills the error buffer. If it is a correctable error then use the ++ * aer_cor_errors bit field. Otherwise use aer_uncor_errors. ++ */ ++ if (ev->severity == GHES_SEV_CORRECTED) ++ bitfield_msg(buf, sizeof(buf), aer_cor_errors, 32, 0, 0, ev->status); ++ else ++ bitfield_msg(buf, sizeof(buf), aer_uncor_errors, 32, 0, 0, ev->status); ++ ++ ev->msg = buf; /* NOTE(review): buf is local to this function; confirm no caller reads ev->msg after return */ ++ if (ev->tlp_header_valid) ++ snprintf((buf + strlen(ev->msg)), BUF_LEN - strlen(ev->msg), ++ " TLP Header: %08x %08x %08x %08x", ++ ev->tlp_header[0], ev->tlp_header[1], ++ ev->tlp_header[2], ev->tlp_header[3]); ++ ++ trace_seq_printf(s, "%s ", ev->msg); ++ trace_seq_puts(s, ev->error_type); ++} + int ras_aer_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +@@ -116,32 +162,11 @@ int ras_aer_event_handler(struct trace_seq *s, + time_t now; + struct tm *tm; + struct ras_aer_event ev = { 0 }; +- char buf[BUF_LEN] = { 0 }; +- uint16_t vendor_id = 0, device_id = 0; + #ifdef HAVE_AMP_NS_DECODE + char ipmi_add_sel[105]; + uint8_t sel_data[5]; + int seg, bus, dev, fn, rc; + #endif +- const char *level; +- +- if (tep_get_field_val(s, event, "severity", record, &severity_val, 1) < 0) +- return -1; +- switch (severity_val) { +- case HW_EVENT_AER_UNCORRECTED_NON_FATAL: +- level = loglevel_str[LOGLEVEL_CRIT]; +- break; +- case HW_EVENT_AER_UNCORRECTED_FATAL: +- level = loglevel_str[LOGLEVEL_EMERG]; +- break; +- case HW_EVENT_AER_CORRECTED: +- level = 
loglevel_str[LOGLEVEL_ERR]; +- break; +- default: +- level = loglevel_str[LOGLEVEL_DEBUG]; +- break; +- } +- trace_seq_printf(s, "%s ", level); + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. +@@ -167,22 +192,16 @@ int ras_aer_event_handler(struct trace_seq *s, + record, &len, 1); + if (!ev.dev_name) + return -1; +- trace_seq_printf(s, "%s ", ev.dev_name); +- +- get_pci_dev_name(ev.dev_name, buf, sizeof(buf), &vendor_id, &device_id); +- trace_seq_printf(s, "(%s - vendor_id: %#x device_id: %#x) ", buf, vendor_id, device_id); + + if (tep_get_field_val(s, event, "status", record, &status_val, 1) < 0) + return -1; + +- /* Fills the error buffer. If it is a correctable error then use the +- * aer_cor_errors bit field. Otherwise use aer_uncor_errors. +- */ +- if (severity_val == HW_EVENT_AER_CORRECTED) +- bitfield_msg(buf, sizeof(buf), aer_cor_errors, 32, 0, 0, status_val); +- else +- bitfield_msg(buf, sizeof(buf), aer_uncor_errors, 32, 0, 0, status_val); +- ev.msg = buf; ++ ev.status = status_val; ++ ++ if (tep_get_field_val(s, event, "severity", record, &severity_val, 1) < 0) ++ return -1; ++ ++ ev.severity = severity_val; + + if (tep_get_field_val(s, event, "tlp_header_valid", + record, &val, 1) < 0) +@@ -192,13 +211,8 @@ int ras_aer_event_handler(struct trace_seq *s, + if (ev.tlp_header_valid) { + ev.tlp_header = tep_get_field_raw(s, event, "tlp_header", + record, &len, 1); +- snprintf((buf + strlen(ev.msg)), BUF_LEN - strlen(ev.msg), +- " TLP Header: %08x %08x %08x %08x", +- ev.tlp_header[0], ev.tlp_header[1], +- ev.tlp_header[2], ev.tlp_header[3]); +- } + +- trace_seq_printf(s, "%s ", ev.msg); ++ } + + /* Use hw_event_aer_err_type switch between different severity_val */ + switch (severity_val) { +@@ -230,7 +244,8 @@ int ras_aer_event_handler(struct trace_seq *s, + #endif + ev.severity = GHES_SEV_NO; + } +- trace_seq_puts(s, ev.error_type); ++ ++ report_aer_event(s, &ev); + + /* Insert data into the SGBD */ + #ifdef HAVE_SQLITE3 +diff --git 
a/ras-aer-handler.h b/ras-aer-handler.h +index ef84788..4c6afb2 100644 +--- a/ras-aer-handler.h ++++ b/ras-aer-handler.h +@@ -10,7 +10,9 @@ + #include + + #include "ras-events.h" ++#include "ras-record.h" + ++void report_aer_event(struct trace_seq *s, struct ras_aer_event *ev); + int ras_aer_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index 2ffaf2e..b607024 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -42,6 +42,73 @@ static int ras_mc_event_stat(time_t now, struct ras_mc_event *e) + return 0; + } + ++void report_mc_event(struct ras_mc_event *ev, struct trace_seq *s) ++{ ++ const char *level; ++ ++ switch (ev->severity) { ++ case GHES_SEV_RECOVERABLE: ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ break; ++ case GHES_SEV_PANIC: ++ level = loglevel_str[LOGLEVEL_EMERG]; ++ break; ++ case GHES_SEV_CORRECTED: ++ level = loglevel_str[LOGLEVEL_ERR]; ++ break; ++ default: ++ level = loglevel_str[LOGLEVEL_DEBUG]; ++ break; ++ } ++ ++ trace_seq_printf(s, "%s ", level); ++ trace_seq_printf(s, "%s ", ev->timestamp); ++ trace_seq_printf(s, "%d ", ev->error_count); ++ trace_seq_puts(s, ev->error_type); ++ if (ev->error_count > 1) ++ trace_seq_puts(s, " errors:"); ++ else ++ trace_seq_puts(s, " error:"); ++ ++ if (ev->msg && *ev->msg) { ++ trace_seq_puts(s, " "); ++ trace_seq_puts(s, ev->msg); ++ } ++ ++ if (ev->label && *ev->label) { ++ trace_seq_puts(s, " on "); ++ trace_seq_puts(s, ev->label); ++ } ++ ++ trace_seq_puts(s, " ("); ++ trace_seq_printf(s, "mc: %d", ev->mc_index); ++ if (ev->top_layer >= 0 || ev->middle_layer >= 0 || ev->lower_layer >= 0) { ++ if (ev->lower_layer >= 0) ++ trace_seq_printf(s, " location: %d:%d:%d", ++ ev->top_layer, ev->middle_layer, ev->lower_layer); ++ else if (ev->middle_layer >= 0) ++ trace_seq_printf(s, " location: %d:%d", ++ ev->top_layer, ev->middle_layer); ++ else ++ trace_seq_printf(s, " location: %d", ev->top_layer); ++ } ++ ++ if 
(ev->address) ++ trace_seq_printf(s, " address: 0x%08llx", ev->address); ++ ++ trace_seq_printf(s, " grain: %lld", ev->grain); ++ ++ if (ev->syndrome) ++ trace_seq_printf(s, " syndrome: 0x%08llx", ev->syndrome); ++ ++ if (ev->driver_detail && *ev->driver_detail) { ++ trace_seq_puts(s, " "); ++ trace_seq_puts(s, ev->driver_detail); ++ } ++ ++ trace_seq_puts(s, ")"); ++} ++ + int ras_mc_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +@@ -53,7 +120,6 @@ int ras_mc_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_mc_event ev = { 0 }; + int parsed_fields = 0; +- const char *level; + + if (tep_get_field_val(s, event, "error_type", record, &val, 1) < 0) + goto parse_error; +@@ -70,6 +136,7 @@ int ras_mc_event_handler(struct trace_seq *s, + break; + case HW_EVENT_ERR_DEFERRED: + ev.error_type = "Deferred"; ++ ev.severity = GHES_SEV_RECOVERABLE; + break; + case HW_EVENT_ERR_FATAL: + ev.error_type = "Fatal"; +@@ -81,23 +148,6 @@ int ras_mc_event_handler(struct trace_seq *s, + ev.severity = GHES_SEV_NO; + } + +- switch (val) { +- case HW_EVENT_ERR_UNCORRECTED: +- case HW_EVENT_ERR_DEFERRED: +- level = loglevel_str[LOGLEVEL_CRIT]; +- break; +- case HW_EVENT_ERR_FATAL: +- level = loglevel_str[LOGLEVEL_EMERG]; +- break; +- case HW_EVENT_ERR_CORRECTED: +- level = loglevel_str[LOGLEVEL_ERR]; +- break; +- default: +- level = loglevel_str[LOGLEVEL_DEBUG]; +- break; +- } +- trace_seq_printf(s, "%s ", level); +- + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
+ * On previous kernels, the way to properly generate an event would +@@ -116,48 +166,28 @@ int ras_mc_event_handler(struct trace_seq *s, + if (tm) + strftime(ev.timestamp, sizeof(ev.timestamp), + "%Y-%m-%d %H:%M:%S %z", tm); +- trace_seq_printf(s, "%s ", ev.timestamp); + + if (tep_get_field_val(s, event, "error_count", record, &val, 1) < 0) + goto parse_error; + parsed_fields++; + + ev.error_count = val; +- trace_seq_printf(s, "%d ", ev.error_count); +- +- trace_seq_puts(s, ev.error_type); +- if (ev.error_count > 1) +- trace_seq_puts(s, " errors:"); +- else +- trace_seq_puts(s, " error:"); + + ev.msg = tep_get_field_raw(s, event, "msg", record, &len, 1); + if (!ev.msg) + goto parse_error; + parsed_fields++; + +- if (*ev.msg) { +- trace_seq_puts(s, " "); +- trace_seq_puts(s, ev.msg); +- } +- + ev.label = tep_get_field_raw(s, event, "label", record, &len, 1); + if (!ev.label) + goto parse_error; + parsed_fields++; + +- if (*ev.label) { +- trace_seq_puts(s, " on "); +- trace_seq_puts(s, ev.label); +- } +- +- trace_seq_puts(s, " ("); + if (tep_get_field_val(s, event, "mc_index", record, &val, 1) < 0) + goto parse_error; + parsed_fields++; + + ev.mc_index = val; +- trace_seq_printf(s, "mc: %d", ev.mc_index); + + if (tep_get_field_val(s, event, "top_layer", record, &val, 1) < 0) + goto parse_error; +@@ -174,39 +204,23 @@ int ras_mc_event_handler(struct trace_seq *s, + parsed_fields++; + ev.lower_layer = (signed char)val; + +- if (ev.top_layer >= 0 || ev.middle_layer >= 0 || ev.lower_layer >= 0) { +- if (ev.lower_layer >= 0) +- trace_seq_printf(s, " location: %d:%d:%d", +- ev.top_layer, ev.middle_layer, ev.lower_layer); +- else if (ev.middle_layer >= 0) +- trace_seq_printf(s, " location: %d:%d", +- ev.top_layer, ev.middle_layer); +- else +- trace_seq_printf(s, " location: %d", ev.top_layer); +- } +- + if (tep_get_field_val(s, event, "address", record, &val, 1) < 0) + goto parse_error; + parsed_fields++; + + ev.address = val; +- if (ev.address) +- trace_seq_printf(s, " 
address: 0x%08llx", ev.address); + + if (tep_get_field_val(s, event, "grain_bits", record, &val, 1) < 0) + goto parse_error; + parsed_fields++; + + ev.grain = val; +- trace_seq_printf(s, " grain: %lld", ev.grain); + + if (tep_get_field_val(s, event, "syndrome", record, &val, 1) < 0) + goto parse_error; + parsed_fields++; + + ev.syndrome = val; +- if (val) +- trace_seq_printf(s, " syndrome: 0x%08llx", ev.syndrome); + + ev.driver_detail = tep_get_field_raw(s, event, "driver_detail", record, + &len, 1); +@@ -214,12 +228,7 @@ int ras_mc_event_handler(struct trace_seq *s, + goto parse_error; + parsed_fields++; + +- if (*ev.driver_detail) { +- trace_seq_puts(s, " "); +- trace_seq_puts(s, ev.driver_detail); +- } +- trace_seq_puts(s, ")"); +- ++ report_mc_event(&ev, s); + /* Insert data into the SGBD */ + + ras_store_mc_event(ras, &ev); +diff --git a/ras-mc-handler.h b/ras-mc-handler.h +index cf12959..6ff4229 100644 +--- a/ras-mc-handler.h ++++ b/ras-mc-handler.h +@@ -10,6 +10,7 @@ + #include + + #include "ras-events.h" ++#include "ras-record.h" + extern unsigned long long mc_ce_stat_threshold; + + void mc_event_trigger_setup(void); +@@ -17,5 +18,6 @@ void mc_event_trigger_setup(void); + int ras_mc_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++void report_mc_event(struct ras_mc_event *ev, struct trace_seq *s); + + #endif +-- +2.43.5 + diff --git a/1031-rasdaemon-support-BERT-decode.patch b/1031-rasdaemon-support-BERT-decode.patch new file mode 100644 index 0000000..74f8bb5 --- /dev/null +++ b/1031-rasdaemon-support-BERT-decode.patch @@ -0,0 +1,1630 @@ +From 0fa569cf5a40a6e8cd7444b16b7660ffcb69f201 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 15 Jul 2025 10:58:06 +0800 +Subject: [PATCH 1/3] rasdaemon: support BERT decode + +The ACPI Boot Error Record Table (BERT) is a mechanism for firmware to +preserve information about a fatal error that occurred during the boot +process. 
This information survives a system reset and can be invaluable +for diagnosing failures that happen before the OS is fully operational. + +This patch introduces support for decoding the BERT. On startup, +rasdaemon now performs the following steps: + + - Checks for the existence of the BERT at + `/sys/firmware/acpi/tables/BERT`. + + - If found, it reads the table and iterates through the error data + blocks contained within. + + - It reuses the existing CPER (Common Platform Error Record) parsing + logic (e.g., `cper_print_mem_error`) to decode the generic error + data sections, which are the most common payload for BERT. + + - Use rasdaemon report function to report event, support CPU, MEM, AER + event now + +This enhancement allows system administrators to easily access details +of catastrophic boot failures directly from the system logs, greatly +simplifying root cause analysis. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 5 +- + configure.ac | 11 + + cper.h | 593 +++++++++++++++++++++++++++++++++++++++++++++++++ + ras-bert.c | 615 +++++++++++++++++++++++++++++++++++++++++++++++++++ + ras-bert.h | 263 ++++++++++++++++++++++ + ras-record.h | 1 + + rasdaemon.c | 10 + + 7 files changed, 1497 insertions(+), 1 deletion(-) + create mode 100644 cper.h + create mode 100644 ras-bert.c + create mode 100644 ras-bert.h + +diff --git a/Makefile.am b/Makefile.am +index c400473..10995c0 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -148,6 +148,9 @@ endif + if WITH_ERST + rasdaemon_SOURCES += ras-erst.c ras-erst-dmesg.c + endif ++if WITH_BERT ++ rasdaemon_SOURCES += ras-bert.c ++endif + + if WITH_NVGPU + BUILT_SOURCES = ras-nvgpu-nvml.h libnvgpudriver.a +@@ -169,7 +172,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ + non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ + ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h \ +- 
ras-kmsg.h ras-time.h ++ ras-kmsg.h ras-time.h cper.h ras-bert.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 46ba36e..5e94219 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -297,6 +297,16 @@ AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"]) + + AC_SUBST([ZLIBS]) + ++AC_ARG_ENABLE([bert], ++ AS_HELP_STRING([--enable-bert], [enable bert (currently experimental)])) ++ ++AS_IF([test "x$enable_bert" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_BERT,1,"have BERT") ++ AC_SUBST([HAVE_BERT]) ++]) ++AM_CONDITIONAL([WITH_BERT], [test x$enable_bert = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_BERT], [USE_BERT="yes"], [USE_BERT="no"]) ++ + AC_ARG_ENABLE([nvgpu], + AS_HELP_STRING([--enable-nvgpu], [enable NVGPU events])) + +@@ -370,6 +380,7 @@ compile time options summary + JAGUAR RAS errors : $USE_JAGUAR_NS_DECODE + Signal : $USE_SIGNAL + ERST : $USE_ERST ++ BERT : $USE_BERT + NVGPU RAS errors : $USE_NVGPU + Json exporter : $USE_JSON_REPORT + Kmsg monitor : $USE_KMSG_MONITOR +diff --git a/cper.h b/cper.h +new file mode 100644 +index 0000000..858cdb6 +--- /dev/null ++++ b/cper.h +@@ -0,0 +1,593 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++ * UEFI Common Platform Error Record ++ * ++ * Copyright (C) 2010, Intel Corp. 
++ * Author: Huang Ying ++ */ ++ ++#ifndef LINUX_CPER_H ++#define LINUX_CPER_H ++ ++/* CPER record signature and the size */ ++#include ++ ++typedef struct { ++ uint8_t b[16]; ++} guid_t; ++ ++#define GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ ++((guid_t) \ ++{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ ++ (b) & 0xff, ((b) >> 8) & 0xff, \ ++ (c) & 0xff, ((c) >> 8) & 0xff, \ ++ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) ++ ++#define CPER_SIG_RECORD "CPER" ++#define CPER_SIG_SIZE 4 ++/* Used in signature_end field in struct cper_record_header */ ++#define CPER_SIG_END 0xffffffff ++ ++/* ++ * CPER record header revision, used in revision field in struct ++ * cper_record_header ++ */ ++#define CPER_RECORD_REV 0x0100 ++ ++/* ++ * CPER record length contains the CPER fields which are relevant for further ++ * handling of a memory error in userspace (we don't carry all the fields ++ * defined in the UEFI spec because some of them don't make any sense.) ++ * Currently, a length of 256 should be more than enough. ++ */ ++#define CPER_REC_LEN 256 ++/* ++ * Severity definition for error_severity in struct cper_record_header ++ * and section_severity in struct cper_section_descriptor ++ */ ++enum { ++ CPER_SEV_RECOVERABLE, ++ CPER_SEV_FATAL, ++ CPER_SEV_CORRECTED, ++ CPER_SEV_INFORMATIONAL, ++}; ++ ++/* ++ * Validation bits definition for validation_bits in struct ++ * cper_record_header. If set, corresponding fields in struct ++ * cper_record_header contain valid information. ++ */ ++#define CPER_VALID_PLATFORM_ID 0x0001 ++#define CPER_VALID_TIMESTAMP 0x0002 ++#define CPER_VALID_PARTITION_ID 0x0004 ++ ++/* ++ * Notification type used to generate error record, used in ++ * notification_type in struct cper_record_header. These UUIDs are defined ++ * in the UEFI spec v2.7, sec N.2.1. 
++ */ ++ ++/* Corrected Machine Check */ ++#define CPER_NOTIFY_CMC \ ++ GUID_INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \ ++ 0xEB, 0xD4, 0xF8, 0x90) ++/* Corrected Platform Error */ ++#define CPER_NOTIFY_CPE \ ++ GUID_INIT(0x4E292F96, 0xD843, 0x4a55, 0xA8, 0xC2, 0xD4, 0x81, \ ++ 0xF2, 0x7E, 0xBE, 0xEE) ++/* Machine Check Exception */ ++#define CPER_NOTIFY_MCE \ ++ GUID_INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \ ++ 0xE1, 0x49, 0x13, 0xBB) ++/* PCI Express Error */ ++#define CPER_NOTIFY_PCIE \ ++ GUID_INIT(0xCF93C01F, 0x1A16, 0x4dfc, 0xB8, 0xBC, 0x9C, 0x4D, \ ++ 0xAF, 0x67, 0xC1, 0x04) ++/* INIT Record (for IPF) */ ++#define CPER_NOTIFY_INIT \ ++ GUID_INIT(0xCC5263E8, 0x9308, 0x454a, 0x89, 0xD0, 0x34, 0x0B, \ ++ 0xD3, 0x9B, 0xC9, 0x8E) ++/* Non-Maskable Interrupt */ ++#define CPER_NOTIFY_NMI \ ++ GUID_INIT(0x5BAD89FF, 0xB7E6, 0x42c9, 0x81, 0x4A, 0xCF, 0x24, \ ++ 0x85, 0xD6, 0xE9, 0x8A) ++/* BOOT Error Record */ ++#define CPER_NOTIFY_BOOT \ ++ GUID_INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \ ++ 0xD4, 0x64, 0xB3, 0x8F) ++/* DMA Remapping Error */ ++#define CPER_NOTIFY_DMAR \ ++ GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ ++ 0x72, 0x2D, 0xEB, 0x41) ++/* CXL Protocol Error Section */ ++#define CPER_SEC_CXL_PROT_ERR \ ++ GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \ ++ 0x4B, 0x77, 0x10, 0x48) ++ ++/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ ++/* ++ * General Media Event Record ++ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 ++ */ ++#define CPER_SEC_CXL_GEN_MEDIA_GUID \ ++ GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ ++ 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) ++/* ++ * DRAM Event Record ++ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 ++ */ ++#define CPER_SEC_CXL_DRAM_GUID \ ++ GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ ++ 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) ++/* ++ * Memory Module Event Record ++ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 
++ */ ++#define CPER_SEC_CXL_MEM_MODULE_GUID \ ++ GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ ++ 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) ++ ++/* ++ * Flags bits definitions for flags in struct cper_record_header ++ * If set, the error has been recovered ++ */ ++#define CPER_HW_ERROR_FLAGS_RECOVERED 0x1 ++/* If set, the error is for previous boot */ ++#define CPER_HW_ERROR_FLAGS_PREVERR 0x2 ++/* If set, the error is injected for testing */ ++#define CPER_HW_ERROR_FLAGS_SIMULATED 0x4 ++ ++/* ++ * CPER section header revision, used in revision field in struct ++ * cper_section_descriptor ++ */ ++#define CPER_SEC_REV 0x0100 ++ ++/* ++ * Validation bits definition for validation_bits in struct ++ * cper_section_descriptor. If set, corresponding fields in struct ++ * cper_section_descriptor contain valid information. ++ */ ++#define CPER_SEC_VALID_FRU_ID 0x1 ++#define CPER_SEC_VALID_FRU_TEXT 0x2 ++ ++/* ++ * Flags bits definitions for flags in struct cper_section_descriptor ++ * ++ * If set, the section is associated with the error condition ++ * directly, and should be focused on ++ */ ++#define CPER_SEC_PRIMARY 0x0001 ++/* ++ * If set, the error was not contained within the processor or memory ++ * hierarchy and the error may have propagated to persistent storage ++ * or network ++ */ ++#define CPER_SEC_CONTAINMENT_WARNING 0x0002 ++/* If set, the component must be re-initialized or re-enabled prior to use */ ++#define CPER_SEC_RESET 0x0004 ++/* If set, Linux may choose to discontinue use of the resource */ ++#define CPER_SEC_ERROR_THRESHOLD_EXCEEDED 0x0008 ++/* ++ * If set, resource could not be queried for error information due to ++ * conflicts with other system software or resources. 
Some fields of ++ * the section will be invalid ++ */ ++#define CPER_SEC_RESOURCE_NOT_ACCESSIBLE 0x0010 ++/* ++ * If set, action has been taken to ensure error containment (such as ++ * poisoning data), but the error has not been fully corrected and the ++ * data has not been consumed. Linux may choose to take further ++ * corrective action before the data is consumed ++ */ ++#define CPER_SEC_LATENT_ERROR 0x0020 ++ ++/* ++ * Section type definitions, used in section_type field in struct ++ * cper_section_descriptor. These UUIDs are defined in the UEFI spec ++ * v2.7, sec N.2.2. ++ */ ++ ++/* Processor Generic */ ++#define CPER_SEC_PROC_GENERIC \ ++ GUID_INIT(0x9876CCAD, 0x47B4, 0x4bdb, 0xB6, 0x5E, 0x16, 0xF1, \ ++ 0x93, 0xC4, 0xF3, 0xDB) ++/* Processor Specific: X86/X86_64 */ ++#define CPER_SEC_PROC_IA \ ++ GUID_INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \ ++ 0x24, 0x2B, 0x6E, 0x1D) ++/* Processor Specific: IA64 */ ++#define CPER_SEC_PROC_IPF \ ++ GUID_INIT(0xE429FAF1, 0x3CB7, 0x11D4, 0x0B, 0xCA, 0x07, 0x00, \ ++ 0x80, 0xC7, 0x3C, 0x88, 0x81) ++/* Processor Specific: ARM */ ++#define CPER_SEC_PROC_ARM \ ++ GUID_INIT(0xE19E3D16, 0xBC11, 0x11E4, 0x9C, 0xAA, 0xC2, 0x05, \ ++ 0x1D, 0x5D, 0x46, 0xB0) ++/* Platform Memory */ ++#define CPER_SEC_PLATFORM_MEM \ ++ GUID_INIT(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \ ++ 0xED, 0x7C, 0x83, 0xB1) ++#define CPER_SEC_PCIE \ ++ GUID_INIT(0xD995E954, 0xBBC1, 0x430F, 0xAD, 0x91, 0xB4, 0x4D, \ ++ 0xCB, 0x3C, 0x6F, 0x35) ++/* Firmware Error Record Reference */ ++#define CPER_SEC_FW_ERR_REC_REF \ ++ GUID_INIT(0x81212A96, 0x09ED, 0x4996, 0x94, 0x71, 0x8D, 0x72, \ ++ 0x9C, 0x8E, 0x69, 0xED) ++/* PCI/PCI-X Bus */ ++#define CPER_SEC_PCI_X_BUS \ ++ GUID_INIT(0xC5753963, 0x3B84, 0x4095, 0xBF, 0x78, 0xED, 0xDA, \ ++ 0xD3, 0xF9, 0xC9, 0xDD) ++/* PCI Component/Device */ ++#define CPER_SEC_PCI_DEV \ ++ GUID_INIT(0xEB5E4685, 0xCA66, 0x4769, 0xB6, 0xA2, 0x26, 0x06, \ ++ 0x8B, 0x00, 0x13, 0x26) ++#define 
CPER_SEC_DMAR_GENERIC \ ++ GUID_INIT(0x5B51FEF7, 0xC79D, 0x4434, 0x8F, 0x1B, 0xAA, 0x62, \ ++ 0xDE, 0x3E, 0x2C, 0x64) ++/* Intel VT for Directed I/O specific DMAr */ ++#define CPER_SEC_DMAR_VT \ ++ GUID_INIT(0x71761D37, 0x32B2, 0x45cd, 0xA7, 0xD0, 0xB0, 0xFE, \ ++ 0xDD, 0x93, 0xE8, 0xCF) ++/* IOMMU specific DMAr */ ++#define CPER_SEC_DMAR_IOMMU \ ++ GUID_INIT(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \ ++ 0xDF, 0xAA, 0x84, 0xEC) ++ ++#define CPER_PROC_VALID_TYPE 0x0001 ++#define CPER_PROC_VALID_ISA 0x0002 ++#define CPER_PROC_VALID_ERROR_TYPE 0x0004 ++#define CPER_PROC_VALID_OPERATION 0x0008 ++#define CPER_PROC_VALID_FLAGS 0x0010 ++#define CPER_PROC_VALID_LEVEL 0x0020 ++#define CPER_PROC_VALID_VERSION 0x0040 ++#define CPER_PROC_VALID_BRAND_INFO 0x0080 ++#define CPER_PROC_VALID_ID 0x0100 ++#define CPER_PROC_VALID_TARGET_ADDRESS 0x0200 ++#define CPER_PROC_VALID_REQUESTOR_ID 0x0400 ++#define CPER_PROC_VALID_RESPONDER_ID 0x0800 ++#define CPER_PROC_VALID_IP 0x1000 ++ ++#define CPER_MEM_VALID_ERROR_STATUS 0x0001 ++#define CPER_MEM_VALID_PA 0x0002 ++#define CPER_MEM_VALID_PA_MASK 0x0004 ++#define CPER_MEM_VALID_NODE 0x0008 ++#define CPER_MEM_VALID_CARD 0x0010 ++#define CPER_MEM_VALID_MODULE 0x0020 ++#define CPER_MEM_VALID_BANK 0x0040 ++#define CPER_MEM_VALID_DEVICE 0x0080 ++#define CPER_MEM_VALID_ROW 0x0100 ++#define CPER_MEM_VALID_COLUMN 0x0200 ++#define CPER_MEM_VALID_BIT_POSITION 0x0400 ++#define CPER_MEM_VALID_REQUESTOR_ID 0x0800 ++#define CPER_MEM_VALID_RESPONDER_ID 0x1000 ++#define CPER_MEM_VALID_TARGET_ID 0x2000 ++#define CPER_MEM_VALID_ERROR_TYPE 0x4000 ++#define CPER_MEM_VALID_RANK_NUMBER 0x8000 ++#define CPER_MEM_VALID_CARD_HANDLE 0x10000 ++#define CPER_MEM_VALID_MODULE_HANDLE 0x20000 ++#define CPER_MEM_VALID_ROW_EXT 0x40000 ++#define CPER_MEM_VALID_BANK_GROUP 0x80000 ++#define CPER_MEM_VALID_BANK_ADDRESS 0x100000 ++#define CPER_MEM_VALID_CHIP_ID 0x200000 ++ ++#define CPER_MEM_EXT_ROW_MASK 0x3 ++#define CPER_MEM_EXT_ROW_SHIFT 16 ++ ++#define 
CPER_MEM_BANK_ADDRESS_MASK 0xff ++#define CPER_MEM_BANK_GROUP_SHIFT 8 ++ ++#define CPER_MEM_CHIP_ID_SHIFT 5 ++ ++#define CPER_PCIE_VALID_PORT_TYPE 0x0001 ++#define CPER_PCIE_VALID_VERSION 0x0002 ++#define CPER_PCIE_VALID_COMMAND_STATUS 0x0004 ++#define CPER_PCIE_VALID_DEVICE_ID 0x0008 ++#define CPER_PCIE_VALID_SERIAL_NUMBER 0x0010 ++#define CPER_PCIE_VALID_BRIDGE_CONTROL_STATUS 0x0020 ++#define CPER_PCIE_VALID_CAPABILITY 0x0040 ++#define CPER_PCIE_VALID_AER_INFO 0x0080 ++ ++#define CPER_PCIE_SLOT_SHIFT 3 ++ ++#define CPER_ARM_VALID_MPIDR BIT(0) ++#define CPER_ARM_VALID_AFFINITY_LEVEL BIT(1) ++#define CPER_ARM_VALID_RUNNING_STATE BIT(2) ++#define CPER_ARM_VALID_VENDOR_INFO BIT(3) ++ ++#define CPER_ARM_INFO_VALID_MULTI_ERR BIT(0) ++#define CPER_ARM_INFO_VALID_FLAGS BIT(1) ++#define CPER_ARM_INFO_VALID_ERR_INFO BIT(2) ++#define CPER_ARM_INFO_VALID_VIRT_ADDR BIT(3) ++#define CPER_ARM_INFO_VALID_PHYSICAL_ADDR BIT(4) ++ ++#define CPER_ARM_INFO_FLAGS_FIRST BIT(0) ++#define CPER_ARM_INFO_FLAGS_LAST BIT(1) ++#define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2) ++#define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3) ++ ++#define CPER_ARM_CACHE_ERROR 0 ++#define CPER_ARM_TLB_ERROR 1 ++#define CPER_ARM_BUS_ERROR 2 ++#define CPER_ARM_VENDOR_ERROR 3 ++#define CPER_ARM_MAX_TYPE CPER_ARM_VENDOR_ERROR ++ ++#define CPER_ARM_ERR_VALID_TRANSACTION_TYPE BIT(0) ++#define CPER_ARM_ERR_VALID_OPERATION_TYPE BIT(1) ++#define CPER_ARM_ERR_VALID_LEVEL BIT(2) ++#define CPER_ARM_ERR_VALID_PROC_CONTEXT_CORRUPT BIT(3) ++#define CPER_ARM_ERR_VALID_CORRECTED BIT(4) ++#define CPER_ARM_ERR_VALID_PRECISE_PC BIT(5) ++#define CPER_ARM_ERR_VALID_RESTARTABLE_PC BIT(6) ++#define CPER_ARM_ERR_VALID_PARTICIPATION_TYPE BIT(7) ++#define CPER_ARM_ERR_VALID_TIME_OUT BIT(8) ++#define CPER_ARM_ERR_VALID_ADDRESS_SPACE BIT(9) ++#define CPER_ARM_ERR_VALID_MEM_ATTRIBUTES BIT(10) ++#define CPER_ARM_ERR_VALID_ACCESS_MODE BIT(11) ++ ++#define CPER_ARM_ERR_TRANSACTION_SHIFT 16 ++#define CPER_ARM_ERR_TRANSACTION_MASK GENMASK(1,0) ++#define 
CPER_ARM_ERR_OPERATION_SHIFT 18 ++#define CPER_ARM_ERR_OPERATION_MASK GENMASK(3,0) ++#define CPER_ARM_ERR_LEVEL_SHIFT 22 ++#define CPER_ARM_ERR_LEVEL_MASK GENMASK(2,0) ++#define CPER_ARM_ERR_PC_CORRUPT_SHIFT 25 ++#define CPER_ARM_ERR_PC_CORRUPT_MASK GENMASK(0,0) ++#define CPER_ARM_ERR_CORRECTED_SHIFT 26 ++#define CPER_ARM_ERR_CORRECTED_MASK GENMASK(0,0) ++#define CPER_ARM_ERR_PRECISE_PC_SHIFT 27 ++#define CPER_ARM_ERR_PRECISE_PC_MASK GENMASK(0,0) ++#define CPER_ARM_ERR_RESTARTABLE_PC_SHIFT 28 ++#define CPER_ARM_ERR_RESTARTABLE_PC_MASK GENMASK(0,0) ++#define CPER_ARM_ERR_PARTICIPATION_TYPE_SHIFT 29 ++#define CPER_ARM_ERR_PARTICIPATION_TYPE_MASK GENMASK(1,0) ++#define CPER_ARM_ERR_TIME_OUT_SHIFT 31 ++#define CPER_ARM_ERR_TIME_OUT_MASK GENMASK(0,0) ++#define CPER_ARM_ERR_ADDRESS_SPACE_SHIFT 32 ++#define CPER_ARM_ERR_ADDRESS_SPACE_MASK GENMASK(1,0) ++#define CPER_ARM_ERR_MEM_ATTRIBUTES_SHIFT 34 ++#define CPER_ARM_ERR_MEM_ATTRIBUTES_MASK GENMASK(8,0) ++#define CPER_ARM_ERR_ACCESS_MODE_SHIFT 43 ++#define CPER_ARM_ERR_ACCESS_MODE_MASK GENMASK(0,0) ++ ++/* ++ * All tables and structs must be byte-packed to match CPER ++ * specification, since the tables are provided by the system BIOS ++ */ ++#pragma pack(1) ++ ++/* Record Header, UEFI v2.7 sec N.2.1 */ ++struct cper_record_header { ++ char signature[CPER_SIG_SIZE]; /* must be CPER_SIG_RECORD */ ++ uint16_t revision; /* must be CPER_RECORD_REV */ ++ uint32_t signature_end; /* must be CPER_SIG_END */ ++ uint16_t section_count; ++ uint32_t error_severity; ++ uint32_t validation_bits; ++ uint32_t record_length; ++ uint64_t timestamp; ++ guid_t platform_id; ++ guid_t partition_id; ++ guid_t creator_id; ++ guid_t notification_type; ++ uint64_t record_id; ++ uint32_t flags; ++ uint64_t persistence_information; ++ uint8_t reserved[12]; /* must be zero */ ++}; ++ ++/* Section Descriptor, UEFI v2.7 sec N.2.2 */ ++struct cper_section_descriptor { ++ uint32_t section_offset; /* Offset in bytes of the ++ * section body from the base 
++ * of the record header */ ++ uint32_t section_length; ++ uint16_t revision; /* must be CPER_RECORD_REV */ ++ uint8_t validation_bits; ++ uint8_t reserved; /* must be zero */ ++ uint32_t flags; ++ guid_t section_type; ++ guid_t fru_id; ++ uint32_t section_severity; ++ uint8_t fru_text[20]; ++}; ++ ++/* Generic Processor Error Section, UEFI v2.7 sec N.2.4.1 */ ++struct cper_sec_proc_generic { ++ uint64_t validation_bits; ++ uint8_t proc_type; ++ uint8_t proc_isa; ++ uint8_t proc_error_type; ++ uint8_t operation; ++ uint8_t flags; ++ uint8_t level; ++ uint16_t reserved; ++ uint64_t cpu_version; ++ char cpu_brand[128]; ++ uint64_t proc_id; ++ uint64_t target_addr; ++ uint64_t requestor_id; ++ uint64_t responder_id; ++ uint64_t ip; ++}; ++ ++/* IA32/X64 Processor Error Section, UEFI v2.7 sec N.2.4.2 */ ++struct cper_sec_proc_ia { ++ uint64_t validation_bits; ++ uint64_t lapic_id; ++ uint8_t cpuid[48]; ++}; ++ ++/* IA32/X64 Processor Error Information Structure, UEFI v2.7 sec N.2.4.2.1 */ ++struct cper_ia_err_info { ++ guid_t err_type; ++ uint64_t validation_bits; ++ uint64_t check_info; ++ uint64_t target_id; ++ uint64_t requestor_id; ++ uint64_t responder_id; ++ uint64_t ip; ++}; ++ ++/* IA32/X64 Processor Context Information Structure, UEFI v2.7 sec N.2.4.2.2 */ ++struct cper_ia_proc_ctx { ++ uint16_t reg_ctx_type; ++ uint16_t reg_arr_size; ++ uint32_t msr_addr; ++ uint64_t mm_reg_addr; ++}; ++ ++/* ARM Processor Error Section, UEFI v2.7 sec N.2.4.4 */ ++struct cper_sec_proc_arm { ++ uint32_t validation_bits; ++ uint16_t err_info_num; /* Number of Processor Error Info */ ++ uint16_t context_info_num; /* Number of Processor Context Info Records*/ ++ uint32_t section_length; ++ uint8_t affinity_level; ++ uint8_t reserved[3]; /* must be zero */ ++ uint64_t mpidr; ++ uint64_t midr; ++ uint32_t running_state; /* Bit 0 set - Processor running. 
PSCI = 0 */ ++ uint32_t psci_state; ++}; ++ ++/* ARM Processor Error Information Structure, UEFI v2.7 sec N.2.4.4.1 */ ++struct cper_arm_err_info { ++ uint8_t version; ++ uint8_t length; ++ uint16_t validation_bits; ++ uint8_t type; ++ uint16_t multiple_error; ++ uint8_t flags; ++ uint64_t error_info; ++ uint64_t virt_fault_addr; ++ uint64_t physical_fault_addr; ++}; ++ ++/* ARM Processor Context Information Structure, UEFI v2.7 sec N.2.4.4.2 */ ++struct cper_arm_ctx_info { ++ uint16_t version; ++ uint16_t type; ++ uint32_t size; ++}; ++ ++/* Old Memory Error Section, UEFI v2.1, v2.2 */ ++struct cper_sec_mem_err_old { ++ uint64_t validation_bits; ++ uint64_t error_status; ++ uint64_t physical_addr; ++ uint64_t physical_addr_mask; ++ uint16_t node; ++ uint16_t card; ++ uint16_t module; ++ uint16_t bank; ++ uint16_t device; ++ uint16_t row; ++ uint16_t column; ++ uint16_t bit_pos; ++ uint64_t requestor_id; ++ uint64_t responder_id; ++ uint64_t target_id; ++ uint8_t error_type; ++}; ++ ++/* Memory Error Section (UEFI >= v2.3), UEFI v2.8 sec N.2.5 */ ++struct cper_sec_mem_err { ++ uint64_t validation_bits; ++ uint64_t error_status; ++ uint64_t physical_addr; ++ uint64_t physical_addr_mask; ++ uint16_t node; ++ uint16_t card; ++ uint16_t module; ++ uint16_t bank; ++ uint16_t device; ++ uint16_t row; ++ uint16_t column; ++ uint16_t bit_pos; ++ uint64_t requestor_id; ++ uint64_t responder_id; ++ uint64_t target_id; ++ uint8_t error_type; ++ uint8_t extended; ++ uint16_t rank; ++ uint16_t mem_array_handle; /* "card handle" in UEFI 2.4 */ ++ uint16_t mem_dev_handle; /* "module handle" in UEFI 2.4 */ ++}; ++ ++struct cper_mem_err_compact { ++ uint64_t validation_bits; ++ uint16_t node; ++ uint16_t card; ++ uint16_t module; ++ uint16_t bank; ++ uint16_t device; ++ uint16_t row; ++ uint16_t column; ++ uint16_t bit_pos; ++ uint64_t requestor_id; ++ uint64_t responder_id; ++ uint64_t target_id; ++ uint16_t rank; ++ uint16_t mem_array_handle; ++ uint16_t mem_dev_handle; ++ 
uint8_t extended; ++}; ++ ++static inline uint32_t cper_get_mem_extension(uint64_t mem_valid, uint8_t mem_extended) ++{ ++ if (!(mem_valid & CPER_MEM_VALID_ROW_EXT)) ++ return 0; ++ return (mem_extended & CPER_MEM_EXT_ROW_MASK) << CPER_MEM_EXT_ROW_SHIFT; ++} ++ ++/* PCI Express Error Section, UEFI v2.7 sec N.2.7 */ ++struct cper_sec_pcie { ++ uint64_t validation_bits; ++ uint32_t port_type; ++ struct { ++ uint8_t minor; ++ uint8_t major; ++ uint8_t reserved[2]; ++ } version; ++ uint16_t command; ++ uint16_t status; ++ uint32_t reserved; ++ struct { ++ uint16_t vendor_id; ++ uint16_t device_id; ++ uint8_t class_code[3]; ++ uint8_t function; ++ uint8_t device; ++ uint16_t segment; ++ uint8_t bus; ++ uint8_t secondary_bus; ++ uint16_t slot; ++ uint8_t reserved; ++ } device_id; ++ struct { ++ uint32_t lower; ++ uint32_t upper; ++ } serial_number; ++ struct { ++ uint16_t secondary_status; ++ uint16_t control; ++ } bridge; ++ uint8_t capability[60]; ++ uint8_t aer_info[96]; ++}; ++ ++/* Firmware Error Record Reference, UEFI v2.7 sec N.2.10 */ ++struct cper_sec_fw_err_rec_ref { ++ uint8_t record_type; ++ uint8_t revision; ++ uint8_t reserved[6]; ++ uint64_t record_identifier; ++ guid_t record_identifier_guid; ++}; ++ ++/* Reset to default packing */ ++#pragma pack() ++ ++#endif +diff --git a/ras-bert.c b/ras-bert.c +new file mode 100644 +index 0000000..de9e1eb +--- /dev/null ++++ b/ras-bert.c +@@ -0,0 +1,615 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "cper.h" ++#include "ras-logger.h" ++#include "ras-bert.h" ++#include "ras-record.h" ++#include "ras-mc-handler.h" ++#include "ras-aer-handler.h" ++ ++#define BERT_HANDLED "/run/rasdaemon_bert" ++#define BERT_DATA "/sys/firmware/acpi/tables/data/BERT" ++#define BERT_TABLE "/sys/firmware/acpi/tables/BERT" ++ ++#define ACPI_BERT_PRINT_MAX_RECORDS 5 ++#define ACPI_BERT_PRINT_MAX_LEN 1024 ++ ++static 
bool bert_handled(void) ++{ ++ return access(BERT_HANDLED, F_OK) == 0; ++} ++ ++static bool bert_exist(void) ++{ ++ return access(BERT_DATA, F_OK) == 0; ++} ++ ++static void bert_handled_tag(void) ++{ ++ FILE *file = fopen(BERT_HANDLED, "w"); ++ if (file == NULL) { ++ log(SYSLOG, LOG_ERR, "Failed to create BERT tag file"); ++ return; ++ } ++ ++ fclose(file); ++} ++ ++static unsigned bcd2bin(unsigned char val) ++{ ++ return (val & 0x0f) + (val >> 4) * 10; ++} ++ ++static void cper_print_tstamp(char *t, ++ struct acpi_hest_generic_data_v300 *gdata) ++{ ++ uint8_t hour, min, sec, day, mon, year, century, *timestamp; ++ ++ if (gdata->validation_bits & ACPI_HEST_GEN_VALID_TIMESTAMP) { ++ timestamp = (uint8_t *)&(gdata->time_stamp); ++ sec = bcd2bin(timestamp[0]); ++ min = bcd2bin(timestamp[1]); ++ hour = bcd2bin(timestamp[2]); ++ day = bcd2bin(timestamp[4]); ++ mon = bcd2bin(timestamp[5]); ++ year = bcd2bin(timestamp[6]); ++ century = bcd2bin(timestamp[7]); ++ ++ snprintf(t, 64, ++ "%02d%02d-%02d-%02d %02d:%02d:%02d", ++ century, year, mon, day, hour, min, sec); ++ } ++} ++ ++static int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus) ++{ ++ if (estatus->data_length && ++ estatus->data_length < sizeof(struct acpi_hest_generic_data)) ++ return -EINVAL; ++ if (estatus->raw_data_length && ++ estatus->raw_data_offset < sizeof(*estatus) + estatus->data_length) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int cper_estatus_check(const struct acpi_hest_generic_status *estatus) ++{ ++ struct acpi_hest_generic_data *gdata; ++ unsigned int data_len, record_size; ++ int rc; ++ ++ rc = cper_estatus_check_header(estatus); ++ if (rc) ++ return rc; ++ ++ data_len = estatus->data_length; ++ ++ apei_estatus_for_each_section(estatus, gdata) { ++ if (acpi_hest_get_size(gdata) > data_len) ++ return -EINVAL; ++ ++ record_size = acpi_hest_get_record_size(gdata); ++ if (record_size > data_len) ++ return -EINVAL; ++ ++ data_len -= record_size; ++ } ++ if 
(data_len) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int cper_mem_err_location(struct cper_sec_mem_err *mem, char *msg) ++{ ++ uint32_t len, n; ++ ++ if (!msg) ++ return 0; ++ ++ n = 0; ++ len = CPER_REC_LEN; ++ if (mem->validation_bits & CPER_MEM_VALID_NODE) ++ n += snprintf(msg + n, len - n, "node:%d ", mem->node); ++ if (mem->validation_bits & CPER_MEM_VALID_CARD) ++ n += snprintf(msg + n, len - n, "card:%d ", mem->card); ++ if (mem->validation_bits & CPER_MEM_VALID_MODULE) ++ n += snprintf(msg + n, len - n, "module:%d ", mem->module); ++ if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER) ++ n += snprintf(msg + n, len - n, "rank:%d ", mem->rank); ++ if (mem->validation_bits & CPER_MEM_VALID_BANK) ++ n += snprintf(msg + n, len - n, "bank:%d ", mem->bank); ++ if (mem->validation_bits & CPER_MEM_VALID_BANK_GROUP) ++ n += snprintf(msg + n, len - n, "bank_group:%d ", ++ mem->bank >> CPER_MEM_BANK_GROUP_SHIFT); ++ if (mem->validation_bits & CPER_MEM_VALID_BANK_ADDRESS) ++ n += snprintf(msg + n, len - n, "bank_address:%d ", ++ mem->bank & CPER_MEM_BANK_ADDRESS_MASK); ++ if (mem->validation_bits & CPER_MEM_VALID_DEVICE) ++ n += snprintf(msg + n, len - n, "device:%d ", mem->device); ++ if (mem->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) { ++ uint32_t row = mem->row; ++ ++ row |= cper_get_mem_extension(mem->validation_bits, mem->extended); ++ n += snprintf(msg + n, len - n, "row:%d ", row); ++ } ++ if (mem->validation_bits & CPER_MEM_VALID_COLUMN) ++ n += snprintf(msg + n, len - n, "column:%d ", mem->column); ++ if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION) ++ n += snprintf(msg + n, len - n, "bit_position:%d ", ++ mem->bit_pos); ++ if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) ++ n += snprintf(msg + n, len - n, "requestor_id:0x%016lx ", ++ mem->requestor_id); ++ if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID) ++ n += snprintf(msg + n, len - n, "responder_id:0x%016lx ", ++ mem->responder_id); ++ if 
/*
 * Describe the error-type code held in bits 15:8 of a memory error
 * section's error_status value.
 *
 * Returns a constant string; codes without an entry in the table below
 * yield "Reserved".
 */
static const char *cper_mem_err_status_str(uint64_t status)
{
	static const char * const descriptions[] = {
		[1]  = "Error detected internal to the component",
		[4]  = "Storage error in DRAM memory",
		[5]  = "Storage error in TLB",
		[6]  = "Storage error in cache",
		[7]  = "Error in one or more functional units",
		[8]  = "Component failed self test",
		[9]  = "Overflow or undervalue of internal queue",
		[16] = "Error detected in the bus",
		[17] = "Virtual address not found on IO-TLB or IO-PDIR",
		[18] = "Improper access error",
		[19] = "Access to a memory address which is not mapped to any component",
		[20] = "Loss of Lockstep",
		[21] = "Response not associated with a request",
		[22] = "Bus parity error - must also set the A, C, or D Bits",
		[23] = "Detection of a protocol error",
		[24] = "Detection of a PATH_ERROR",
		[25] = "Bus operation timeout",
		[26] = "A read was issued to data that has been poisoned",
	};
	const unsigned int code = (unsigned int)((status >> 8) & 0xff);
	const char *text = NULL;

	/* Gaps in the table are NULL and fall through to "Reserved". */
	if (code < sizeof(descriptions) / sizeof(descriptions[0]))
		text = descriptions[code];

	return text ? text : "Reserved";
}
++ ++ return n; ++} ++ ++static const char *cper_mem_err_type_str(unsigned int etype) ++{ ++ return etype < ARRAY_SIZE(mem_err_type_strs) ? ++ mem_err_type_strs[etype] : "unknown"; ++} ++ ++static void cper_print_mem_error(struct acpi_hest_generic_data *gdata) ++{ ++ struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); ++ struct ras_mc_event e = { 0 }; ++ char location[256]; ++ char other_detail[400]; ++ char msg[80]; ++ char *p; ++ const guid_t *fru_id = NULL; ++ int sec_sev; ++ char *fru_text = ""; ++ struct trace_seq s; ++ size_t len; ++ ++ sec_sev = ghes_severity(gdata->error_severity); ++ if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) ++ fru_id = (guid_t *)gdata->fru_id; ++ ++ if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) ++ fru_text = (char *)gdata->fru_text; ++ ++ cper_print_tstamp(e.timestamp, (struct acpi_hest_generic_data_v300 *)gdata); ++ if (!e.timestamp[0]) { ++ time_t now; ++ struct tm *tm; ++ ++ now = time(NULL); ++ ++ tm = localtime(&now); ++ if (tm) ++ strftime(e.timestamp, sizeof(e.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ } ++ ++ e.label = "unknown memory"; ++ e.severity = sec_sev; ++ e.error_count = 1; ++ e.grain = 1; ++ e.msg = msg; ++ e.driver_detail = other_detail; ++ e.top_layer = -1; ++ e.middle_layer = -1; ++ e.lower_layer = -1; ++ *other_detail = '\0'; ++ *msg = '\0'; ++ switch (sec_sev) { ++ case GHES_SEV_CORRECTED: ++ e.error_type = "Corrected"; ++ break; ++ case GHES_SEV_RECOVERABLE: ++ e.error_type = "Uncorrected"; ++ break; ++ case GHES_SEV_PANIC: ++ e.error_type = "Fatal"; ++ break; ++ default: ++ case GHES_SEV_NO: ++ e.error_type = "Info"; ++ } ++ ++ if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { ++ uint8_t etype = mem_err->error_type; ++ ++ p = msg; ++ p += snprintf(p, sizeof(msg), "%s", cper_mem_err_type_str(etype)); ++ } else { ++ strcpy(msg, "unknown error"); ++ } ++ e.msg = msg; ++ ++ if (mem_err->validation_bits & CPER_MEM_VALID_PA) ++ e.address = mem_err->physical_addr; ++ ++ if 
(mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) ++ e.grain = ~mem_err->physical_addr_mask + 1; ++ ++ p = location; ++ p += cper_mem_err_location(mem_err, p); ++ ++ len = p - location; ++ if (len) ++ location[len - 1] = '\0'; ++ ++ /* All other fields are mapped on e->other_detail */ ++ p = other_detail; ++ if (fru_id) ++ p += snprintf(p, 30, "fru_id: %pUl ", fru_id); ++ if (fru_text) ++ p += snprintf(p, 30, "fru_text: %.20s ", fru_text); ++ p += print_mem_error_other_detail(mem_err, p, location, 400); ++ ++ if (p > other_detail) ++ *(p - 1) = '\0'; ++ e.driver_detail = other_detail; ++ ++ trace_seq_init(&s); ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, BERT_MEM_EVENT_NAME); ++ ++ report_mc_event(&e, &s); ++ trace_seq_terminate(&s); ++ trace_seq_do_printf(&s); ++ printf("\n"); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++} ++ ++static int cper_severity_to_aer(int cper_severity) ++{ ++ switch (cper_severity) { ++ case CPER_SEV_RECOVERABLE: ++ return AER_NONFATAL; ++ case CPER_SEV_FATAL: ++ return AER_FATAL; ++ default: ++ return AER_CORRECTABLE; ++ } ++} ++ ++static void cper_print_pcie_aer(struct acpi_hest_generic_data *gdata) ++{ ++ struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata); ++ struct ras_aer_event e = { 0 }; ++ unsigned int devfn; ++ int aer_severity, tlp_header_valid = 0; ++ struct aer_capability_regs aer_info; ++ uint32_t status, mask; ++ struct trace_seq s; ++ char buf[80]; ++ uint32_t tlp_header[4]; ++ ++ if (!(pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID) || ++ !(pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO)) ++ return; ++ ++ cper_print_tstamp(e.timestamp, (struct acpi_hest_generic_data_v300 *)gdata); ++ if (!e.timestamp[0]) { ++ time_t now; ++ struct tm *tm; ++ ++ now = time(NULL); ++ ++ tm = localtime(&now); ++ if (tm) ++ strftime(e.timestamp, sizeof(e.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ } ++ ++ devfn = PCI_DEVFN(pcie_err->device_id.device, ++ 
pcie_err->device_id.function); ++ aer_severity = cper_severity_to_aer(gdata->error_severity); ++ ++ /* ++ * If firmware reset the component to contain ++ * the error, we must reinitialize it before ++ * use, so treat it as a fatal AER error. ++ */ ++ if (gdata->flags & CPER_SEC_RESET) ++ aer_severity = AER_FATAL; ++ ++ memcpy(&aer_info, pcie_err->aer_info, sizeof(struct aer_capability_regs)); ++ ++ snprintf(buf, 80, "%.4x:%.2x:%.2x.%x", ++ pcie_err->device_id.segment, ++ pcie_err->device_id.bus, ++ PCI_SLOT(devfn), ++ PCI_FUNC(devfn)); ++ e.dev_name = buf; ++ e.severity = aer_severity; ++ ++ if (aer_severity == AER_CORRECTABLE) { ++ status = aer_info.cor_status; ++ mask = aer_info.cor_mask; ++ } else { ++ status = aer_info.uncor_status; ++ mask = aer_info.uncor_mask; ++ tlp_header_valid = status & AER_LOG_TLP_MASKS; ++ } ++ ++ e.status = (status & ~mask); ++ e.tlp_header_valid = tlp_header_valid; ++ tlp_header[0] = aer_info.header_log.dw[0]; ++ tlp_header[1] = aer_info.header_log.dw[1]; ++ tlp_header[2] = aer_info.header_log.dw[2]; ++ tlp_header[3] = aer_info.header_log.dw[3]; ++ e.tlp_header = tlp_header; ++ ++ trace_seq_init(&s); ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, BERT_AER_EVENT_NAME); ++ ++ report_aer_event(&s, &e); ++ trace_seq_terminate(&s); ++ trace_seq_do_printf(&s); ++ printf("\n"); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++} ++ ++static void cper_print_bits(unsigned int bits, ++ const char * const strs[], unsigned int strs_size, struct trace_seq *s) ++{ ++ int i, len = 0; ++ const char *str; ++ char buf[84]; ++ ++ for (i = 0; i < strs_size; i++) { ++ if (!(bits & (1U << i))) ++ continue; ++ str = strs[i]; ++ if (!str) ++ continue; ++ if (!len) ++ len = snprintf(buf, sizeof(buf), "%s", str); ++ else ++ len += snprintf(buf+len, sizeof(buf)-len, ", %s", str); ++ } ++ if (len) ++ trace_seq_printf(s, "%s ", buf); ++} ++ ++static void cper_print_proc_generic(struct acpi_hest_generic_data *gdata) ++{ 
++ struct cper_sec_proc_generic *proc = acpi_hest_get_payload(gdata); ++ struct trace_seq s; ++ char timestamp[64]; ++ ++ trace_seq_init(&s); ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, BERT_AER_EVENT_NAME); ++ ++ cper_print_tstamp(timestamp, (struct acpi_hest_generic_data_v300 *)gdata); ++ if (!timestamp[0]) { ++ time_t now; ++ struct tm *tm; ++ ++ now = time(NULL); ++ ++ tm = localtime(&now); ++ if (tm) ++ strftime(timestamp, sizeof(timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ } ++ trace_seq_printf(&s, "%s ", timestamp); ++ ++ if (proc->validation_bits & CPER_PROC_VALID_TYPE) ++ trace_seq_printf(&s, "processor_type: %s ", proc->proc_type < ARRAY_SIZE(proc_type_strs) ? ++ proc_type_strs[proc->proc_type] : "unknown"); ++ if (proc->validation_bits & CPER_PROC_VALID_ISA) ++ trace_seq_printf(&s, "processor_isa: %s ", proc->proc_isa < ARRAY_SIZE(proc_isa_strs) ? ++ proc_isa_strs[proc->proc_isa] : "unknown"); ++ if (proc->validation_bits & CPER_PROC_VALID_ERROR_TYPE) { ++ trace_seq_printf(&s, "error_type: 0x%02x ", proc->proc_error_type); ++ cper_print_bits(proc->proc_error_type, ++ cper_proc_error_type_strs, ++ ARRAY_SIZE(cper_proc_error_type_strs), &s); ++ } ++ if (proc->validation_bits & CPER_PROC_VALID_OPERATION) ++ trace_seq_printf(&s, "operation: %s ", proc->operation < ARRAY_SIZE(proc_op_strs) ? 
++ proc_op_strs[proc->operation] : "unknown"); ++ if (proc->validation_bits & CPER_PROC_VALID_FLAGS) { ++ trace_seq_printf(&s, "flags: 0x%02x ", proc->flags); ++ cper_print_bits(proc->flags, proc_flag_strs, ++ ARRAY_SIZE(proc_flag_strs), &s); ++ } ++ if (proc->validation_bits & CPER_PROC_VALID_LEVEL) ++ trace_seq_printf(&s, "level: %d ", proc->level); ++ if (proc->validation_bits & CPER_PROC_VALID_VERSION) ++ trace_seq_printf(&s, "version_info: 0x%016lx ", proc->cpu_version); ++ if (proc->validation_bits & CPER_PROC_VALID_ID) ++ trace_seq_printf(&s, "processor_id: 0x%016lx ", proc->proc_id); ++ if (proc->validation_bits & CPER_PROC_VALID_TARGET_ADDRESS) ++ trace_seq_printf(&s, "target_address: 0x%016lx ", proc->target_addr); ++ if (proc->validation_bits & CPER_PROC_VALID_REQUESTOR_ID) ++ trace_seq_printf(&s, "requestor_id: 0x%016lx ", proc->requestor_id); ++ if (proc->validation_bits & CPER_PROC_VALID_RESPONDER_ID) ++ trace_seq_printf(&s, "responder_id: 0x%016lx ", proc->responder_id); ++ if (proc->validation_bits & CPER_PROC_VALID_IP) ++ trace_seq_printf(&s, "IP: 0x%016lx ", proc->ip); ++} ++ ++static void ++cper_estatus_print_section(struct acpi_hest_generic_data *gdata) ++{ ++ guid_t *sec_type; ++ ++ sec_type = (guid_t *)gdata->section_type; ++ if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { ++ cper_print_mem_error(gdata); ++ } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { ++ cper_print_pcie_aer(gdata); ++ } else if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) { ++ cper_print_proc_generic(gdata); ++ } else { ++ // TODO ++ } ++} ++ ++static void bert_print_all(struct acpi_hest_generic_status *region, ++ unsigned int region_len) ++{ ++ struct acpi_hest_generic_status *estatus = region; ++ int remain = region_len; ++ uint32_t estatus_len; ++ ++ while (remain >= sizeof(struct acpi_hest_generic_status)) { ++ struct acpi_hest_generic_data *gdata; ++ ++ estatus_len = cper_estatus_len(estatus); ++ if (remain < estatus_len) { ++ log(SYSLOG, LOG_ERR, "Truncated 
status block (length: %u).\n", ++ estatus_len); ++ break; ++ } ++ ++ if (cper_estatus_check(estatus)) { ++ log(SYSLOG, LOG_ERR, "Invalid error record.\n"); ++ break; ++ } ++ ++ apei_estatus_for_each_section(estatus, gdata) { ++ cper_estatus_print_section(gdata); ++ } ++ ++ estatus = (void *)estatus + estatus_len; ++ remain -= estatus_len; ++ } ++} ++ ++static void handle_bert_cper(void) ++{ ++ FILE *bert_file, *bert_data; ++ struct acpi_table_bert bert_table; ++ uint32_t bert_length; ++ void *bert_buffer; ++ ++ bert_file = fopen(BERT_TABLE, "rb"); ++ if (!bert_file) { ++ log(SYSLOG, LOG_ERR, "Failed to open BERT table file"); ++ return; ++ } ++ ++ if (fread(&bert_table, sizeof(bert_table), 1, bert_file) != 1) { ++ log(SYSLOG, LOG_ERR, "Failed to read BERT header\n"); ++ goto free_file; ++ } ++ ++ bert_length = bert_table.region_length; ++ ++ log(SYSLOG, LOG_INFO, "Find BERT table address: %#lx, length %d\n", ++ bert_table.address, bert_table.region_length); ++ bert_data = fopen(BERT_DATA, "rb"); ++ if (!bert_data){ ++ log(SYSLOG, LOG_ERR, "Failed to open data file\n"); ++ goto free_file; ++ } ++ ++ bert_buffer = malloc(bert_length); ++ if (!bert_buffer) { ++ log(SYSLOG, LOG_ERR, "Memory allocation failed\n"); ++ goto free_data; ++ } ++ ++ if (fread(bert_buffer, 1, bert_length, bert_data) != bert_length) { ++ log(SYSLOG, LOG_ERR, "Failed to read full region from data file\n"); ++ goto free_buf; ++ } ++ ++ bert_print_all(bert_buffer, bert_length); ++ ++free_buf: ++ free(bert_buffer); ++free_data: ++ fclose(bert_data); ++free_file: ++ fclose(bert_file); ++ ++ return; ++} ++ ++void handle_bert(void) ++{ ++ if (!bert_exist()) ++ return; ++ ++ if (bert_handled()) ++ return; ++ ++ handle_bert_cper(); ++ ++ bert_handled_tag(); ++} +\ No newline at end of file +diff --git a/ras-bert.h b/ras-bert.h +new file mode 100644 +index 0000000..32d247a +--- /dev/null ++++ b/ras-bert.h +@@ -0,0 +1,263 @@ ++#include ++#include ++#include ++#include "cper.h" ++#include "ras-events.h" ++ 
++/********************** ACPI definitions ***********************/ ++ ++#define BERT_MEM_EVENT_NAME "bert_mc_event" ++#define BERT_AER_EVENT_NAME "bert_aer_event" ++#define BERT_CPU_EVENT_NAME "bert_cpu_event" ++ ++#pragma pack(1) ++struct acpi_hest_generic_status { ++ uint32_t block_status; ++ uint32_t raw_data_offset; ++ uint32_t raw_data_length; ++ uint32_t data_length; ++ uint32_t error_severity; ++}; ++ ++struct acpi_hest_generic_data { ++ uint8_t section_type[16]; ++ uint32_t error_severity; ++ uint16_t revision; ++ uint8_t validation_bits; ++ uint8_t flags; ++ uint32_t error_data_length; ++ uint8_t fru_id[16]; ++ uint8_t fru_text[20]; ++}; ++ ++#define ACPI_HEST_GEN_VALID_TIMESTAMP (1<<2) ++struct acpi_hest_generic_data_v300 { ++ uint8_t section_type[16]; ++ uint32_t error_severity; ++ uint16_t revision; ++ uint8_t validation_bits; ++ uint8_t flags; ++ uint32_t error_data_length; ++ uint8_t fru_id[16]; ++ uint8_t fru_text[20]; ++ uint64_t time_stamp; ++}; ++ ++#define ACPI_NAMESEG_SIZE 4 /* Fixed by ACPI spec */ ++#define ACPI_OEM_ID_SIZE 6 ++#define ACPI_OEM_TABLE_ID_SIZE 8 ++ ++struct acpi_table_header { ++ char signature[ACPI_NAMESEG_SIZE]; /* ASCII table signature */ ++ uint32_t length; /* Length of table in bytes, including this header */ ++ uint8_t revision; /* ACPI Specification minor version number */ ++ uint8_t checksum; /* To make sum of entire table == 0 */ ++ char oem_id[ACPI_OEM_ID_SIZE]; /* ASCII OEM identification */ ++ char oem_table_id[ACPI_OEM_TABLE_ID_SIZE]; /* ASCII OEM table identification */ ++ uint32_t oem_revision; /* OEM revision number */ ++ char asl_compiler_id[ACPI_NAMESEG_SIZE]; /* ASCII ASL compiler vendor ID */ ++ uint32_t asl_compiler_revision; /* ASL compiler version */ ++}; ++ ++struct acpi_table_bert { ++ struct acpi_table_header header; /* Common ACPI table header */ ++ uint32_t region_length; /* Length of the boot error region */ ++ uint64_t address; /* Physical address of the error region */ ++}; ++#pragma pack() ++ 
++static inline uint32_t cper_estatus_len(struct acpi_hest_generic_status *estatus) ++{ ++ if (estatus->raw_data_length) ++ return estatus->raw_data_offset + \ ++ estatus->raw_data_length; ++ else ++ return sizeof(*estatus) + estatus->data_length; ++} ++ ++static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata) ++{ ++ return gdata->revision >> 8; ++} ++ ++static inline int acpi_hest_get_error_length(struct acpi_hest_generic_data *gdata) ++{ ++ return ((struct acpi_hest_generic_data *)(gdata))->error_data_length; ++} ++ ++static inline int acpi_hest_get_size(struct acpi_hest_generic_data *gdata) ++{ ++ if (acpi_hest_get_version(gdata) >= 3) ++ return sizeof(struct acpi_hest_generic_data_v300); ++ ++ return sizeof(struct acpi_hest_generic_data); ++} ++ ++static inline int acpi_hest_get_record_size(struct acpi_hest_generic_data *gdata) ++{ ++ return (acpi_hest_get_size(gdata) + acpi_hest_get_error_length(gdata)); ++} ++ ++static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) ++{ ++ return (void *)(gdata) + acpi_hest_get_record_size(gdata); ++} ++ ++#define apei_estatus_for_each_section(estatus, section) \ ++ for (section = (struct acpi_hest_generic_data *)(estatus + 1); \ ++ (void *)section - (void *)(estatus + 1) < estatus->data_length; \ ++ section = acpi_hest_get_next(section)) ++ ++static inline int ghes_severity(int severity) ++{ ++ switch (severity) { ++ case CPER_SEV_INFORMATIONAL: ++ return GHES_SEV_NO; ++ case CPER_SEV_CORRECTED: ++ return GHES_SEV_CORRECTED; ++ case CPER_SEV_RECOVERABLE: ++ return GHES_SEV_RECOVERABLE; ++ case CPER_SEV_FATAL: ++ return GHES_SEV_PANIC; ++ default: ++ /* Unknown, go panic */ ++ return GHES_SEV_PANIC; ++ } ++} ++ ++static inline bool guid_equal(const guid_t *u1, const guid_t *u2) ++{ ++ return memcmp(u1, u2, sizeof(guid_t)) == 0; ++} ++ ++static inline void *acpi_hest_get_payload(struct acpi_hest_generic_data *gdata) ++{ ++ if (acpi_hest_get_version(gdata) >= 3) ++ return (void 
*)(((struct acpi_hest_generic_data_v300 *)(gdata)) + 1); ++ ++ return gdata + 1; ++} ++ ++/********************** Memory Decode definitions ***********************/ ++ ++static const char * const mem_err_type_strs[] = { ++ "unknown", ++ "no error", ++ "single-bit ECC", ++ "multi-bit ECC", ++ "single-symbol chipkill ECC", ++ "multi-symbol chipkill ECC", ++ "master abort", ++ "target abort", ++ "parity error", ++ "watchdog timeout", ++ "invalid address", ++ "mirror Broken", ++ "memory sparing", ++ "scrub corrected error", ++ "scrub uncorrected error", ++ "physical memory map-out event", ++}; ++ ++/********************** PCIE Decode definitions ***********************/ ++ ++#define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) ++#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) ++#define PCI_FUNC(devfn) ((devfn) & 0x07) ++/* ++ * AER and DPC capabilities TLP Logging register sizes (PCIe r6.2, sec 7.8.4 ++ * & 7.9.14). ++ */ ++#define PCIE_STD_NUM_TLP_HEADERLOG 4 ++#define PCIE_STD_MAX_TLP_PREFIXLOG 4 ++#define PCIE_STD_MAX_TLP_HEADERLOG (PCIE_STD_NUM_TLP_HEADERLOG + 10) ++ ++#define AER_NONFATAL 0 ++#define AER_FATAL 1 ++#define AER_CORRECTABLE 2 ++#define DPC_FATAL 3 ++ ++/* Advanced Error Reporting */ ++#define PCI_ERR_UNC_POISON_TLP 0x00001000 /* Poisoned TLP */ ++#define PCI_ERR_UNC_COMP_ABORT 0x00008000 /* Completer Abort */ ++#define PCI_ERR_UNC_UNX_COMP 0x00010000 /* Unexpected Completion */ ++#define PCI_ERR_UNC_MALF_TLP 0x00040000 /* Malformed TLP */ ++#define PCI_ERR_UNC_ECRC 0x00080000 /* ECRC Error Status */ ++#define PCI_ERR_UNC_UNSUP 0x00100000 /* Unsupported Request */ ++ ++#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ ++ PCI_ERR_UNC_ECRC| \ ++ PCI_ERR_UNC_UNSUP| \ ++ PCI_ERR_UNC_COMP_ABORT| \ ++ PCI_ERR_UNC_UNX_COMP| \ ++ PCI_ERR_UNC_MALF_TLP) ++ ++struct pcie_tlp_log { ++ union { ++ uint32_t dw[PCIE_STD_MAX_TLP_HEADERLOG]; ++ struct { ++ uint32_t _do_not_use[PCIE_STD_NUM_TLP_HEADERLOG]; ++ uint32_t 
prefix[PCIE_STD_MAX_TLP_PREFIXLOG]; ++ }; ++ }; ++ uint8_t header_len; /* Length of the Logged TLP Header in DWORDs */ ++ bool flit; /* TLP was logged when in Flit mode */ ++}; ++ ++struct aer_capability_regs { ++ uint32_t header; ++ uint32_t uncor_status; ++ uint32_t uncor_mask; ++ uint32_t uncor_severity; ++ uint32_t cor_status; ++ uint32_t cor_mask; ++ uint32_t cap_control; ++ struct pcie_tlp_log header_log; ++ uint32_t root_command; ++ uint32_t root_status; ++ uint16_t cor_err_source; ++ uint16_t uncor_err_source; ++}; ++ ++/********************** CPU Decode definitions ***********************/ ++ ++static const char * const proc_type_strs[] = { ++ "IA32/X64", ++ "IA64", ++ "ARM", ++ "RISCV", ++}; ++ ++static const char * const proc_isa_strs[] = { ++ "IA32", ++ "IA64", ++ "X64", ++ "ARM A32/T32", ++ "ARM A64", ++ "RISCV32", ++ "RISCV64", ++}; ++ ++static const char * const cper_proc_error_type_strs[] = { ++ "cache error", ++ "TLB error", ++ "bus error", ++ "micro-architectural error", ++}; ++ ++static const char * const proc_op_strs[] = { ++ "unknown or generic", ++ "data read", ++ "data write", ++ "instruction execution", ++}; ++ ++static const char * const proc_flag_strs[] = { ++ "restartable", ++ "precise IP", ++ "overflow", ++ "corrected", ++}; ++ ++/********************** BERT definitions ***********************/ ++void handle_bert(void); +\ No newline at end of file +diff --git a/ras-record.h b/ras-record.h +index d0230f7..4fa229e 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -56,6 +56,7 @@ struct ras_aer_event { + const char *error_type; + char *dev_name; + int severity; ++ unsigned long long status; + uint8_t tlp_header_valid; + uint32_t *tlp_header; + const char *msg; +diff --git a/rasdaemon.c b/rasdaemon.c +index 335c047..0c8a212 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -11,6 +11,7 @@ + #include + #include + ++#include "ras-bert.h" + #include "ras-erst.h" + #include "ras-events.h" + #include "ras-logger.h" +@@ -253,6 +254,15 @@ int 
main(int argc, char *argv[]) + else + handle_erst(); + #endif ++ ++#ifdef HAVE_BERT ++ if (choices_disable && strlen(choices_disable) != 0 && ++ strstr(choices_disable, "ras:bert")) ++ log(ALL, LOG_INFO, "Disabled ras:bert from config\n"); ++ else ++ handle_bert(); ++#endif ++ + if (getenv(PCIE_EDPC_ENABLE) && atoi(getenv(PCIE_EDPC_ENABLE))) + config_pcie_edpc(); + else +-- +2.43.5 + diff --git a/1032-config-update-config.patch b/1032-config-update-config.patch new file mode 100644 index 0000000..84f042e --- /dev/null +++ b/1032-config-update-config.patch @@ -0,0 +1,58 @@ +From 8dfa8f9a58a89e028e2e7ceb4f39976ed8e39d9f Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 15 Jul 2025 13:52:35 +0800 +Subject: [PATCH 2/3] config: update config + +- fix preun script +- update init script + +Signed-off-by: Ruidong Tian +--- + contrib/rasdaemon.init | 9 ++++++++- + misc/rasdaemon.spec.in | 6 ++++-- + 2 files changed, 12 insertions(+), 3 deletions(-) + +diff --git a/contrib/rasdaemon.init b/contrib/rasdaemon.init +index 5fde6c8..d42a87a 100644 +--- a/contrib/rasdaemon.init ++++ b/contrib/rasdaemon.init +@@ -18,11 +18,18 @@ case "$target" in + sed -i 's/^NVGPU_TRIGGER=.*/NVGPU_TRIGGER="nvgpu_reset_trigger"/g' ${ENV_PATH} + ;; + jituan) ++ sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH} ++ sed -i 's/^PAGE_CE_THRESHOLD=.*/PAGE_CE_THRESHOLD="10"/g' ${ENV_PATH} + sed -i 's/json_report,kmsg_monitor,//' ${ENV_PATH} + sed -i 's/^AMDGPU_MCA_ENABLED=.*/AMDGPU_MCA_ENABLED=1/g' ${ENV_PATH} ++ sed -i '/^Restart=always$/a \ ++ MemoryLimit=2048M\ ++ CPUShares=1024\ ++ CPUQuota=200%\ ++ Slice=infra.slice' /usr/lib/systemd/system/rasdaemon.service + exit 1 + ;; +- zhuanyou) ++ zhuanyou) + sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH} + sed -i 's/^PAGE_CE_THRESHOLD=.*/PAGE_CE_THRESHOLD="10"/g' ${ENV_PATH} + ;; +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index bf4cc4b..7498e05 100644 +--- a/misc/rasdaemon.spec.in ++++ 
#!/bin/sh

# Rewrite the ERST_PATH definition in ../ras-erst.h so that it points at the
# erst directory under the current working directory.
# '|' is the sed delimiter because both the old and the new path contain '/';
# $PWD is the shell's working directory (the original used $pwd, which is
# never set).
sed -i 's|#define ERST_PATH "/sys/fs/pstore/erst"|#define ERST_PATH "'"$PWD/erst"'"|g' ../ras-erst.h
Hardware Error Source: 1 ++[ 9426.540596] {1}[Hardware Error]: event severity: fatal ++[ 9426.540596] {1}[Hardware Error]: Error 0, type: fatal ++[ 9426.540597] {1}[Hardware Error]: fru_text: Card01, ChnA, DIMM0 ++[ 9426.540597] {1}[Hardware Error]: section_type: memory error ++[ 9426.540598] {1}[Hardware Error]: error_status: 0x0000000000000000 ++[ 9426.540598] {1}[Hardware Error]: physical_address: 0x0000004bc3b04840 ++[ 9426.540599] {1}[Hardware Error]: node: 0 card: 0 module: 0 rank: 0 bank: 3 row: 10782 column: 144 ++[ 9426.540599] {1}[Hardware Error]: DIMM location: not present. DMI handle: 0x0000 ++[ 9426.540600] Kernel panic - not syncing: Fatal hardware error! +diff --git a/test/erst/dmesg-erst-20240915182515 b/test/erst/dmesg-erst-20240915182515 +new file mode 100644 +index 0000000..9dba41a +--- /dev/null ++++ b/test/erst/dmesg-erst-20240915182515 +@@ -0,0 +1,64 @@ ++Panic#1 Part1 ++[ 895.433370] mce: [Hardware Error]: Machine check events logged ++[ 895.434040] {2}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0 ++[ 895.442943] {2}[Hardware Error]: It has been corrected by h/w and requires no further action ++[ 895.451988] {2}[Hardware Error]: event severity: corrected ++[ 895.457846] {2}[Hardware Error]: Error 0, type: corrected ++[ 895.463684] {2}[Hardware Error]: fru_text: Socket1 ++[ 895.468912] {2}[Hardware Error]: section_type: general processor error ++[ 895.475964] {2}[Hardware Error]: processor_type: 0, IA32/X64 ++[ 895.482139] {2}[Hardware Error]: processor_isa: 2, X64 ++[ 895.487817] {2}[Hardware Error]: error_type: 0x08 ++[ 895.493051] {2}[Hardware Error]: micro-architectural error ++[ 895.499071] {2}[Hardware Error]: operation: 0, unknown or generic ++[ 895.505696] {2}[Hardware Error]: version_info: 0x00000000000606a6 ++[ 895.512321] {2}[Hardware Error]: processor_id: 0x00000000000000a4 ++[ 904.593972] {3}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 ++[ 904.593973] {3}[Hardware 
Error]: event severity: fatal ++[ 904.593973] {3}[Hardware Error]: Error 0, type: fatal ++[ 904.593973] {3}[Hardware Error]: fru_text: Socket1 ++[ 904.593974] {3}[Hardware Error]: section_type: general processor error ++[ 904.593975] {3}[Hardware Error]: processor_type: 0, IA32/X64 ++[ 904.593975] {3}[Hardware Error]: processor_isa: 2, X64 ++[ 904.593975] {3}[Hardware Error]: error_type: 0x08 ++[ 904.593976] {3}[Hardware Error]: micro-architectural error ++[ 904.593976] {3}[Hardware Error]: operation: 0, unknown or generic ++[ 904.593976] {3}[Hardware Error]: version_info: 0x00000000000606a6 ++[ 904.593976] {3}[Hardware Error]: processor_id: 0x00000000000000a4 ++[ 904.593977] Kernel panic - not syncing: Fatal hardware error! ++[ 904.593978] Call Trace: ++[ 904.593978] ++[ 904.593978] dump_stack+0x66/0x8b ++[ 904.593978] panic+0x10b/0x268 ++[ 904.593978] ? __ghes_print_estatus+0x6c/0xb0 ++[ 904.593979] __ghes_panic+0x66/0x71 ++[ 904.593979] ghes_notify_nmi+0x3db/0x420 ++[ 904.593979] ? __intel_pmu_enable_all.constprop.35+0x47/0x80 ++[ 904.593979] ? native_apic_msr_write+0x27/0x30 ++[ 904.593980] ? intel_pmu_handle_irq+0xf5/0x2b0 ++[ 904.593980] ? 
nmi_handle+0x6c/0x110 ++[ 904.593980] nmi_handle+0x6c/0x110 ++[ 904.593980] default_do_nmi+0x4e/0x100 ++[ 904.593980] do_nmi+0x102/0x160 ++[ 904.593981] end_repeat_nmi+0x16/0x55 ++[ 904.593981] RIP: 0010:machine_check+0x0/0x40 ++[ 904.593982] Code: 00 00 48 89 e7 48 8b 74 24 78 48 c7 44 24 78 ff ff ff ff e8 82 cc 66 ff e9 5d 02 00 00 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 <0f> 01 ca 6a ff f6 44 24 10 03 75 14 e8 2f 00 00 00 48 89 e7 31 f6 ++[ 904.593982] RSP: 0018:fffffe000063bfd8 EFLAGS: 00000046 ++[ 904.593983] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001 ++[ 904.593983] RDX: 0000000000000000 RSI: ffffffff822e0de0 RDI: 0000000000000024 ++[ 904.593983] RBP: 0000000000000001 R08: 0000000000000002 R09: 0000000000022080 ++[ 904.593984] R10: 0000000000000018 R11: 00000000000002d2 R12: ffff893ee2244680 ++[ 904.593984] R13: 0000000000000001 R14: 0000000000000001 R15: ffffffff822e0de0 ++[ 904.593984] ? async_page_fault+0x30/0x30 ++[ 904.593984] ? async_page_fault+0x30/0x30 ++[ 904.593985] ++[ 904.593985] <#MC> ++[ 904.593985] RIP: 0010:intel_idle+0x82/0x130 ++[ 904.593986] Code: 04 25 c0 5b 01 00 48 89 d1 0f 01 c8 48 8b 00 a8 08 75 17 e9 07 00 00 00 0f 00 2d 95 40 5b 00 b9 01 00 00 00 48 89 d8 0f 01 c9 <65> 48 8b 04 25 c0 5b 01 00 f0 80 60 02 df f0 83 44 24 fc 00 48 8b ++[ 904.593986] RSP: 0018:ffffc90018c0fe50 EFLAGS: 00000046 ++[ 904.593986] ++[ 904.593986] cpuidle_enter_state+0x77/0x320 ++[ 904.593986] do_idle+0x1f6/0x270 ++[ 904.593987] cpu_startup_entry+0x6f/0x80 ++[ 904.593987] start_secondary+0x186/0x1e0 ++[ 904.593987] secondary_startup_64+0xa4/0xb0 +diff --git a/test/erst/dmesg-erst-20241010151212 b/test/erst/dmesg-erst-20241010151212 +new file mode 100644 +index 0000000..8d46fac +--- /dev/null ++++ b/test/erst/dmesg-erst-20241010151212 +@@ -0,0 +1,87 @@ ++Panic#1 Part1 ++[ 895.986336] diagnose-tools in diagnosis_exit ++[ 905.369492] {3}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 ++[ 905.369534] {3}[Hardware 
Error]: event severity: fatal ++[ 905.369534] {3}[Hardware Error]: Error 0, type: fatal ++[ 905.369534] {3}[Hardware Error]: section_type: PCIe error ++[ 905.369535] {3}[Hardware Error]: port_type: 0, PCIe end point ++[ 905.369535] {3}[Hardware Error]: version: 3.0 ++[ 905.369536] {3}[Hardware Error]: command: 0x0546, status: 0xc110 ++[ 905.369536] {3}[Hardware Error]: device_id: 0000:b1:00.0 ++[ 905.369536] {3}[Hardware Error]: slot: 0 ++[ 905.369536] {3}[Hardware Error]: secondary_bus: 0x00 ++[ 905.369537] {3}[Hardware Error]: vendor_id: 0x144d, device_id: 0xa80a ++[ 905.369537] {3}[Hardware Error]: class_code: 010802 ++[ 905.369538] {3}[Hardware Error]: aer_uncor_status: 0x00001000, aer_uncor_mask: 0x00104000 ++[ 905.369538] {3}[Hardware Error]: aer_uncor_severity: 0x00463030 ++[ 905.369538] {3}[Hardware Error]: TLP Header: 4a004040 00000100 b1000400 d1010000 ++[ 905.369539] Kernel panic - not syncing: Fatal hardware error! ++[ 905.369540] Call Trace: ++[ 905.369540] ++[ 905.369540] dump_stack+0x57/0x6a ++[ 905.369541] panic+0x11d/0x2e7 ++[ 905.369541] sdei_api_restore_ras+0x26/0x26 ++[ 905.369541] ghes_in_nmi_queue_one_entry.constprop.0+0x1fd/0x2c0 ++[ 905.369541] ghes_notify_nmi.part.0+0x38/0xa0 ++[ 905.369541] nmi_handle+0x52/0xf0 ++[ 905.369542] default_do_nmi+0x42/0x130 ++[ 905.369542] exc_nmi+0x11f/0x150 ++[ 905.369542] end_repeat_nmi+0x16/0x55 ++[ 905.369542] RIP: 0010:mwait_idle_with_hints+0x49/0x90 ++[ 905.369543] Code: 89 d1 65 48 8b 04 25 80 ef 01 00 0f 01 c8 48 8b 00 a8 08 75 15 e9 07 00 00 00 0f 00 2d 70 57 fb 00 48 89 f8 48 89 f1 0f 01 c9 <65> 48 8b 04 25 80 ef 01 00 f0 80 60 02 df f0 83 44 24 fc 00 48 8b ++[ 905.369543] RSP: 0018:ffffffff9ea03e50 EFLAGS: 00000046 ++[ 905.369544] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001 ++[ 905.369544] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 ++[ 905.369544] RBP: ffff936a85399800 R08: ffffffff9f1eea60 R09: 0000000000000001 ++[ 905.369545] R10: ffff93687f2324e4 R11: 
0000000000000dcf R12: 0000000000000001 ++[ 905.369545] R13: ffffffff9f1eeae0 R14: 0000000000000001 R15: 0000000000000000 ++[ 905.369545] ? mwait_idle_with_hints+0x49/0x90 ++[ 905.369545] ? mwait_idle_with_hints+0x49/0x90 ++[ 905.369545] ++[ 905.369546] acpi_idle_enter+0x7d/0xb0 ++[ 905.369546] cpuidle_enter_state+0x84/0x330 ++[ 905.369546] cpuidle_enter+0x29/0x40 ++[ 905.369546] cpuidle_idle_call+0x128/0x1b0 ++[ 905.369546] do_idle+0x72/0xd0 ++[ 905.369547] cpu_startup_entry+0x19/0x20 ++[ 905.369547] start_kernel+0x44c/0x46b ++[ 905.369547] secondary_startup_64_no_verify+0xb0/0xbb ++[ 905.369547] Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ++[ 905.369548] unchecked MSR access error: WRMSR to 0x83f (tried to write 0x00000000000000f6) at rIP: 0xffffffff9d060503 (native_apic_msr_write+0x23/0x40) ++[ 905.369548] Call Trace: ++[ 905.369548] ++[ 905.369548] arch_irq_work_raise+0x20/0x30 ++[ 905.369549] irq_work_queue+0x26/0x30 ++[ 905.369549] vprintk_emit+0xd4/0x120 ++[ 905.369549] vprintk_deferred+0x1b/0x50 ++[ 905.369549] printk_deferred+0x58/0x6f ++[ 905.369549] printk_safe_flush_buffer.cold+0x3b/0x70 ++[ 905.369550] __printk_safe_flush+0x4f/0xb0 ++[ 905.369550] printk_safe_flush+0x37/0x70 ++[ 905.369550] panic+0x162/0x2e7 ++[ 905.369551] sdei_api_restore_ras+0x26/0x26 ++[ 905.369551] ghes_in_nmi_queue_one_entry.constprop.0+0x1fd/0x2c0 ++[ 905.369551] ghes_notify_nmi.part.0+0x38/0xa0 ++[ 905.369552] nmi_handle+0x52/0xf0 ++[ 905.369552] default_do_nmi+0x42/0x130 ++[ 905.369553] exc_nmi+0x11f/0x150 ++[ 905.369553] end_repeat_nmi+0x16/0x55 ++[ 905.369553] RIP: 0010:mwait_idle_with_hints+0x49/0x90 ++[ 905.369553] Code: 89 d1 65 48 8b 04 25 80 ef 01 00 0f 01 c8 48 8b 00 a8 08 75 15 e9 07 00 00 00 0f 00 2d 70 57 fb 00 48 89 f8 48 89 f1 0f 01 c9 <65> 48 8b 04 25 80 ef 01 00 f0 80 60 02 df f0 83 44 24 fc 00 48 8b ++[ 905.369554] RSP: 0018:ffffffff9ea03e50 EFLAGS: 00000046 ++[ 905.369554] RAX: 0000000000000000 
RBX: 0000000000000001 RCX: 0000000000000001 ++[ 905.369555] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 ++[ 905.369555] RBP: ffff936a85399800 R08: ffffffff9f1eea60 R09: 0000000000000001 ++[ 905.369555] R10: ffff93687f2324e4 R11: 0000000000000dcf R12: 0000000000000001 ++[ 905.369555] R13: ffffffff9f1eeae0 R14: 0000000000000001 R15: 0000000000000000 ++[ 905.369556] ? mwait_idle_with_hints+0x49/0x90 ++[ 905.369556] ? mwait_idle_with_hints+0x49/0x90 ++[ 905.369556] ++[ 905.369557] acpi_idle_enter+0x7d/0xb0 ++[ 905.369557] cpuidle_enter_state+0x84/0x330 ++[ 905.369557] cpuidle_enter+0x29/0x40 ++[ 905.369557] cpuidle_idle_call+0x128/0x1b0 ++[ 905.369557] do_idle+0x72/0xd0 ++[ 905.369558] cpu_startup_entry+0x19/0x20 ++[ 905.369558] start_kernel+0x44c/0x46b ++[ 905.369558] secondary_startup_64_no_verify+0xb0/0xbb + +-- +2.43.5 + diff --git a/1034-ext-fix-pcihp-filter.patch b/1034-ext-fix-pcihp-filter.patch new file mode 100644 index 0000000..8602938 --- /dev/null +++ b/1034-ext-fix-pcihp-filter.patch @@ -0,0 +1,26 @@ +From 7a375d37256cb851e99d3b2dfc8c78c9f05485ed Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 15 Jul 2025 19:00:17 +0800 +Subject: [PATCH] ext: fix pcihp filter + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.syslog-ng-ext.in | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/misc/rasdaemon.syslog-ng-ext.in b/misc/rasdaemon.syslog-ng-ext.in +index ad001d2..15e67c4 100644 +--- a/misc/rasdaemon.syslog-ng-ext.in ++++ b/misc/rasdaemon.syslog-ng-ext.in +@@ -38,7 +38,7 @@ rewrite r_cmcistorm { + + filter f_pciehp { + facility(kern) and +- match("pciehp: Slot" value("MESSAGE")); ++ match("pciehp: Slot\\(" value("MESSAGE")); + }; + + rewrite r_pciehp { +-- +2.43.5 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 95e0501..54eb512 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,4 +1,4 @@ -%define anolis_release .0.1 +%define anolis_release .1 Name: rasdaemon Version: 0.8.3 Release: 
1%{anolis_release}%{?dist} @@ -35,6 +35,11 @@ Patch1026: 1026-anolis-fix-systemd-config.patch Patch1027: 1027-anolis-add-nvgpu-driver.patch Patch1028: 1028-anolis-add-trigger-for-nvgpu-event.patch Patch1029: 1029-anolis-add-nvgpu-reset-trigger.patch +Patch1030: 1030-rasdaemon-split-report-function.patch +Patch1031: 1031-rasdaemon-support-BERT-decode.patch +Patch1032: 1032-config-update-config.patch +Patch1033: 1033-test-add-testsuite.patch +Patch1034: 1034-ext-fix-pcihp-filter.patch ExcludeArch: s390 s390x BuildRequires: make @@ -108,6 +113,11 @@ mv open-gpu-kernel-modules-* open-gpu-kernel-modules %patch1027 -p1 %patch1028 -p1 %patch1029 -p1 +%patch1030 -p1 +%patch1031 -p1 +%patch1032 -p1 +%patch1033 -p1 +%patch1034 -p1 autoreconf -vfi @@ -188,8 +198,10 @@ systemctl daemon-reload systemctl restart %{name}.service %preun -systemctl stop %{name}.service -systemctl disable %{name}.service +if [ $1 -eq 0 ] ; then + systemctl stop %{name}.service + systemctl disable %{name}.service +fi %postun if systemctl is-active --quiet syslog-ng.service; then @@ -209,6 +221,10 @@ if [ -d "%{_sysconfdir}/logrotate.d" ]; then fi %changelog +* Tue Jul 15 2025 Ruidong Tian - 0.8.3-1.1 +- support bert decode +- update rasdaemon config + * Thu Mar 20 2025 wangzhe - 0.8.3-1.0.1 - update to 0.8.3 - support mc event stat -- Gitee