From f9eb9d8c579d787162447a2ef8e540f25c25454b Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Wed, 25 May 2022 17:15:14 +0100 Subject: [PATCH] rasdaemon: Update with the latest patches for the CPU fault isolation, Hisilicon Kunpeng9xx common error records and improvements in the ras-mc-ctl for the Hisilicon Kunpeng9xx errors Update with the latest patches for the 1. CPU online fault isolation for arm event. 2. Modify recording Hisilicon common error data in the rasdaemon 3. In the ras-mc-ctl, 3.1. Improve Hisilicon common error statistics. 3.2. Add support to display the HiSilicon vendor-errors for a specified module. 3.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options. 3.4. Reformat error info of the HiSilicon Kunpeng920. 3.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. 3.6. Updated the HiSilicon platform name as KunPeng9xx. 4. Fixed a memory out-of-bounds issue in the rasdaemon. Signed-off-by: Shiju Jose --- ...t-cpu-fault-isolation-for-corrected-.patch | 174 +++++++++++------- ...t-cpu-fault-isolation-for-recoverabl.patch | 52 ++++-- ...-recording-Hisilicon-common-error-da.patch | 20 +- ...-ctl-Modify-error-statistics-for-HiS.patch | 18 +- ...-ctl-Reformat-error-info-of-the-HiSi.patch | 12 +- ...-ctl-Add-printing-usage-if-necessary.patch | 15 +- ...-ctl-Add-support-to-display-the-HiSi.patch | 112 +++++++++-- ...-ctl-Relocate-reading-and-display-Ku.patch | 44 ++--- ...-ctl-Updated-HiSilicon-platform-name.patch | 127 +++++++++++++ ...r-a-memory-out-of-bounds-issue-and-o.patch | 90 +++++++++ rasdaemon.spec | 39 +++- 11 files changed, 533 insertions(+), 170 deletions(-) rename 0001-Support-cpu-fault-isolation-for-corrected-errors.patch => 0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch (86%) rename 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch => 0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch (69%) rename 0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch => 0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch (93%) rename 0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch => 0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch (90%) rename 0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch => 0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch (88%) rename 0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch => 0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch (55%) rename 0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch => 0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch (70%) rename 0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch => 0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch (83%) create mode 100644 0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch create mode 100644 0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch diff --git a/0001-Support-cpu-fault-isolation-for-corrected-errors.patch b/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch similarity index 86% rename from 0001-Support-cpu-fault-isolation-for-corrected-errors.patch rename to 0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch index d5460de..d17fb21 100644 --- a/0001-Support-cpu-fault-isolation-for-corrected-errors.patch +++ b/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch @@ -1,34 +1,38 @@ -From a8e02e7d3d910eb7d049fd4126d53b8d3121d798 Mon Sep 17 00:00:00 2001 +From b9999d40d73dfff8b1cfb515f3b81b2c2891f6a7 Mon Sep 17 00:00:00 2001 From: Shengwei Luo Date: Wed, 23 Feb 2022 17:21:58 +0800 -Subject: [PATCH 1/2] Support cpu fault isolation for corrected errors +Subject: [PATCH 01/10] rasdaemon: Support cpu fault isolation for corrected + errors When the corrected errors exceed the set limit in cycle, try to offline the related cpu core. Signed-off-by: Shengwei Luo +Signed-off-by: Junchong Pan +Signed-off-by: Lei Feng +Signed-off-by: Shiju Jose --- Makefile.am | 6 +- configure.ac | 11 ++ misc/rasdaemon.env | 17 ++ - queue.c | 121 ++++++++++++++ + queue.c | 119 ++++++++++++++ queue.h | 39 +++++ - ras-arm-handler.c | 84 ++++++++++ - ras-arm-handler.h | 18 +++ - ras-cpu-isolation.c | 378 ++++++++++++++++++++++++++++++++++++++++++++ + ras-arm-handler.c | 97 +++++++++++ + ras-arm-handler.h | 18 ++ + ras-cpu-isolation.c | 388 ++++++++++++++++++++++++++++++++++++++++++++ ras-cpu-isolation.h | 68 ++++++++ ras-events.c | 9 +- - 10 files changed, 749 insertions(+), 2 deletions(-) + 10 files changed, 770 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c create mode 100644 ras-cpu-isolation.h diff --git a/Makefile.am b/Makefile.am -index fabca78..242ceb7 100644 +index a322b9a..36e7d4e 100644 --- a/Makefile.am +++ b/Makefile.am -@@ -63,13 +63,17 @@ endif +@@ -69,13 +69,17 @@ endif if WITH_AMP_NS_DECODE rasdaemon_SOURCES += non-standard-ampere.c endif @@ -48,7 +52,7 @@ index fabca78..242ceb7 100644 # This rule can't be called with more than one Makefile job (like make -j8) # I can't figure out a way to fix that diff --git a/configure.ac b/configure.ac -index 33b81fe..d098fcf 100644 +index a77991f..e0ed751 100644 --- a/configure.ac +++ b/configure.ac @@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], @@ -102,10 +106,10 @@ index 12fd766..7cb18e8 100644 \ No newline at end of file diff --git a/queue.c b/queue.c new file mode 100644 -index 0000000..ed66798 +index 0000000..65b6fb8 --- /dev/null +++ b/queue.c -@@ -0,0 +1,121 @@ +@@ -0,0 +1,119 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -137,7 +141,6 @@ index 0000000..ed66798 + struct link_queue *queue = NULL; + + queue = (struct link_queue *)malloc(sizeof(struct link_queue)); -+ + if (queue == NULL) { + log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); + return NULL; @@ -218,7 +221,6 @@ index 0000000..ed66798 + struct queue_node *node = NULL; + + node = (struct queue_node *)malloc(sizeof(struct queue_node)); -+ + if (node != NULL) { + node->time = time; + node->value = value; @@ -273,7 +275,7 @@ index 0000000..5459f40 + +#endif diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 1149dc6..c9ef2fd 100644 +index 1149dc6..9c7a3c3 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -22,6 +22,10 @@ @@ -287,7 +289,7 @@ index 1149dc6..c9ef2fd 100644 void display_raw_data(struct trace_seq *s, const uint8_t *buf, -@@ -42,6 +46,44 @@ void display_raw_data(struct trace_seq *s, +@@ -42,6 +46,93 @@ void display_raw_data(struct trace_seq *s, } } @@ -327,18 +329,14 @@ index 1149dc6..c9ef2fd 100644 + log(TERM, LOG_INFO, "%d error in cpu core catched\n", num); + return num; +} -+#endif + - int ras_arm_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context) -@@ -139,6 +181,48 @@ int ras_arm_event_handler(struct trace_seq *s, - display_raw_data(s, ev.vsei_error, ev.oem_len); - #endif - -+#ifdef HAVE_CPU_FAULT_ISOLATION ++static int ras_handle_cpu_error(struct trace_seq *s, ++ struct pevent_record *record, ++ struct event_format *event, ++ struct ras_arm_event *ev, time_t now) ++{ ++ unsigned long long val; + int cpu; -+ int nums; + char *severity; + struct error_info err_info; + @@ -368,7 +366,8 @@ index 1149dc6..c9ef2fd 100644 + trace_seq_printf(s, "\n severity: %s", severity); + + if (val == GHES_SEV_CORRECTED) { -+ nums = count_errors(&ev); ++ int nums = count_errors(ev); ++ + if (nums > 0) { + err_info.nums = nums; + err_info.time = now; @@ -376,6 +375,29 @@ index 1149dc6..c9ef2fd 100644 + ras_record_cpu_error(&err_info, cpu); + } + } ++ ++ return 0; ++} ++#endif ++ + int ras_arm_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context) +@@ -52,6 +143,7 @@ int ras_arm_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_arm_event ev; + int len = 0; ++ + memset(&ev, 0, sizeof(ev)); + + /* +@@ -139,6 +231,11 @@ int ras_arm_event_handler(struct trace_seq *s, + display_raw_data(s, ev.vsei_error, ev.oem_len); + #endif + ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ if (ras_handle_cpu_error(s, record, event, &ev, now) < 0) ++ return -1; +#endif + /* Insert data into the SGBD */ @@ -412,10 +434,10 @@ index 563a2d3..52813e7 100644 struct event_format *event, void *context); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..8c0cdf9 +index 0000000..abcf451 --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,378 @@ +@@ -0,0 +1,388 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -441,6 +463,16 @@ index 0000000..8c0cdf9 +#include "ras-logger.h" +#include "ras-cpu-isolation.h" + ++#define SECOND_OF_MON (30 * 24 * 60 * 60) ++#define SECOND_OF_DAY (24 * 60 * 60) ++#define SECOND_OF_HOU (60 * 60) ++#define SECOND_OF_MIN (60) ++ ++#define LIMIT_OF_CPU_THRESHOLD 10000 ++#define INIT_OF_CPU_THRESHOLD 18 ++#define DEC_CHECK 10 ++#define LAST_BIT_OF_UL 5 ++ +static struct cpu_info *cpu_infos; +static unsigned int ncores; +static unsigned int enabled = 1; @@ -452,9 +484,9 @@ index 0000000..8c0cdf9 +}; + +static const struct param cycle_units[] = { -+ {"d", 24 * 60 * 60}, -+ {"h", 60 * 60}, -+ {"m", 60}, ++ {"d", SECOND_OF_DAY}, ++ {"h", SECOND_OF_HOU}, ++ {"m", SECOND_OF_MIN}, + {"s", 1}, + {} +}; @@ -462,8 +494,8 @@ index 0000000..8c0cdf9 +static struct isolation_param threshold = { + .name = "CPU_CE_THRESHOLD", + .units = normal_units, -+ .value = 18, -+ .limit = 10000 ++ .value = INIT_OF_CPU_THRESHOLD, ++ .limit = LIMIT_OF_CPU_THRESHOLD +}; + +static struct isolation_param cpu_limit = { @@ -474,8 +506,8 @@ index 0000000..8c0cdf9 +static struct isolation_param cycle = { + .name = "CPU_ISOLATION_CYCLE", + .units = cycle_units, -+ .value = 24 * 60 * 60, -+ .limit = 30 * 24 * 60 * 60 ++ .value = SECOND_OF_DAY, ++ .limit = SECOND_OF_MON +}; + +static const char * const cpu_state[] = { @@ -488,13 +520,17 @@ index 0000000..8c0cdf9 +static int open_sys_file(unsigned int cpu, int __oflag, const char *format) +{ + int fd; -+ char buf[MAX_PATH_LEN] = ""; -+ -+ snprintf(buf, sizeof(buf), format, cpu); -+ fd = open(buf, __oflag); ++ char path[MAX_PATH_LEN + 1] = ""; ++ char real_path[MAX_PATH_LEN + 1] = ""; + ++ snprintf(path, sizeof(path), format, cpu); ++ if (strlen(path) > MAX_PATH_LEN || realpath(path, real_path) == NULL) { ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path); ++ return -1; ++ } ++ fd = open(real_path, __oflag); + if (fd == -1) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf); ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, real_path); + return -1; + } + @@ -522,7 +558,6 @@ index 0000000..8c0cdf9 +{ + ncores = cpus; + cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus); -+ + if (!cpu_infos) { + log(TERM, LOG_ERR, + "Failed to allocate memory for cpu infos in %s.\n", __func__); @@ -576,34 +611,33 @@ index 0000000..8c0cdf9 + + for (int i = 0; i < env_size; ++i) { + if (isdigit(env[i])) { -+ if (*value > ULONG_MAX / 10 || -+ (*value == ULONG_MAX / 10 && env[i] - '0' > 5)) { ++ if (*value > ULONG_MAX / DEC_CHECK || ++ (*value == ULONG_MAX / DEC_CHECK && env[i] - '0' > LAST_BIT_OF_UL)) { + log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); + return -1; + } -+ *value = 10 * (*value) + (env[i] - '0'); ++ *value = DEC_CHECK * (*value) + (env[i] - '0'); + } else + return -1; + } + -+ if (has_unit) { -+ for (const struct param *units = config->units; units->name; units++) { -+ /* value character and unit character are both valid */ -+ if (!strcasecmp(unit, units->name)) { -+ if (*value > (ULONG_MAX / units->value)) { -+ log(TERM, LOG_ERR, -+ "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = (*value) * units->value; -+ return 0; ++ if (!has_unit) ++ return 0; ++ ++ for (const struct param *units = config->units; units->name; units++) { ++ /* value character and unit character are both valid */ ++ if (!strcasecmp(unit, units->name)) { ++ if (*value > (ULONG_MAX / units->value)) { ++ log(TERM, LOG_ERR, ++ "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; + } ++ *value = (*value) * units->value; ++ return 0; + } -+ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); -+ return -1; + } -+ -+ return 0; ++ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); ++ return -1; +} + +static void init_config(struct isolation_param *config) @@ -612,7 +646,7 @@ index 0000000..8c0cdf9 + unsigned long value = 0; + + if (parse_ul_config(config, env, &value) < 0) { -+ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %ld.\n", ++ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %lu.\n", + config->name, env, config->value); + return; + } @@ -667,9 +701,8 @@ index 0000000..8c0cdf9 + + strcpy(buf, "0"); + rc = write(fd, buf, strlen(buf)); -+ + if (rc < 0) { -+ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); ++ log(TERM, LOG_ERR, "cpu%u offline failed, errno:%d\n", cpu, errno); + close(fd); + return HANDLE_FAILED; + } @@ -706,7 +739,7 @@ index 0000000..8c0cdf9 + + if (cpu_infos[cpu].ce_nums >= threshold.value) { + log(TERM, LOG_INFO, -+ "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", ++ "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n", + threshold.value, cpu); + return do_cpu_offline(cpu); + } @@ -757,7 +790,7 @@ index 0000000..8c0cdf9 + + if (cpu >= ncores || cpu < 0) { + log(TERM, LOG_ERR, -+ "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); ++ "The current cpu %d has exceed the total number of cpu:%u\n", cpu, ncores); + return; + } + @@ -782,7 +815,6 @@ index 0000000..8c0cdf9 + } + + ret = error_handler(cpu, err_info); -+ + if (ret == HANDLE_NOTHING) + log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); + else if (ret == HANDLE_SUCCEED) { @@ -869,10 +901,10 @@ index 0000000..1159853 + +#endif diff --git a/ras-events.c b/ras-events.c -index ba769d1..491c17a 100644 +index 39cab20..beda655 100644 --- a/ras-events.c +++ b/ras-events.c -@@ -41,6 +41,7 @@ +@@ -42,6 +42,7 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-page-isolation.h" @@ -880,7 +912,7 @@ index ba769d1..491c17a 100644 /* * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -879,6 +880,10 @@ int handle_ras_events(int record_events) +@@ -856,6 +857,10 @@ int handle_ras_events(int record_events) cpus = get_num_cpus(ras); @@ -891,7 +923,7 @@ index ba769d1..491c17a 100644 #ifdef HAVE_MCE rc = register_mce_handler(ras, cpus); if (rc) -@@ -1005,6 +1010,8 @@ err: +@@ -982,6 +987,8 @@ err: } free(ras); } @@ -902,5 +934,5 @@ index ba769d1..491c17a 100644 return rc; } -- -2.27.0 +2.25.1 diff --git a/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch b/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch similarity index 69% rename from 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch rename to 0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch index aa1b251..e401fa9 100644 --- a/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch +++ b/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch @@ -1,23 +1,35 @@ -From e0101e59c6887a98d3a5a1b622c75f5307e8ec19 Mon Sep 17 00:00:00 2001 +From fefa2d689f96302e64ad2375695703039e2ca951 Mon Sep 17 00:00:00 2001 From: Shengwei Luo Date: Wed, 23 Feb 2022 17:23:27 +0800 -Subject: [PATCH 2/2] Support cpu fault isolation for recoverable errors +Subject: [PATCH 02/10] rasdaemon: Support cpu fault isolation for recoverable + errors When the recoverable errors in cpu core occurred, try to offline the related cpu core. Signed-off-by: Shengwei Luo +Signed-off-by: Junchong Pan +Signed-off-by: Lei Feng +Signed-off-by: Shiju Jose --- - ras-arm-handler.c | 21 ++++++++++++++++++--- + ras-arm-handler.c | 22 +++++++++++++++++++--- ras-cpu-isolation.c | 17 +++++++++++++++++ ras-cpu-isolation.h | 4 +++- - 3 files changed, 38 insertions(+), 4 deletions(-) + 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index c9ef2fd..dae5ad6 100644 +index 9c7a3c3..a0dfc51 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c -@@ -47,7 +47,20 @@ void display_raw_data(struct trace_seq *s, +@@ -26,6 +26,7 @@ + + #define ARM_ERR_VALID_ERROR_COUNT BIT(0) + #define ARM_ERR_VALID_FLAGS BIT(1) ++#define BIT2 2 + + void display_raw_data(struct trace_seq *s, + const uint8_t *buf, +@@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s, } #ifdef HAVE_CPU_FAULT_ISOLATION @@ -30,7 +42,7 @@ index c9ef2fd..dae5ad6 100644 + * Bit 0\1\3: (at lease 1) + * Bit 2: 0 + */ -+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << 2)); ++ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2)); + } + return 0; +} @@ -39,7 +51,7 @@ index c9ef2fd..dae5ad6 100644 { struct ras_arm_err_info *err_info; int num_pei; -@@ -75,6 +88,8 @@ static int count_errors(struct ras_arm_event *ev) +@@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev) */ error_count = err_info->multiple_error + 1; } @@ -48,22 +60,22 @@ index c9ef2fd..dae5ad6 100644 num += error_count; err_info += 1; -@@ -212,8 +227,8 @@ int ras_arm_event_handler(struct trace_seq *s, +@@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s, } trace_seq_printf(s, "\n severity: %s", severity); - if (val == GHES_SEV_CORRECTED) { -- nums = count_errors(&ev); +- int nums = count_errors(ev); + if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { -+ nums = count_errors(&ev, val); ++ int nums = count_errors(ev, val); + if (nums > 0) { err_info.nums = nums; - err_info.time = now; diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index 8c0cdf9..e650022 100644 +index abcf451..fd23e4e 100644 --- a/ras-cpu-isolation.c +++ b/ras-cpu-isolation.c -@@ -113,6 +113,7 @@ static int init_cpu_info(unsigned int cpus) +@@ -126,6 +126,7 @@ static int init_cpu_info(unsigned int cpus) for (unsigned int i = 0; i < cpus; ++i) { cpu_infos[i].ce_nums = 0; @@ -71,14 +83,14 @@ index 8c0cdf9..e650022 100644 cpu_infos[i].state = get_cpu_status(i); cpu_infos[i].ce_queue = init_queue(); -@@ -295,6 +296,15 @@ static int do_ce_handler(unsigned int cpu) +@@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu) return HANDLE_NOTHING; } +static int do_uce_handler(unsigned int cpu) +{ + if (cpu_infos[cpu].uce_nums > 0) { -+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%d\n", cpu); ++ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu); + return do_cpu_offline(cpu); + } + return HANDLE_NOTHING; @@ -87,7 +99,7 @@ index 8c0cdf9..e650022 100644 static int error_handler(unsigned int cpu, struct error_info *err_info) { int ret = HANDLE_NOTHING; -@@ -303,6 +313,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) +@@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) case CE: ret = do_ce_handler(cpu); break; @@ -97,7 +109,7 @@ index 8c0cdf9..e650022 100644 default: break; } -@@ -325,6 +338,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) +@@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) cpu_infos[cpu].ce_nums += err_info->nums; break; } @@ -107,7 +119,7 @@ index 8c0cdf9..e650022 100644 default: break; } -@@ -372,6 +388,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) +@@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) cpu, cpu_state[cpu_infos[cpu].state]); clear_queue(cpu_infos[cpu].ce_queue); cpu_infos[cpu].ce_nums = 0; @@ -134,5 +146,5 @@ index 1159853..024a68b 100644 struct link_queue *ce_queue; enum cpu_state state; -- -2.27.0 +2.25.1 diff --git a/0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch b/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch similarity index 93% rename from 0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch rename to 0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch index d15a714..c51e35a 100644 --- a/0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch +++ b/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch @@ -1,7 +1,7 @@ -From 62218a9c3aec44330ce3b77f3634c788b6e6f60c Mon Sep 17 00:00:00 2001 +From 9c4665f33c39ea84db7d69079ab27205d2fbd07e Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Wed, 2 Mar 2022 12:20:40 +0000 -Subject: [PATCH 1/6] rasdaemon: Modify recording Hisilicon common error data +Subject: [PATCH 03/10] rasdaemon: Modify recording Hisilicon common error data The error statistics for the Hisilicon common error need to do based on module, error severity etc. @@ -11,11 +11,11 @@ in the sql db table instead of the combined single field. Signed-off-by: Shiju Jose --- - non-standard-hisilicon.c | 122 ++++++++++++++++++++++++++++++++------- - 1 file changed, 102 insertions(+), 20 deletions(-) + non-standard-hisilicon.c | 126 ++++++++++++++++++++++++++++++++------- + 1 file changed, 104 insertions(+), 22 deletions(-) diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 1432163..dc69d46 100644 +index 1432163..d1e1774 100644 --- a/non-standard-hisilicon.c +++ b/non-standard-hisilicon.c @@ -17,6 +17,7 @@ @@ -53,11 +53,15 @@ index 1432163..dc69d46 100644 char reg_msg[HISI_BUF_LEN]; }; -@@ -134,12 +148,24 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) +@@ -132,14 +146,26 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) + + #ifdef HAVE_SQLITE3 static const struct db_fields hisi_common_section_fields[] = { - { .name = "id", .type = "INTEGER PRIMARY KEY" }, - { .name = "timestamp", .type = "TEXT" }, +- { .name = "id", .type = "INTEGER PRIMARY KEY" }, +- { .name = "timestamp", .type = "TEXT" }, - { .name = "err_info", .type = "TEXT" }, ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, + { .name = "version", .type = "INTEGER" }, + { .name = "soc_id", .type = "INTEGER" }, + { .name = "socket_id", .type = "INTEGER" }, diff --git a/0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch b/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch similarity index 90% rename from 0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch rename to 0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch index 7f7eb24..8963d91 100644 --- a/0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch +++ b/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch @@ -1,10 +1,10 @@ -From 4d9f297028ce3116eaf574b2570d71a4ed666b7d Mon Sep 17 00:00:00 2001 +From 4f706ff3b1a04de3be506a309e153b99e04b3445 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Thu, 24 Feb 2022 18:02:14 +0000 -Subject: [PATCH 2/6] rasdaemon: ras-mc-ctl: Modify error statistics for - HiSilicon Kunpeng9xx common errors +Subject: [PATCH 04/10] rasdaemon: ras-mc-ctl: Modify error statistics for + HiSilicon KunPeng9xx common errors -Modify the error statistics for the HiSilicon Kunpeng9xx platforms common errors +Modify the error statistics for the HiSilicon KunPeng9xx platforms common errors to display the statistics and error info based on the module and the error severity. Signed-off-by: Shiju Jose @@ -13,10 +13,10 @@ Signed-off-by: Shiju Jose 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 1e3aeb7..22ba1fd 100755 +index b22dd60..08eb287 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -1535,7 +1535,7 @@ sub vendor_errors_summary +@@ -1537,7 +1537,7 @@ sub vendor_errors_summary require DBI; my ($num_args, $platform_id); my ($query, $query_handle, $count, $out); @@ -25,7 +25,7 @@ index 1e3aeb7..22ba1fd 100755 $num_args = $#ARGV + 1; $platform_id = 0; -@@ -1612,13 +1612,18 @@ sub vendor_errors_summary +@@ -1614,13 +1614,18 @@ sub vendor_errors_summary # HiSilicon Kunpeng9xx common errors if ($platform_id eq HISILICON_KUNPENG_9XX) { @@ -47,7 +47,7 @@ index 1e3aeb7..22ba1fd 100755 } if ($out ne "") { print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; -@@ -1636,8 +1641,8 @@ sub vendor_errors +@@ -1638,8 +1643,8 @@ sub vendor_errors require DBI; my ($num_args, $platform_id); my ($query, $query_handle, $id, $timestamp, $out); @@ -58,7 +58,7 @@ index 1e3aeb7..22ba1fd 100755 $num_args = $#ARGV + 1; $platform_id = 0; -@@ -1725,15 +1730,28 @@ sub vendor_errors +@@ -1727,15 +1732,28 @@ sub vendor_errors # HiSilicon Kunpeng9xx common errors if ($platform_id eq HISILICON_KUNPENG_9XX) { diff --git a/0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch b/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch similarity index 88% rename from 0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch rename to 0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch index 7600b58..2ff9537 100644 --- a/0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch +++ b/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch @@ -1,7 +1,7 @@ -From eb93d77b417b58cba27799ae85747b8a193cf063 Mon Sep 17 00:00:00 2001 +From f5c3c03039be28bb6b5bbe00e12e9586b19a1060 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 5 Mar 2022 16:18:55 +0000 -Subject: [PATCH 3/6] rasdaemon: ras-mc-ctl: Reformat error info of the +Subject: [PATCH 05/10] rasdaemon: ras-mc-ctl: Reformat error info of the HiSilicon Kunpeng920 Reformat the code to display the error info of HiSilicon Kunpeng920. @@ -12,10 +12,10 @@ Signed-off-by: Shiju Jose 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 22ba1fd..eeaf885 100755 +index 08eb287..8755b6f 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -1669,8 +1669,9 @@ sub vendor_errors +@@ -1671,8 +1671,9 @@ sub vendor_errors $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); $out .= "module_id=$module_id, " if ($module_id); $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); @@ -27,7 +27,7 @@ index 22ba1fd..eeaf885 100755 } if ($out ne "") { print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; -@@ -1692,8 +1693,9 @@ sub vendor_errors +@@ -1694,8 +1695,9 @@ sub vendor_errors $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); $out .= "module_id=$module_id, " if ($module_id); $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); @@ -39,7 +39,7 @@ index 22ba1fd..eeaf885 100755 } if ($out ne "") { print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; -@@ -1717,8 +1719,9 @@ sub vendor_errors +@@ -1719,8 +1721,9 @@ sub vendor_errors $out .= "core_id=$core_id, " if ($core_id); $out .= "port_id=$port_id, " if ($port_id); $out .= "err_severity=$err_severity, " if ($err_severity); diff --git a/0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch b/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch similarity index 55% rename from 0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch rename to 0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch index 15ab710..1ff38e3 100644 --- a/0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch +++ b/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch @@ -1,10 +1,11 @@ -From 623e85c07ab21ccc89ffe2bb444eb000a2664a9d Mon Sep 17 00:00:00 2001 +From d595a9d61f9d8341a5e30d4d800e3237d6e0f390 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 5 Mar 2022 17:01:35 +0000 -Subject: [PATCH 4/6] rasdaemon: ras-mc-ctl: Add printing usage if necessary - parameters are not passed for the HiSilicon vendor-errors options +Subject: [PATCH 06/10] rasdaemon: ras-mc-ctl: Add printing usage if necessary + parameters are not passed for the vendor-error options -Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options of the ras-mc-ctl. +Add printing usage if necessary parameters are not passed +for the vendor-errors options. Signed-off-by: Shiju Jose --- @@ -12,10 +13,10 @@ Signed-off-by: Shiju Jose 1 file changed, 2 insertions(+) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index eeaf885..0e32cb1 100755 +index 8755b6f..959ea6b 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -1542,6 +1542,7 @@ sub vendor_errors_summary +@@ -1544,6 +1544,7 @@ sub vendor_errors_summary if ($num_args ne 0) { $platform_id = $ARGV[0]; } else { @@ -23,7 +24,7 @@ index eeaf885..0e32cb1 100755 return; } -@@ -1649,6 +1650,7 @@ sub vendor_errors +@@ -1651,6 +1652,7 @@ sub vendor_errors if ($num_args ne 0) { $platform_id = $ARGV[0]; } else { diff --git a/0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch b/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch similarity index 70% rename from 0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch rename to 0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch index 6153a85..6af2ad0 100644 --- a/0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch +++ b/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch @@ -1,52 +1,105 @@ -From 4007c95f8a8d570542ffc11676b619ea5649d0e7 Mon Sep 17 00:00:00 2001 +From 0643011831e5fb4e81edff16ad55f9a5196ec7a9 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 5 Mar 2022 18:19:38 +0000 -Subject: [PATCH 5/6] rasdaemon: ras-mc-ctl: Add support to display the +Subject: [PATCH 07/10] rasdaemon: ras-mc-ctl: Add support to display the HiSilicon vendor errors for a specified module Add support to display the HiSilicon vendor errors for a specified module. Signed-off-by: Shiju Jose --- - util/ras-mc-ctl.in | 119 ++++++++++++++++++++++++--------------------- - 1 file changed, 63 insertions(+), 56 deletions(-) + util/ras-mc-ctl.in | 145 +++++++++++++++++++++++++++------------------ + 1 file changed, 87 insertions(+), 58 deletions(-) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 0e32cb1..d728300 100755 +index 959ea6b..296eb87 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -96,7 +96,8 @@ Usage: $prog [OPTIONS...] +@@ -96,8 +96,9 @@ Usage: $prog [OPTIONS...] --errors Shows the errors stored at the error database. --error-count Shows the corrected and uncorrected error counts using sysfs. --vendor-errors-summary Presents a summary of the vendor-specific logged errors. - --vendor-errors Shows the vendor-specific errors stored in the error database. +- --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors. + --vendor-errors Shows the vendor-specific errors stored in the error database. + --vendor-errors Shows the vendor-specific errors for a specific module stored in the error database. - --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors. ++ --vendor-platforms List the supported platforms with platform-ids for the vendor-specific errors. --help This help message. EOF -@@ -1640,15 +1641,19 @@ sub vendor_errors_summary + +@@ -1535,12 +1536,14 @@ use constant { + sub vendor_errors_summary + { + require DBI; +- my ($num_args, $platform_id); ++ my ($num_args, $platform_id, $found_platform); + my ($query, $query_handle, $count, $out); + my ($module_id, $sub_module_id, $err_severity, $err_sev); + + $num_args = $#ARGV + 1; + $platform_id = 0; ++ $found_platform = 0; ++ + if ($num_args ne 0) { + $platform_id = $ARGV[0]; + } else { +@@ -1552,6 +1555,7 @@ sub vendor_errors_summary + + # HiSilicon Kunpeng920 errors + if ($platform_id eq HISILICON_KUNPENG_920) { ++ $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1615,6 +1619,7 @@ sub vendor_errors_summary + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { ++ $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1636,21 +1641,31 @@ sub vendor_errors_summary + $query_handle->finish; + } + ++ if ($platform_id && !($found_platform)) { ++ print "Platform ID $platform_id is not valid\n"; ++ } ++ + undef($dbh); + } + sub vendor_errors { require DBI; - my ($num_args, $platform_id); -+ my ($num_args, $platform_id, $module); ++ my ($num_args, $platform_id, $found_platform, $module, $found_module); my ($query, $query_handle, $id, $timestamp, $out); my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); $num_args = $#ARGV + 1; $platform_id = 0; ++ $found_platform = 0; + $module = 0; ++ $found_module = 0; if ($num_args ne 0) { $platform_id = $ARGV[0]; + if ($num_args gt 1) { + $module = $ARGV[1]; -+ } ++ } } else { usage(1); return; -@@ -1664,21 +1669,21 @@ sub vendor_errors +@@ -1660,27 +1675,29 @@ sub vendor_errors + + # HiSilicon Kunpeng920 errors + if ($platform_id eq HISILICON_KUNPENG_920) { ++ $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); $out = ""; while($query_handle->fetch()) { @@ -60,7 +113,7 @@ index 0e32cb1..d728300 100755 - $out .= "err_severity=$err_severity, " if ($err_severity); - $out .= "Error Registers: $regs " if ($regs); - $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && ($module eq $module_id))) { ++ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { + $out .= "$id. $timestamp Error Info: "; + $out .= "version=$version, "; + $out .= "soc_id=$soc_id, " if ($soc_id); @@ -71,6 +124,7 @@ index 0e32cb1..d728300 100755 + $out .= "err_severity=$err_severity, " if ($err_severity); + $out .= "Error Registers: $regs " if ($regs); + $out .= "\n\n"; ++ $found_module = 1; + } } if ($out ne "") { @@ -80,7 +134,7 @@ index 0e32cb1..d728300 100755 } $query_handle->finish; -@@ -1688,21 +1693,21 @@ sub vendor_errors +@@ -1690,21 +1707,22 @@ sub vendor_errors $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); $out = ""; while($query_handle->fetch()) { @@ -94,7 +148,7 @@ index 0e32cb1..d728300 100755 - $out .= "err_severity=$err_severity, " if ($err_severity); - $out .= "Error Registers: $regs " if ($regs); - $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && ($module eq $module_id))) { ++ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { + $out .= "$id. $timestamp Error Info: "; + $out .= "version=$version, "; + $out .= "soc_id=$soc_id, " if ($soc_id); @@ -105,6 +159,7 @@ index 0e32cb1..d728300 100755 + $out .= "err_severity=$err_severity, " if ($err_severity); + $out .= "Error Registers: $regs " if ($regs); + $out .= "\n\n"; ++ $found_module = 1; + } } if ($out ne "") { @@ -114,7 +169,7 @@ index 0e32cb1..d728300 100755 } $query_handle->finish; -@@ -1712,23 +1717,23 @@ sub vendor_errors +@@ -1714,51 +1732,56 @@ sub vendor_errors $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs)); $out = ""; while($query_handle->fetch()) { @@ -130,7 +185,7 @@ index 0e32cb1..d728300 100755 - $out .= "err_type=$err_type, " if ($err_type); - $out .= "Error Registers: $regs " if ($regs); - $out .= "\n\n"; -+ if ($module eq 0 || ($sub_module_id && ($module eq $sub_module_id))) { ++ if ($module eq 0 || ($sub_module_id && uc($module) eq uc($sub_module_id))) { + $out .= "$id. $timestamp Error Info: "; + $out .= "version=$version, "; + $out .= "soc_id=$soc_id, " if ($soc_id); @@ -143,6 +198,7 @@ index 0e32cb1..d728300 100755 + $out .= "err_type=$err_type, " if ($err_type); + $out .= "Error Registers: $regs " if ($regs); + $out .= "\n\n"; ++ $found_module = 1; + } } if ($out ne "") { @@ -152,7 +208,13 @@ index 0e32cb1..d728300 100755 } $query_handle->finish; } -@@ -1741,22 +1746,24 @@ sub vendor_errors + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { ++ $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); $out = ""; while($query_handle->fetch()) { @@ -172,7 +234,7 @@ index 0e32cb1..d728300 100755 - $out .= "err_severity=$err_severity, " if ($err_severity); - $out .= "Error Registers: $regs" if ($regs); - $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && ($module eq $module_id))) { ++ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { + $out .= "$id. $timestamp Error Info: "; + $out .= "version=$version, "; + $out .= "soc_id=$soc_id, " if ($soc_id); @@ -189,10 +251,24 @@ index 0e32cb1..d728300 100755 + $out .= "err_severity=$err_severity, " if ($err_severity); + $out .= "Error Registers: $regs" if ($regs); + $out .= "\n\n"; ++ $found_module = 1; + } } if ($out ne "") { print "HiSilicon Kunpeng9xx common error events:\n$out\n"; +@@ -1768,6 +1791,12 @@ sub vendor_errors + $query_handle->finish; + } + ++ if ($platform_id && !($found_platform)) { ++ print "Platform ID $platform_id is not valid\n"; ++ } elsif ($module && !($found_module)) { ++ print "No error record for the module $module\n"; ++ } ++ + undef($dbh); + } + -- 2.25.1 diff --git a/0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch b/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch similarity index 83% rename from 0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch rename to 0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch index 073d335..0453e04 100644 --- a/0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch +++ b/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch @@ -1,21 +1,21 @@ -From 88bf3126312645843152c6c3215b54b120bcc1ec Mon Sep 17 00:00:00 2001 +From 2f23b5dc6e5831c8ef2e179bb936e13502f75041 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Mon, 7 Mar 2022 12:38:45 +0000 -Subject: [PATCH 6/6] rasdaemon: ras-mc-ctl: Relocate reading and display +Subject: [PATCH 08/10] rasdaemon: ras-mc-ctl: Relocate reading and display Kunpeng920 errors to under Kunpeng9xx Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. Signed-off-by: Shiju Jose --- - util/ras-mc-ctl.in | 38 ++++++++++---------------------------- - 1 file changed, 10 insertions(+), 28 deletions(-) + util/ras-mc-ctl.in | 40 ++++++++++------------------------------ + 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index d728300..2ab9602 100755 +index 296eb87..75981a0 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -1527,7 +1527,6 @@ sub errors +@@ -1529,7 +1529,6 @@ sub errors # Definitions of the vendor platform IDs. use constant { @@ -23,18 +23,18 @@ index d728300..2ab9602 100755 HISILICON_KUNPENG_9XX => "Kunpeng9xx", }; -@@ -1549,8 +1548,8 @@ sub vendor_errors_summary +@@ -1553,8 +1552,8 @@ sub vendor_errors_summary my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - # HiSilicon Kunpeng920 errors - if ($platform_id eq HISILICON_KUNPENG_920) { -+ # HiSilicon Kunpeng9xx common errors ++ # HiSilicon Kunpeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1565,9 +1564,7 @@ sub vendor_errors_summary +@@ -1570,9 +1569,7 @@ sub vendor_errors_summary $out .= "\t$module_id: $count\n"; } if ($out ne "") { @@ -45,7 +45,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; -@@ -1585,9 +1582,7 @@ sub vendor_errors_summary +@@ -1590,9 +1587,7 @@ sub vendor_errors_summary $out .= "\t$module_id: $count\n"; } if ($out ne "") { @@ -56,7 +56,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; -@@ -1605,15 +1600,10 @@ sub vendor_errors_summary +@@ -1610,16 +1605,10 @@ sub vendor_errors_summary $out .= "\t$sub_module_id: $count\n"; } if ($out ne "") { @@ -70,10 +70,11 @@ index d728300..2ab9602 100755 - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $found_platform = 1; $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; $query_handle = $dbh->prepare($query); $query_handle->execute(); -@@ -1629,8 +1619,6 @@ sub vendor_errors_summary +@@ -1635,8 +1624,6 @@ sub vendor_errors_summary } if ($out ne "") { print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; @@ -82,18 +83,18 @@ index d728300..2ab9602 100755 } $query_handle->finish; } -@@ -1661,8 +1649,8 @@ sub vendor_errors +@@ -1673,8 +1660,8 @@ sub vendor_errors my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - # HiSilicon Kunpeng920 errors - if ($platform_id eq HISILICON_KUNPENG_920) { -+ # HiSilicon Kunpeng9xx common errors ++ # HiSilicon Kunpeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1683,7 +1671,7 @@ sub vendor_errors +@@ -1697,7 +1684,7 @@ sub vendor_errors } } if ($out ne "") { @@ -102,7 +103,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; -@@ -1707,7 +1695,7 @@ sub vendor_errors +@@ -1722,7 +1709,7 @@ sub vendor_errors } } if ($out ne "") { @@ -111,7 +112,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; -@@ -1733,13 +1721,10 @@ sub vendor_errors +@@ -1749,14 +1736,10 @@ sub vendor_errors } } if ($out ne "") { @@ -123,10 +124,11 @@ index d728300..2ab9602 100755 - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $found_platform = 1; $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; $query_handle = $dbh->prepare($query); $query_handle->execute(); -@@ -1767,8 +1752,6 @@ sub vendor_errors +@@ -1785,8 +1768,6 @@ sub vendor_errors } if ($out ne "") { print "HiSilicon Kunpeng9xx common error events:\n$out\n"; @@ -135,7 +137,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; } -@@ -1779,7 +1762,6 @@ sub vendor_errors +@@ -1803,7 +1784,6 @@ sub vendor_errors sub vendor_platforms { print "\nSupported platforms for the vendor-specific errors:\n"; diff --git a/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch b/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch new file mode 100644 index 0000000..e34f89f --- /dev/null +++ b/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch @@ -0,0 +1,127 @@ +From df6011fed2bb45989f9e5c2ea30b33937b08d06c Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 28 Apr 2022 18:58:43 +0100 +Subject: [PATCH 09/10] rasdaemon: ras-mc-ctl: Updated HiSilicon platform name + +Updated the HiSilicon platform name as KunPeng9xx. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 75981a0..1cc19b3 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1529,7 +1529,7 @@ sub errors + + # Definitions of the vendor platform IDs. + use constant { +- HISILICON_KUNPENG_9XX => "Kunpeng9xx", ++ HISILICON_KUNPENG_9XX => "KunPeng9xx", + }; + + sub vendor_errors_summary +@@ -1552,7 +1552,7 @@ sub vendor_errors_summary + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng9xx errors ++ # HiSilicon KunPeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; +@@ -1569,7 +1569,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type1 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1587,7 +1587,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type2 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1605,7 +1605,7 @@ sub vendor_errors_summary + $out .= "\t$sub_module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx PCIe controller error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1623,7 +1623,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx common error events summary:\n$out\n"; + } + $query_handle->finish; + } +@@ -1660,7 +1660,7 @@ sub vendor_errors + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng9xx errors ++ # HiSilicon KunPeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; +@@ -1684,7 +1684,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type1 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1709,7 +1709,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type2 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1736,7 +1736,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx PCIe controller error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1767,7 +1767,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx common error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx common error events:\n$out\n"; + } + $query_handle->finish; + } +@@ -1784,7 +1784,7 @@ sub vendor_errors + sub vendor_platforms + { + print "\nSupported platforms for the vendor-specific errors:\n"; +- print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; ++ print "\tHiSilicon KunPeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; + print "\n"; + } + +-- +2.25.1 + diff --git a/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch b/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch new file mode 100644 index 0000000..48a62cc --- /dev/null +++ b/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch @@ -0,0 +1,90 @@ +From c019f2f82b7f224e95968037f2afc16f63cc1d1d Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 28 Apr 2022 22:59:04 +0100 +Subject: [PATCH 10/10] rasdaemon: Fix for a memory out-of-bounds issue and + optimized code to remove duplicate function. + +Fixed a memory out-of-bounds issue with string pointers and +optimized code structure to remove duplicate function. + +Signed-off-by: Lei Feng +Signed-off-by: Shiju Jose +--- + non-standard-hisi_hip08.c | 6 +++--- + non-standard-hisilicon.c | 2 +- + ras-non-standard-handler.c | 16 +--------------- + 3 files changed, 5 insertions(+), 19 deletions(-) + +diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c +index 9092183..4ef47ea 100644 +--- a/non-standard-hisi_hip08.c ++++ b/non-standard-hisi_hip08.c +@@ -1014,15 +1014,15 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, + + static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { + { +- .sec_type = "1f8161e155d641e6bd107afd1dc5f7c5", ++ .sec_type = "1f8161e1-55d6-41e6-bd10-7afd1dc5f7c5", + .decode = decode_hip08_oem_type1_error, + }, + { +- .sec_type = "45534ea6ce2341158535e07ab3aef91d", ++ .sec_type = "45534ea6-ce23-4115-8535-e07ab3aef91d", + .decode = decode_hip08_oem_type2_error, + }, + { +- .sec_type = "b2889fc9e7d74f9da867af42e98be772", ++ .sec_type = "b2889fc9-e7d7-4f9d-a867-af42e98be772", + .decode = decode_hip08_pcie_local_error, + }, + }; +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index d1e1774..6ee9271 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -387,7 +387,7 @@ static int decode_hisi_common_section(struct ras_events *ras, + + static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { + { +- .sec_type = "c8b328a899174af69a132e08ab2e7586", ++ .sec_type = "c8b328a8-9917-4af6-9a13-2e08ab2e7586", + .decode = decode_hisi_common_section, + }, + }; +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +index 6d5a6f8..6932e58 100644 +--- a/ras-non-standard-handler.c ++++ b/ras-non-standard-handler.c +@@ -52,20 +52,6 @@ static char *uuid_le(const char *uu) + return uuid; + } + +-static int uuid_le_cmp(const char *sec_type, const char *uuid2) +-{ +- static char uuid1[32]; +- char *p = uuid1; +- int i; +- static const unsigned char le[16] = { +- 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; +- +- for (i = 0; i < 16; i++) +- p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]); +- *p = 0; +- return strncmp(uuid1, uuid2, 32); +-} +- + int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) + { + struct ras_ns_ev_decoder *list; +@@ -96,7 +82,7 @@ static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p + + ns_ev_decoder = ras_ns_ev_dec_list; + while (ns_ev_decoder) { +- if (uuid_le_cmp(sec_type, ns_ev_decoder->sec_type) == 0) { ++ if (strcmp(uuid_le(sec_type), ns_ev_decoder->sec_type) == 0) { + *p_ns_ev_dec = ns_ev_decoder; + match = 1; + break; +-- +2.25.1 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 62576b1..1c7fb5b 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.7 -Release: 4 +Release: 5 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -23,18 +23,20 @@ Patch1: bugfix-rasdaemon-wait-for-file-access.patch Patch2: bugfix-fix-fd-check.patch Patch3: bugfix-fix-disk-error-log-storm.patch Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch -Patch5: 0001-Support-cpu-fault-isolation-for-corrected-errors.patch -Patch6: 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch +Patch5: 0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch +Patch6: 0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch Patch7: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch Patch8: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch Patch9: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch Patch10: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch -Patch11: 0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch -Patch12: 0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch -Patch13: 0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch -Patch14: 0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch -Patch15: 0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch -Patch16: 0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch +Patch11: 0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch +Patch12: 0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch +Patch13: 0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch +Patch14: 0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch +Patch15: 0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch +Patch16: 0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch +Patch17: 0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch +Patch18: 0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch %description The rasdaemon program is a daemon which monitors the platform @@ -53,7 +55,7 @@ autoheader libtoolize --automake --copy --debug --force automake --add-missing %ifarch %{arm} aarch64 -%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation %else %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror %endif @@ -80,6 +82,23 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || : %changelog +* Mon May 23 2022 Shiju Jose - 0.6.7-5 +- Type:feature +- ID:NA +- SUG:NA +- DESC: + Update with the latest patches for the + 1. CPU online fault isolation for arm event. + 2. Modify recording Hisilicon common error data in the rasdaemon + 3. In the ras-mc-ctl, + 3.1. Improve Hisilicon common error statistics. + 3.2. Add support to display the HiSilicon vendor-errors for a specified module. + 3.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options. + 3.4. Reformat error info of the HiSilicon Kunpeng920. + 3.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. + 3.6. Updated the HiSilicon platform name as KunPeng9xx. + 4. Fixed a memory out-of-bounds issue in the rasdaemon. + * Mon Mar 07 2022 Shiju Jose - 0.6.7-4 - Type:feature - ID:NA -- Gitee