diff --git a/0001-Support-cpu-fault-isolation-for-corrected-errors.patch b/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch similarity index 86% rename from 0001-Support-cpu-fault-isolation-for-corrected-errors.patch rename to 0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch index d5460ded67e5b19966fea8ffa201cd6cbcd3d391..d17fb219aa2ea0c9f854ef889e2064511c981b27 100644 --- a/0001-Support-cpu-fault-isolation-for-corrected-errors.patch +++ b/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch @@ -1,34 +1,38 @@ -From a8e02e7d3d910eb7d049fd4126d53b8d3121d798 Mon Sep 17 00:00:00 2001 +From b9999d40d73dfff8b1cfb515f3b81b2c2891f6a7 Mon Sep 17 00:00:00 2001 From: Shengwei Luo Date: Wed, 23 Feb 2022 17:21:58 +0800 -Subject: [PATCH 1/2] Support cpu fault isolation for corrected errors +Subject: [PATCH 01/10] rasdaemon: Support cpu fault isolation for corrected + errors When the corrected errors exceed the set limit in cycle, try to offline the related cpu core. 
Signed-off-by: Shengwei Luo +Signed-off-by: Junchong Pan +Signed-off-by: Lei Feng +Signed-off-by: Shiju Jose --- Makefile.am | 6 +- configure.ac | 11 ++ misc/rasdaemon.env | 17 ++ - queue.c | 121 ++++++++++++++ + queue.c | 119 ++++++++++++++ queue.h | 39 +++++ - ras-arm-handler.c | 84 ++++++++++ - ras-arm-handler.h | 18 +++ - ras-cpu-isolation.c | 378 ++++++++++++++++++++++++++++++++++++++++++++ + ras-arm-handler.c | 97 +++++++++++ + ras-arm-handler.h | 18 ++ + ras-cpu-isolation.c | 388 ++++++++++++++++++++++++++++++++++++++++++++ ras-cpu-isolation.h | 68 ++++++++ ras-events.c | 9 +- - 10 files changed, 749 insertions(+), 2 deletions(-) + 10 files changed, 770 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c create mode 100644 ras-cpu-isolation.h diff --git a/Makefile.am b/Makefile.am -index fabca78..242ceb7 100644 +index a322b9a..36e7d4e 100644 --- a/Makefile.am +++ b/Makefile.am -@@ -63,13 +63,17 @@ endif +@@ -69,13 +69,17 @@ endif if WITH_AMP_NS_DECODE rasdaemon_SOURCES += non-standard-ampere.c endif @@ -48,7 +52,7 @@ index fabca78..242ceb7 100644 # This rule can't be called with more than one Makefile job (like make -j8) # I can't figure out a way to fix that diff --git a/configure.ac b/configure.ac -index 33b81fe..d098fcf 100644 +index a77991f..e0ed751 100644 --- a/configure.ac +++ b/configure.ac @@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], @@ -102,10 +106,10 @@ index 12fd766..7cb18e8 100644 \ No newline at end of file diff --git a/queue.c b/queue.c new file mode 100644 -index 0000000..ed66798 +index 0000000..65b6fb8 --- /dev/null +++ b/queue.c -@@ -0,0 +1,121 @@ +@@ -0,0 +1,119 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. 
+ * @@ -137,7 +141,6 @@ index 0000000..ed66798 + struct link_queue *queue = NULL; + + queue = (struct link_queue *)malloc(sizeof(struct link_queue)); -+ + if (queue == NULL) { + log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); + return NULL; @@ -218,7 +221,6 @@ index 0000000..ed66798 + struct queue_node *node = NULL; + + node = (struct queue_node *)malloc(sizeof(struct queue_node)); -+ + if (node != NULL) { + node->time = time; + node->value = value; @@ -273,7 +275,7 @@ index 0000000..5459f40 + +#endif diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 1149dc6..c9ef2fd 100644 +index 1149dc6..9c7a3c3 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -22,6 +22,10 @@ @@ -287,7 +289,7 @@ index 1149dc6..c9ef2fd 100644 void display_raw_data(struct trace_seq *s, const uint8_t *buf, -@@ -42,6 +46,44 @@ void display_raw_data(struct trace_seq *s, +@@ -42,6 +46,93 @@ void display_raw_data(struct trace_seq *s, } } @@ -327,18 +329,14 @@ index 1149dc6..c9ef2fd 100644 + log(TERM, LOG_INFO, "%d error in cpu core catched\n", num); + return num; +} -+#endif + - int ras_arm_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context) -@@ -139,6 +181,48 @@ int ras_arm_event_handler(struct trace_seq *s, - display_raw_data(s, ev.vsei_error, ev.oem_len); - #endif - -+#ifdef HAVE_CPU_FAULT_ISOLATION ++static int ras_handle_cpu_error(struct trace_seq *s, ++ struct pevent_record *record, ++ struct event_format *event, ++ struct ras_arm_event *ev, time_t now) ++{ ++ unsigned long long val; + int cpu; -+ int nums; + char *severity; + struct error_info err_info; + @@ -368,7 +366,8 @@ index 1149dc6..c9ef2fd 100644 + trace_seq_printf(s, "\n severity: %s", severity); + + if (val == GHES_SEV_CORRECTED) { -+ nums = count_errors(&ev); ++ int nums = count_errors(ev); ++ + if (nums > 0) { + err_info.nums = nums; + err_info.time = now; @@ -376,6 +375,29 @@ index 1149dc6..c9ef2fd 100644 + ras_record_cpu_error(&err_info, cpu); 
+ } + } ++ ++ return 0; ++} ++#endif ++ + int ras_arm_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context) +@@ -52,6 +143,7 @@ int ras_arm_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_arm_event ev; + int len = 0; ++ + memset(&ev, 0, sizeof(ev)); + + /* +@@ -139,6 +231,11 @@ int ras_arm_event_handler(struct trace_seq *s, + display_raw_data(s, ev.vsei_error, ev.oem_len); + #endif + ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ if (ras_handle_cpu_error(s, record, event, &ev, now) < 0) ++ return -1; +#endif + /* Insert data into the SGBD */ @@ -412,10 +434,10 @@ index 563a2d3..52813e7 100644 struct event_format *event, void *context); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..8c0cdf9 +index 0000000..abcf451 --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,378 @@ +@@ -0,0 +1,388 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -441,6 +463,16 @@ index 0000000..8c0cdf9 +#include "ras-logger.h" +#include "ras-cpu-isolation.h" + ++#define SECOND_OF_MON (30 * 24 * 60 * 60) ++#define SECOND_OF_DAY (24 * 60 * 60) ++#define SECOND_OF_HOU (60 * 60) ++#define SECOND_OF_MIN (60) ++ ++#define LIMIT_OF_CPU_THRESHOLD 10000 ++#define INIT_OF_CPU_THRESHOLD 18 ++#define DEC_CHECK 10 ++#define LAST_BIT_OF_UL 5 ++ +static struct cpu_info *cpu_infos; +static unsigned int ncores; +static unsigned int enabled = 1; @@ -452,9 +484,9 @@ index 0000000..8c0cdf9 +}; + +static const struct param cycle_units[] = { -+ {"d", 24 * 60 * 60}, -+ {"h", 60 * 60}, -+ {"m", 60}, ++ {"d", SECOND_OF_DAY}, ++ {"h", SECOND_OF_HOU}, ++ {"m", SECOND_OF_MIN}, + {"s", 1}, + {} +}; @@ -462,8 +494,8 @@ index 0000000..8c0cdf9 +static struct isolation_param threshold = { + .name = "CPU_CE_THRESHOLD", + .units = normal_units, -+ .value = 18, -+ .limit = 10000 ++ .value = INIT_OF_CPU_THRESHOLD, ++ .limit = LIMIT_OF_CPU_THRESHOLD +}; + +static struct 
isolation_param cpu_limit = { @@ -474,8 +506,8 @@ index 0000000..8c0cdf9 +static struct isolation_param cycle = { + .name = "CPU_ISOLATION_CYCLE", + .units = cycle_units, -+ .value = 24 * 60 * 60, -+ .limit = 30 * 24 * 60 * 60 ++ .value = SECOND_OF_DAY, ++ .limit = SECOND_OF_MON +}; + +static const char * const cpu_state[] = { @@ -488,13 +520,17 @@ index 0000000..8c0cdf9 +static int open_sys_file(unsigned int cpu, int __oflag, const char *format) +{ + int fd; -+ char buf[MAX_PATH_LEN] = ""; -+ -+ snprintf(buf, sizeof(buf), format, cpu); -+ fd = open(buf, __oflag); ++ char path[MAX_PATH_LEN + 1] = ""; ++ char real_path[MAX_PATH_LEN + 1] = ""; + ++ snprintf(path, sizeof(path), format, cpu); ++ if (strlen(path) > MAX_PATH_LEN || realpath(path, real_path) == NULL) { ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path); ++ return -1; ++ } ++ fd = open(real_path, __oflag); + if (fd == -1) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf); ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, real_path); + return -1; + } + @@ -522,7 +558,6 @@ index 0000000..8c0cdf9 +{ + ncores = cpus; + cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus); -+ + if (!cpu_infos) { + log(TERM, LOG_ERR, + "Failed to allocate memory for cpu infos in %s.\n", __func__); @@ -576,34 +611,33 @@ index 0000000..8c0cdf9 + + for (int i = 0; i < env_size; ++i) { + if (isdigit(env[i])) { -+ if (*value > ULONG_MAX / 10 || -+ (*value == ULONG_MAX / 10 && env[i] - '0' > 5)) { ++ if (*value > ULONG_MAX / DEC_CHECK || ++ (*value == ULONG_MAX / DEC_CHECK && env[i] - '0' > LAST_BIT_OF_UL)) { + log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); + return -1; + } -+ *value = 10 * (*value) + (env[i] - '0'); ++ *value = DEC_CHECK * (*value) + (env[i] - '0'); + } else + return -1; + } + -+ if (has_unit) { -+ for (const struct param *units = config->units; units->name; units++) { -+ /* value character and unit character are both valid */ -+ if 
(!strcasecmp(unit, units->name)) { -+ if (*value > (ULONG_MAX / units->value)) { -+ log(TERM, LOG_ERR, -+ "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = (*value) * units->value; -+ return 0; ++ if (!has_unit) ++ return 0; ++ ++ for (const struct param *units = config->units; units->name; units++) { ++ /* value character and unit character are both valid */ ++ if (!strcasecmp(unit, units->name)) { ++ if (*value > (ULONG_MAX / units->value)) { ++ log(TERM, LOG_ERR, ++ "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; + } ++ *value = (*value) * units->value; ++ return 0; + } -+ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); -+ return -1; + } -+ -+ return 0; ++ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); ++ return -1; +} + +static void init_config(struct isolation_param *config) @@ -612,7 +646,7 @@ index 0000000..8c0cdf9 + unsigned long value = 0; + + if (parse_ul_config(config, env, &value) < 0) { -+ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %ld.\n", ++ log(TERM, LOG_ERR, "Invalid %s: %s! 
Use default value %lu.\n", + config->name, env, config->value); + return; + } @@ -667,9 +701,8 @@ index 0000000..8c0cdf9 + + strcpy(buf, "0"); + rc = write(fd, buf, strlen(buf)); -+ + if (rc < 0) { -+ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); ++ log(TERM, LOG_ERR, "cpu%u offline failed, errno:%d\n", cpu, errno); + close(fd); + return HANDLE_FAILED; + } @@ -706,7 +739,7 @@ index 0000000..8c0cdf9 + + if (cpu_infos[cpu].ce_nums >= threshold.value) { + log(TERM, LOG_INFO, -+ "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", ++ "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n", + threshold.value, cpu); + return do_cpu_offline(cpu); + } @@ -757,7 +790,7 @@ index 0000000..8c0cdf9 + + if (cpu >= ncores || cpu < 0) { + log(TERM, LOG_ERR, -+ "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); ++ "The current cpu %d has exceed the total number of cpu:%u\n", cpu, ncores); + return; + } + @@ -782,7 +815,6 @@ index 0000000..8c0cdf9 + } + + ret = error_handler(cpu, err_info); -+ + if (ret == HANDLE_NOTHING) + log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); + else if (ret == HANDLE_SUCCEED) { @@ -869,10 +901,10 @@ index 0000000..1159853 + +#endif diff --git a/ras-events.c b/ras-events.c -index ba769d1..491c17a 100644 +index 39cab20..beda655 100644 --- a/ras-events.c +++ b/ras-events.c -@@ -41,6 +41,7 @@ +@@ -42,6 +42,7 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-page-isolation.h" @@ -880,7 +912,7 @@ index ba769d1..491c17a 100644 /* * Polling time, if read() doesn't block. 
Currently, trace_pipe_raw never -@@ -879,6 +880,10 @@ int handle_ras_events(int record_events) +@@ -856,6 +857,10 @@ int handle_ras_events(int record_events) cpus = get_num_cpus(ras); @@ -891,7 +923,7 @@ index ba769d1..491c17a 100644 #ifdef HAVE_MCE rc = register_mce_handler(ras, cpus); if (rc) -@@ -1005,6 +1010,8 @@ err: +@@ -982,6 +987,8 @@ err: } free(ras); } @@ -902,5 +934,5 @@ index ba769d1..491c17a 100644 return rc; } -- -2.27.0 +2.25.1 diff --git a/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch b/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch similarity index 69% rename from 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch rename to 0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch index aa1b2517427417b6c9e98ad9cc1c6a8e7ec92a75..e401fa99e6f3dc0f7e2a021a14e4f8dc57643ae9 100644 --- a/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch +++ b/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch @@ -1,23 +1,35 @@ -From e0101e59c6887a98d3a5a1b622c75f5307e8ec19 Mon Sep 17 00:00:00 2001 +From fefa2d689f96302e64ad2375695703039e2ca951 Mon Sep 17 00:00:00 2001 From: Shengwei Luo Date: Wed, 23 Feb 2022 17:23:27 +0800 -Subject: [PATCH 2/2] Support cpu fault isolation for recoverable errors +Subject: [PATCH 02/10] rasdaemon: Support cpu fault isolation for recoverable + errors When the recoverable errors in cpu core occurred, try to offline the related cpu core. 
Signed-off-by: Shengwei Luo +Signed-off-by: Junchong Pan +Signed-off-by: Lei Feng +Signed-off-by: Shiju Jose --- - ras-arm-handler.c | 21 ++++++++++++++++++--- + ras-arm-handler.c | 22 +++++++++++++++++++--- ras-cpu-isolation.c | 17 +++++++++++++++++ ras-cpu-isolation.h | 4 +++- - 3 files changed, 38 insertions(+), 4 deletions(-) + 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index c9ef2fd..dae5ad6 100644 +index 9c7a3c3..a0dfc51 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c -@@ -47,7 +47,20 @@ void display_raw_data(struct trace_seq *s, +@@ -26,6 +26,7 @@ + + #define ARM_ERR_VALID_ERROR_COUNT BIT(0) + #define ARM_ERR_VALID_FLAGS BIT(1) ++#define BIT2 2 + + void display_raw_data(struct trace_seq *s, + const uint8_t *buf, +@@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s, } #ifdef HAVE_CPU_FAULT_ISOLATION @@ -30,7 +42,7 @@ index c9ef2fd..dae5ad6 100644 + * Bit 0\1\3: (at lease 1) + * Bit 2: 0 + */ -+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << 2)); ++ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2)); + } + return 0; +} @@ -39,7 +51,7 @@ index c9ef2fd..dae5ad6 100644 { struct ras_arm_err_info *err_info; int num_pei; -@@ -75,6 +88,8 @@ static int count_errors(struct ras_arm_event *ev) +@@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev) */ error_count = err_info->multiple_error + 1; } @@ -48,22 +60,22 @@ index c9ef2fd..dae5ad6 100644 num += error_count; err_info += 1; -@@ -212,8 +227,8 @@ int ras_arm_event_handler(struct trace_seq *s, +@@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s, } trace_seq_printf(s, "\n severity: %s", severity); - if (val == GHES_SEV_CORRECTED) { -- nums = count_errors(&ev); +- int nums = count_errors(ev); + if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { -+ nums = count_errors(&ev, val); ++ int nums = count_errors(ev, val); + if (nums > 0) { err_info.nums = nums; - 
err_info.time = now; diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index 8c0cdf9..e650022 100644 +index abcf451..fd23e4e 100644 --- a/ras-cpu-isolation.c +++ b/ras-cpu-isolation.c -@@ -113,6 +113,7 @@ static int init_cpu_info(unsigned int cpus) +@@ -126,6 +126,7 @@ static int init_cpu_info(unsigned int cpus) for (unsigned int i = 0; i < cpus; ++i) { cpu_infos[i].ce_nums = 0; @@ -71,14 +83,14 @@ index 8c0cdf9..e650022 100644 cpu_infos[i].state = get_cpu_status(i); cpu_infos[i].ce_queue = init_queue(); -@@ -295,6 +296,15 @@ static int do_ce_handler(unsigned int cpu) +@@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu) return HANDLE_NOTHING; } +static int do_uce_handler(unsigned int cpu) +{ + if (cpu_infos[cpu].uce_nums > 0) { -+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%d\n", cpu); ++ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu); + return do_cpu_offline(cpu); + } + return HANDLE_NOTHING; @@ -87,7 +99,7 @@ index 8c0cdf9..e650022 100644 static int error_handler(unsigned int cpu, struct error_info *err_info) { int ret = HANDLE_NOTHING; -@@ -303,6 +313,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) +@@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) case CE: ret = do_ce_handler(cpu); break; @@ -97,7 +109,7 @@ index 8c0cdf9..e650022 100644 default: break; } -@@ -325,6 +338,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) +@@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) cpu_infos[cpu].ce_nums += err_info->nums; break; } @@ -107,7 +119,7 @@ index 8c0cdf9..e650022 100644 default: break; } -@@ -372,6 +388,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) +@@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) cpu, cpu_state[cpu_infos[cpu].state]); clear_queue(cpu_infos[cpu].ce_queue); 
cpu_infos[cpu].ce_nums = 0; @@ -134,5 +146,5 @@ index 1159853..024a68b 100644 struct link_queue *ce_queue; enum cpu_state state; -- -2.27.0 +2.25.1 diff --git a/0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch b/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch similarity index 93% rename from 0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch rename to 0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch index d15a7142eb2cf604a988c46217c33253d7143f54..c51e35a16f2335b969b642c59fdf7165eaf987f2 100644 --- a/0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch +++ b/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch @@ -1,7 +1,7 @@ -From 62218a9c3aec44330ce3b77f3634c788b6e6f60c Mon Sep 17 00:00:00 2001 +From 9c4665f33c39ea84db7d69079ab27205d2fbd07e Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Wed, 2 Mar 2022 12:20:40 +0000 -Subject: [PATCH 1/6] rasdaemon: Modify recording Hisilicon common error data +Subject: [PATCH 03/10] rasdaemon: Modify recording Hisilicon common error data The error statistics for the Hisilicon common error need to do based on module, error severity etc. @@ -11,11 +11,11 @@ in the sql db table instead of the combined single field. 
Signed-off-by: Shiju Jose --- - non-standard-hisilicon.c | 122 ++++++++++++++++++++++++++++++++------- - 1 file changed, 102 insertions(+), 20 deletions(-) + non-standard-hisilicon.c | 126 ++++++++++++++++++++++++++++++++------- + 1 file changed, 104 insertions(+), 22 deletions(-) diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 1432163..dc69d46 100644 +index 1432163..d1e1774 100644 --- a/non-standard-hisilicon.c +++ b/non-standard-hisilicon.c @@ -17,6 +17,7 @@ @@ -53,11 +53,15 @@ index 1432163..dc69d46 100644 char reg_msg[HISI_BUF_LEN]; }; -@@ -134,12 +148,24 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) +@@ -132,14 +146,26 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) + + #ifdef HAVE_SQLITE3 static const struct db_fields hisi_common_section_fields[] = { - { .name = "id", .type = "INTEGER PRIMARY KEY" }, - { .name = "timestamp", .type = "TEXT" }, +- { .name = "id", .type = "INTEGER PRIMARY KEY" }, +- { .name = "timestamp", .type = "TEXT" }, - { .name = "err_info", .type = "TEXT" }, ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, + { .name = "version", .type = "INTEGER" }, + { .name = "soc_id", .type = "INTEGER" }, + { .name = "socket_id", .type = "INTEGER" }, diff --git a/0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch b/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch similarity index 90% rename from 0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch rename to 0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch index 7f7eb2406fa143e08079930c2316c1e55168f866..8963d91b230e9059c02469875b4548a9541ec085 100644 --- a/0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch +++ b/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch @@ -1,10 +1,10 @@ -From 4d9f297028ce3116eaf574b2570d71a4ed666b7d Mon Sep 17 00:00:00 2001 +From 
4f706ff3b1a04de3be506a309e153b99e04b3445 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Thu, 24 Feb 2022 18:02:14 +0000 -Subject: [PATCH 2/6] rasdaemon: ras-mc-ctl: Modify error statistics for - HiSilicon Kunpeng9xx common errors +Subject: [PATCH 04/10] rasdaemon: ras-mc-ctl: Modify error statistics for + HiSilicon KunPeng9xx common errors -Modify the error statistics for the HiSilicon Kunpeng9xx platforms common errors +Modify the error statistics for the HiSilicon KunPeng9xx platforms common errors to display the statistics and error info based on the module and the error severity. Signed-off-by: Shiju Jose @@ -13,10 +13,10 @@ Signed-off-by: Shiju Jose 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 1e3aeb7..22ba1fd 100755 +index b22dd60..08eb287 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -1535,7 +1535,7 @@ sub vendor_errors_summary +@@ -1537,7 +1537,7 @@ sub vendor_errors_summary require DBI; my ($num_args, $platform_id); my ($query, $query_handle, $count, $out); @@ -25,7 +25,7 @@ index 1e3aeb7..22ba1fd 100755 $num_args = $#ARGV + 1; $platform_id = 0; -@@ -1612,13 +1612,18 @@ sub vendor_errors_summary +@@ -1614,13 +1614,18 @@ sub vendor_errors_summary # HiSilicon Kunpeng9xx common errors if ($platform_id eq HISILICON_KUNPENG_9XX) { @@ -47,7 +47,7 @@ index 1e3aeb7..22ba1fd 100755 } if ($out ne "") { print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; -@@ -1636,8 +1641,8 @@ sub vendor_errors +@@ -1638,8 +1643,8 @@ sub vendor_errors require DBI; my ($num_args, $platform_id); my ($query, $query_handle, $id, $timestamp, $out); @@ -58,7 +58,7 @@ index 1e3aeb7..22ba1fd 100755 $num_args = $#ARGV + 1; $platform_id = 0; -@@ -1725,15 +1730,28 @@ sub vendor_errors +@@ -1727,15 +1732,28 @@ sub vendor_errors # HiSilicon Kunpeng9xx common errors if ($platform_id eq HISILICON_KUNPENG_9XX) { diff --git a/0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch 
b/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch similarity index 88% rename from 0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch rename to 0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch index 7600b58c6cf8e95ca2d48c56ae51da7bc6cdabb3..2ff9537371c838036014fb04bfc7900ace448181 100644 --- a/0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch +++ b/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch @@ -1,7 +1,7 @@ -From eb93d77b417b58cba27799ae85747b8a193cf063 Mon Sep 17 00:00:00 2001 +From f5c3c03039be28bb6b5bbe00e12e9586b19a1060 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 5 Mar 2022 16:18:55 +0000 -Subject: [PATCH 3/6] rasdaemon: ras-mc-ctl: Reformat error info of the +Subject: [PATCH 05/10] rasdaemon: ras-mc-ctl: Reformat error info of the HiSilicon Kunpeng920 Reformat the code to display the error info of HiSilicon Kunpeng920. @@ -12,10 +12,10 @@ Signed-off-by: Shiju Jose 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 22ba1fd..eeaf885 100755 +index 08eb287..8755b6f 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -1669,8 +1669,9 @@ sub vendor_errors +@@ -1671,8 +1671,9 @@ sub vendor_errors $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); $out .= "module_id=$module_id, " if ($module_id); $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); @@ -27,7 +27,7 @@ index 22ba1fd..eeaf885 100755 } if ($out ne "") { print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; -@@ -1692,8 +1693,9 @@ sub vendor_errors +@@ -1694,8 +1695,9 @@ sub vendor_errors $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); $out .= "module_id=$module_id, " if ($module_id); $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); @@ -39,7 +39,7 @@ index 22ba1fd..eeaf885 100755 } if ($out ne "") { print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; -@@ -1717,8 +1719,9 @@ sub vendor_errors +@@ -1719,8 +1721,9 @@ sub 
vendor_errors $out .= "core_id=$core_id, " if ($core_id); $out .= "port_id=$port_id, " if ($port_id); $out .= "err_severity=$err_severity, " if ($err_severity); diff --git a/0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch b/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch similarity index 55% rename from 0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch rename to 0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch index 15ab710dc8bb21c6b4c44f245319eb52af1301cb..1ff38e399c290adff15b3c29a499cbcf76d16bf7 100644 --- a/0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch +++ b/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch @@ -1,10 +1,11 @@ -From 623e85c07ab21ccc89ffe2bb444eb000a2664a9d Mon Sep 17 00:00:00 2001 +From d595a9d61f9d8341a5e30d4d800e3237d6e0f390 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 5 Mar 2022 17:01:35 +0000 -Subject: [PATCH 4/6] rasdaemon: ras-mc-ctl: Add printing usage if necessary - parameters are not passed for the HiSilicon vendor-errors options +Subject: [PATCH 06/10] rasdaemon: ras-mc-ctl: Add printing usage if necessary + parameters are not passed for the vendor-error options -Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options of the ras-mc-ctl. +Add printing usage if necessary parameters are not passed +for the vendor-errors options. 
Signed-off-by: Shiju Jose --- @@ -12,10 +13,10 @@ Signed-off-by: Shiju Jose 1 file changed, 2 insertions(+) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index eeaf885..0e32cb1 100755 +index 8755b6f..959ea6b 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -1542,6 +1542,7 @@ sub vendor_errors_summary +@@ -1544,6 +1544,7 @@ sub vendor_errors_summary if ($num_args ne 0) { $platform_id = $ARGV[0]; } else { @@ -23,7 +24,7 @@ index eeaf885..0e32cb1 100755 return; } -@@ -1649,6 +1650,7 @@ sub vendor_errors +@@ -1651,6 +1652,7 @@ sub vendor_errors if ($num_args ne 0) { $platform_id = $ARGV[0]; } else { diff --git a/0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch b/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch similarity index 70% rename from 0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch rename to 0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch index 6153a85717c21b50da036c97cff54d5e3c963b27..6af2ad06985067634973a73ff66663a0619489a7 100644 --- a/0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch +++ b/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch @@ -1,52 +1,105 @@ -From 4007c95f8a8d570542ffc11676b619ea5649d0e7 Mon Sep 17 00:00:00 2001 +From 0643011831e5fb4e81edff16ad55f9a5196ec7a9 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 5 Mar 2022 18:19:38 +0000 -Subject: [PATCH 5/6] rasdaemon: ras-mc-ctl: Add support to display the +Subject: [PATCH 07/10] rasdaemon: ras-mc-ctl: Add support to display the HiSilicon vendor errors for a specified module Add support to display the HiSilicon vendor errors for a specified module. 
Signed-off-by: Shiju Jose --- - util/ras-mc-ctl.in | 119 ++++++++++++++++++++++++--------------------- - 1 file changed, 63 insertions(+), 56 deletions(-) + util/ras-mc-ctl.in | 145 +++++++++++++++++++++++++++------------------ + 1 file changed, 87 insertions(+), 58 deletions(-) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 0e32cb1..d728300 100755 +index 959ea6b..296eb87 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -96,7 +96,8 @@ Usage: $prog [OPTIONS...] +@@ -96,8 +96,9 @@ Usage: $prog [OPTIONS...] --errors Shows the errors stored at the error database. --error-count Shows the corrected and uncorrected error counts using sysfs. --vendor-errors-summary Presents a summary of the vendor-specific logged errors. - --vendor-errors Shows the vendor-specific errors stored in the error database. +- --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors. + --vendor-errors Shows the vendor-specific errors stored in the error database. + --vendor-errors Shows the vendor-specific errors for a specific module stored in the error database. - --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors. ++ --vendor-platforms List the supported platforms with platform-ids for the vendor-specific errors. --help This help message. 
EOF -@@ -1640,15 +1641,19 @@ sub vendor_errors_summary + +@@ -1535,12 +1536,14 @@ use constant { + sub vendor_errors_summary + { + require DBI; +- my ($num_args, $platform_id); ++ my ($num_args, $platform_id, $found_platform); + my ($query, $query_handle, $count, $out); + my ($module_id, $sub_module_id, $err_severity, $err_sev); + + $num_args = $#ARGV + 1; + $platform_id = 0; ++ $found_platform = 0; ++ + if ($num_args ne 0) { + $platform_id = $ARGV[0]; + } else { +@@ -1552,6 +1555,7 @@ sub vendor_errors_summary + + # HiSilicon Kunpeng920 errors + if ($platform_id eq HISILICON_KUNPENG_920) { ++ $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1615,6 +1619,7 @@ sub vendor_errors_summary + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { ++ $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1636,21 +1641,31 @@ sub vendor_errors_summary + $query_handle->finish; + } + ++ if ($platform_id && !($found_platform)) { ++ print "Platform ID $platform_id is not valid\n"; ++ } ++ + undef($dbh); + } + sub vendor_errors { require DBI; - my ($num_args, $platform_id); -+ my ($num_args, $platform_id, $module); ++ my ($num_args, $platform_id, $found_platform, $module, $found_module); my ($query, $query_handle, $id, $timestamp, $out); my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); $num_args = $#ARGV + 1; $platform_id = 0; ++ $found_platform = 0; + $module = 0; ++ $found_module = 0; if ($num_args ne 0) { $platform_id = $ARGV[0]; + if ($num_args gt 1) { + $module = $ARGV[1]; -+ } ++ } } else { usage(1); 
return; -@@ -1664,21 +1669,21 @@ sub vendor_errors +@@ -1660,27 +1675,29 @@ sub vendor_errors + + # HiSilicon Kunpeng920 errors + if ($platform_id eq HISILICON_KUNPENG_920) { ++ $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); $out = ""; while($query_handle->fetch()) { @@ -60,7 +113,7 @@ index 0e32cb1..d728300 100755 - $out .= "err_severity=$err_severity, " if ($err_severity); - $out .= "Error Registers: $regs " if ($regs); - $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && ($module eq $module_id))) { ++ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { + $out .= "$id. $timestamp Error Info: "; + $out .= "version=$version, "; + $out .= "soc_id=$soc_id, " if ($soc_id); @@ -71,6 +124,7 @@ index 0e32cb1..d728300 100755 + $out .= "err_severity=$err_severity, " if ($err_severity); + $out .= "Error Registers: $regs " if ($regs); + $out .= "\n\n"; ++ $found_module = 1; + } } if ($out ne "") { @@ -80,7 +134,7 @@ index 0e32cb1..d728300 100755 } $query_handle->finish; -@@ -1688,21 +1693,21 @@ sub vendor_errors +@@ -1690,21 +1707,22 @@ sub vendor_errors $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); $out = ""; while($query_handle->fetch()) { @@ -94,7 +148,7 @@ index 0e32cb1..d728300 100755 - $out .= "err_severity=$err_severity, " if ($err_severity); - $out .= "Error Registers: $regs " if ($regs); - $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && ($module eq $module_id))) { ++ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { + $out .= "$id. 
$timestamp Error Info: "; + $out .= "version=$version, "; + $out .= "soc_id=$soc_id, " if ($soc_id); @@ -105,6 +159,7 @@ index 0e32cb1..d728300 100755 + $out .= "err_severity=$err_severity, " if ($err_severity); + $out .= "Error Registers: $regs " if ($regs); + $out .= "\n\n"; ++ $found_module = 1; + } } if ($out ne "") { @@ -114,7 +169,7 @@ index 0e32cb1..d728300 100755 } $query_handle->finish; -@@ -1712,23 +1717,23 @@ sub vendor_errors +@@ -1714,51 +1732,56 @@ sub vendor_errors $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs)); $out = ""; while($query_handle->fetch()) { @@ -130,7 +185,7 @@ index 0e32cb1..d728300 100755 - $out .= "err_type=$err_type, " if ($err_type); - $out .= "Error Registers: $regs " if ($regs); - $out .= "\n\n"; -+ if ($module eq 0 || ($sub_module_id && ($module eq $sub_module_id))) { ++ if ($module eq 0 || ($sub_module_id && uc($module) eq uc($sub_module_id))) { + $out .= "$id. 
$timestamp Error Info: "; + $out .= "version=$version, "; + $out .= "soc_id=$soc_id, " if ($soc_id); @@ -143,6 +198,7 @@ index 0e32cb1..d728300 100755 + $out .= "err_type=$err_type, " if ($err_type); + $out .= "Error Registers: $regs " if ($regs); + $out .= "\n\n"; ++ $found_module = 1; + } } if ($out ne "") { @@ -152,7 +208,13 @@ index 0e32cb1..d728300 100755 } $query_handle->finish; } -@@ -1741,22 +1746,24 @@ sub vendor_errors + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { ++ $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); $out = ""; while($query_handle->fetch()) { @@ -172,7 +234,7 @@ index 0e32cb1..d728300 100755 - $out .= "err_severity=$err_severity, " if ($err_severity); - $out .= "Error Registers: $regs" if ($regs); - $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && ($module eq $module_id))) { ++ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { + $out .= "$id. 
$timestamp Error Info: "; + $out .= "version=$version, "; + $out .= "soc_id=$soc_id, " if ($soc_id); @@ -189,10 +251,24 @@ index 0e32cb1..d728300 100755 + $out .= "err_severity=$err_severity, " if ($err_severity); + $out .= "Error Registers: $regs" if ($regs); + $out .= "\n\n"; ++ $found_module = 1; + } } if ($out ne "") { print "HiSilicon Kunpeng9xx common error events:\n$out\n"; +@@ -1768,6 +1791,12 @@ sub vendor_errors + $query_handle->finish; + } + ++ if ($platform_id && !($found_platform)) { ++ print "Platform ID $platform_id is not valid\n"; ++ } elsif ($module && !($found_module)) { ++ print "No error record for the module $module\n"; ++ } ++ + undef($dbh); + } + -- 2.25.1 diff --git a/0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch b/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch similarity index 83% rename from 0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch rename to 0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch index 073d33562f1dd3c1d619549f85823fe906b5e788..0453e046cc97e34d6e0bff78d16b06fff2c21402 100644 --- a/0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch +++ b/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch @@ -1,21 +1,21 @@ -From 88bf3126312645843152c6c3215b54b120bcc1ec Mon Sep 17 00:00:00 2001 +From 2f23b5dc6e5831c8ef2e179bb936e13502f75041 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Mon, 7 Mar 2022 12:38:45 +0000 -Subject: [PATCH 6/6] rasdaemon: ras-mc-ctl: Relocate reading and display +Subject: [PATCH 08/10] rasdaemon: ras-mc-ctl: Relocate reading and display Kunpeng920 errors to under Kunpeng9xx Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. 
Signed-off-by: Shiju Jose --- - util/ras-mc-ctl.in | 38 ++++++++++---------------------------- - 1 file changed, 10 insertions(+), 28 deletions(-) + util/ras-mc-ctl.in | 40 ++++++++++------------------------------ + 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index d728300..2ab9602 100755 +index 296eb87..75981a0 100755 --- a/util/ras-mc-ctl.in +++ b/util/ras-mc-ctl.in -@@ -1527,7 +1527,6 @@ sub errors +@@ -1529,7 +1529,6 @@ sub errors # Definitions of the vendor platform IDs. use constant { @@ -23,18 +23,18 @@ index d728300..2ab9602 100755 HISILICON_KUNPENG_9XX => "Kunpeng9xx", }; -@@ -1549,8 +1548,8 @@ sub vendor_errors_summary +@@ -1553,8 +1552,8 @@ sub vendor_errors_summary my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - # HiSilicon Kunpeng920 errors - if ($platform_id eq HISILICON_KUNPENG_920) { -+ # HiSilicon Kunpeng9xx common errors ++ # HiSilicon Kunpeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1565,9 +1564,7 @@ sub vendor_errors_summary +@@ -1570,9 +1569,7 @@ sub vendor_errors_summary $out .= "\t$module_id: $count\n"; } if ($out ne "") { @@ -45,7 +45,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; -@@ -1585,9 +1582,7 @@ sub vendor_errors_summary +@@ -1590,9 +1587,7 @@ sub vendor_errors_summary $out .= "\t$module_id: $count\n"; } if ($out ne "") { @@ -56,7 +56,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; -@@ -1605,15 +1600,10 @@ sub vendor_errors_summary +@@ -1610,16 +1605,10 @@ sub vendor_errors_summary $out .= "\t$sub_module_id: $count\n"; } if ($out ne "") { @@ -70,10 +70,11 @@ index d728300..2ab9602 100755 - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $found_platform = 1; $query = 
"select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; $query_handle = $dbh->prepare($query); $query_handle->execute(); -@@ -1629,8 +1619,6 @@ sub vendor_errors_summary +@@ -1635,8 +1624,6 @@ sub vendor_errors_summary } if ($out ne "") { print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; @@ -82,18 +83,18 @@ index d728300..2ab9602 100755 } $query_handle->finish; } -@@ -1661,8 +1649,8 @@ sub vendor_errors +@@ -1673,8 +1660,8 @@ sub vendor_errors my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - # HiSilicon Kunpeng920 errors - if ($platform_id eq HISILICON_KUNPENG_920) { -+ # HiSilicon Kunpeng9xx common errors ++ # HiSilicon Kunpeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1683,7 +1671,7 @@ sub vendor_errors +@@ -1697,7 +1684,7 @@ sub vendor_errors } } if ($out ne "") { @@ -102,7 +103,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; -@@ -1707,7 +1695,7 @@ sub vendor_errors +@@ -1722,7 +1709,7 @@ sub vendor_errors } } if ($out ne "") { @@ -111,7 +112,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; -@@ -1733,13 +1721,10 @@ sub vendor_errors +@@ -1749,14 +1736,10 @@ sub vendor_errors } } if ($out ne "") { @@ -123,10 +124,11 @@ index d728300..2ab9602 100755 - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $found_platform = 1; $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; $query_handle = $dbh->prepare($query); 
$query_handle->execute(); -@@ -1767,8 +1752,6 @@ sub vendor_errors +@@ -1785,8 +1768,6 @@ sub vendor_errors } if ($out ne "") { print "HiSilicon Kunpeng9xx common error events:\n$out\n"; @@ -135,7 +137,7 @@ index d728300..2ab9602 100755 } $query_handle->finish; } -@@ -1779,7 +1762,6 @@ sub vendor_errors +@@ -1803,7 +1784,6 @@ sub vendor_errors sub vendor_platforms { print "\nSupported platforms for the vendor-specific errors:\n"; diff --git a/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch b/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch new file mode 100644 index 0000000000000000000000000000000000000000..e34f89f21db44556fb17a663c653c37b7ad4e271 --- /dev/null +++ b/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch @@ -0,0 +1,127 @@ +From df6011fed2bb45989f9e5c2ea30b33937b08d06c Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 28 Apr 2022 18:58:43 +0100 +Subject: [PATCH 09/10] rasdaemon: ras-mc-ctl: Updated HiSilicon platform name + +Updated the HiSilicon platform name as KunPeng9xx. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 75981a0..1cc19b3 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1529,7 +1529,7 @@ sub errors + + # Definitions of the vendor platform IDs. 
+ use constant { +- HISILICON_KUNPENG_9XX => "Kunpeng9xx", ++ HISILICON_KUNPENG_9XX => "KunPeng9xx", + }; + + sub vendor_errors_summary +@@ -1552,7 +1552,7 @@ sub vendor_errors_summary + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng9xx errors ++ # HiSilicon KunPeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; +@@ -1569,7 +1569,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type1 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1587,7 +1587,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type2 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1605,7 +1605,7 @@ sub vendor_errors_summary + $out .= "\t$sub_module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx PCIe controller error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1623,7 +1623,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx common error events summary:\n$out\n"; + } + $query_handle->finish; + } +@@ -1660,7 +1660,7 @@ sub vendor_errors + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng9xx errors ++ # HiSilicon KunPeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, 
sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; +@@ -1684,7 +1684,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type1 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1709,7 +1709,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type2 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1736,7 +1736,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx PCIe controller error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1767,7 +1767,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx common error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx common error events:\n$out\n"; + } + $query_handle->finish; + } +@@ -1784,7 +1784,7 @@ sub vendor_errors + sub vendor_platforms + { + print "\nSupported platforms for the vendor-specific errors:\n"; +- print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; ++ print "\tHiSilicon KunPeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; + print "\n"; + } + +-- +2.25.1 + diff --git a/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch b/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch new file mode 100644 index 0000000000000000000000000000000000000000..48a62cc0ba5d90bb57add7ed778326d5f91d10d8 --- /dev/null +++ b/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch @@ -0,0 +1,90 @@ +From c019f2f82b7f224e95968037f2afc16f63cc1d1d Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 28 Apr 2022 22:59:04 +0100 +Subject: [PATCH 10/10] rasdaemon: Fix for a memory out-of-bounds issue and + optimized code to remove duplicate function. 
+ +Fixed a memory out-of-bounds issue with string pointers and +optimized code structure to remove duplicate function. + +Signed-off-by: Lei Feng +Signed-off-by: Shiju Jose +--- + non-standard-hisi_hip08.c | 6 +++--- + non-standard-hisilicon.c | 2 +- + ras-non-standard-handler.c | 16 +--------------- + 3 files changed, 5 insertions(+), 19 deletions(-) + +diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c +index 9092183..4ef47ea 100644 +--- a/non-standard-hisi_hip08.c ++++ b/non-standard-hisi_hip08.c +@@ -1014,15 +1014,15 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, + + static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { + { +- .sec_type = "1f8161e155d641e6bd107afd1dc5f7c5", ++ .sec_type = "1f8161e1-55d6-41e6-bd10-7afd1dc5f7c5", + .decode = decode_hip08_oem_type1_error, + }, + { +- .sec_type = "45534ea6ce2341158535e07ab3aef91d", ++ .sec_type = "45534ea6-ce23-4115-8535-e07ab3aef91d", + .decode = decode_hip08_oem_type2_error, + }, + { +- .sec_type = "b2889fc9e7d74f9da867af42e98be772", ++ .sec_type = "b2889fc9-e7d7-4f9d-a867-af42e98be772", + .decode = decode_hip08_pcie_local_error, + }, + }; +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index d1e1774..6ee9271 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -387,7 +387,7 @@ static int decode_hisi_common_section(struct ras_events *ras, + + static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { + { +- .sec_type = "c8b328a899174af69a132e08ab2e7586", ++ .sec_type = "c8b328a8-9917-4af6-9a13-2e08ab2e7586", + .decode = decode_hisi_common_section, + }, + }; +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +index 6d5a6f8..6932e58 100644 +--- a/ras-non-standard-handler.c ++++ b/ras-non-standard-handler.c +@@ -52,20 +52,6 @@ static char *uuid_le(const char *uu) + return uuid; + } + +-static int uuid_le_cmp(const char *sec_type, const char *uuid2) +-{ +- static char uuid1[32]; +- char *p = uuid1; +- int i; +- 
static const unsigned char le[16] = { +- 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; +- +- for (i = 0; i < 16; i++) +- p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]); +- *p = 0; +- return strncmp(uuid1, uuid2, 32); +-} +- + int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) + { + struct ras_ns_ev_decoder *list; +@@ -96,7 +82,7 @@ static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p + + ns_ev_decoder = ras_ns_ev_dec_list; + while (ns_ev_decoder) { +- if (uuid_le_cmp(sec_type, ns_ev_decoder->sec_type) == 0) { ++ if (strcmp(uuid_le(sec_type), ns_ev_decoder->sec_type) == 0) { + *p_ns_ev_dec = ns_ev_decoder; + match = 1; + break; +-- +2.25.1 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 62576b1acbab1c17be47be2bafb57795f58e74c8..1c7fb5b6c88ed9462ef4789120a1e1af2cede0d2 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.7 -Release: 4 +Release: 5 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -23,18 +23,20 @@ Patch1: bugfix-rasdaemon-wait-for-file-access.patch Patch2: bugfix-fix-fd-check.patch Patch3: bugfix-fix-disk-error-log-storm.patch Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch -Patch5: 0001-Support-cpu-fault-isolation-for-corrected-errors.patch -Patch6: 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch +Patch5: 0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch +Patch6: 0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch Patch7: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch Patch8: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch Patch9: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch Patch10: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch -Patch11: 
0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch -Patch12: 0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch -Patch13: 0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch -Patch14: 0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch -Patch15: 0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch -Patch16: 0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch +Patch11: 0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch +Patch12: 0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch +Patch13: 0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch +Patch14: 0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch +Patch15: 0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch +Patch16: 0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch +Patch17: 0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch +Patch18: 0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch %description The rasdaemon program is a daemon which monitors the platform @@ -53,7 +55,7 @@ autoheader libtoolize --automake --copy --debug --force automake --add-missing %ifarch %{arm} aarch64 -%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation %else %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror %endif @@ -80,6 +82,23 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || : %changelog +* Mon 
May 23 2022 Shiju Jose - 0.6.7-5 +- Type:feature +- ID:NA +- SUG:NA +- DESC: + Update with the latest patches for the following: + 1. CPU online fault isolation for ARM events. + 2. Modify the recording of HiSilicon common error data in rasdaemon. + 3. In ras-mc-ctl: + 3.1. Improve HiSilicon common error statistics. + 3.2. Add support to display the HiSilicon vendor-errors for a specified module. + 3.3. Print the usage if the necessary parameters are not passed for the HiSilicon vendor-errors options. + 3.4. Reformat the error info of the HiSilicon Kunpeng920. + 3.5. Relocate reading and displaying of Kunpeng920 errors under Kunpeng9xx. + 3.6. Update the HiSilicon platform name to KunPeng9xx. + 4. Fix a memory out-of-bounds issue in rasdaemon. + * Mon Mar 07 2022 Shiju Jose - 0.6.7-4 - Type:feature - ID:NA