From d594e333f41c0681240b88030f10ce3e3f197d7f Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Mon, 28 Mar 2022 16:19:44 +0800 Subject: [PATCH] sync master aand 22.03 --- ...fault-isolation-for-corrected-errors.patch | 1086 ++++++++--------- ...-recording-Hisilicon-common-error-da.patch | 224 ++++ ...ult-isolation-for-recoverable-errors.patch | 138 +++ ...-ctl-Modify-error-statistics-for-HiS.patch | 97 ++ ...-ctl-Reformat-error-info-of-the-HiSi.patch | 56 + ...-ctl-Add-printing-usage-if-necessary.patch | 36 + ...-ctl-Add-support-to-display-the-HiSi.patch | 198 +++ ...-ctl-Relocate-reading-and-display-Ku.patch | 148 +++ ...add-trace-print-and-add-sqlite-store.patch | 78 -- ...rse-for-adapting-to-new-bios-version.patch | 60 - rasdaemon.spec | 73 +- 11 files changed, 1426 insertions(+), 768 deletions(-) rename 0006-add-cpu-online-fault-isolation.patch => 0001-Support-cpu-fault-isolation-for-corrected-errors.patch (36%) create mode 100644 0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch create mode 100644 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch create mode 100644 0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch create mode 100644 0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch create mode 100644 0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch create mode 100644 0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch create mode 100644 0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch delete mode 100644 0007-add-trace-print-and-add-sqlite-store.patch delete mode 100644 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch diff --git a/0006-add-cpu-online-fault-isolation.patch b/0001-Support-cpu-fault-isolation-for-corrected-errors.patch similarity index 36% rename from 0006-add-cpu-online-fault-isolation.patch rename to 0001-Support-cpu-fault-isolation-for-corrected-errors.patch index 6228b3f..d5460de 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0001-Support-cpu-fault-isolation-for-corrected-errors.patch @@ -1,24 +1,24 @@ -From 9e2d3f84c4f158dd58bce4a30eec568331749501 Mon Sep 17 00:00:00 2001 -From: Lostwayzxc -Date: Tue, 25 May 2021 20:05:49 +0800 -Subject: [PATCH] add cpu online fault isolation +From a8e02e7d3d910eb7d049fd4126d53b8d3121d798 Mon Sep 17 00:00:00 2001 +From: Shengwei Luo +Date: Wed, 23 Feb 2022 17:21:58 +0800 +Subject: [PATCH 1/2] Support cpu fault isolation for corrected errors -Add cpu online fault isolation, when CE/UCE occurs, we choose to offline -the error cpu according to threshold algorithm. +When the corrected errors exceed the set limit in cycle, try to +offline the related cpu core. -Signed-off-by: Luo Shengwei +Signed-off-by: Shengwei Luo --- Makefile.am | 6 +- - configure.ac | 11 + + configure.ac | 11 ++ misc/rasdaemon.env | 17 ++ - queue.c | 126 +++++++++++ - queue.h | 43 ++++ - ras-arm-handler.c | 73 +++++++ - ras-cpu-isolation.c | 499 ++++++++++++++++++++++++++++++++++++++++++++ - ras-cpu-isolation.h | 76 +++++++ - ras-events.c | 8 + - ras-record.h | 5 + - 10 files changed, 863 insertions(+), 1 deletion(-) + queue.c | 121 ++++++++++++++ + queue.h | 39 +++++ + ras-arm-handler.c | 84 ++++++++++ + ras-arm-handler.h | 18 +++ + ras-cpu-isolation.c | 378 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.h | 68 ++++++++ + ras-events.c | 9 +- + 10 files changed, 749 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c @@ -75,7 +75,7 @@ index 33b81fe..d098fcf 100644 + CPU fault isolation : $USE_CPU_FAULT_ISOLATION EOF diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..3191d03 100644 +index 12fd766..7cb18e8 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env @@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50" @@ -99,12 +99,13 @@ index 12fd766..3191d03 100644 + +# Prevent excessive isolation from causing an avalanche effect +CPU_ISOLATION_LIMIT="10" +\ No newline at end of file diff --git a/queue.c b/queue.c new file mode 100644 -index 0000000..92f3d3c +index 0000000..ed66798 --- /dev/null +++ b/queue.c -@@ -0,0 +1,126 @@ +@@ -0,0 +1,121 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -117,126 +118,121 @@ index 0000000..92f3d3c + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. -+*/ ++ */ +#include +#include +#include "queue.h" +#include "ras-logger.h" + -+ +int is_empty(struct link_queue *queue) +{ -+ if (queue) { -+ return queue->size == 0; -+ } ++ if (queue) ++ return queue->size == 0; + -+ return 1; ++ return 1; +} + -+struct link_queue* init_queue(void) ++struct link_queue *init_queue(void) +{ -+ struct link_queue* queue; -+ queue = (struct link_queue*) malloc(sizeof(struct link_queue)); ++ struct link_queue *queue = NULL; + -+ if (queue == NULL) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); -+ return NULL; -+ } ++ queue = (struct link_queue *)malloc(sizeof(struct link_queue)); ++ ++ if (queue == NULL) { ++ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); ++ return NULL; ++ } + -+ queue->size = 0; -+ queue->head = NULL; -+ queue->tail = NULL; ++ queue->size = 0; ++ queue->head = NULL; ++ queue->tail = NULL; + -+ return queue; ++ return queue; +} + +void clear_queue(struct link_queue *queue) +{ -+ if (queue == NULL) { -+ return; -+ } -+ -+ struct queue_node *node = queue->head; -+ struct queue_node *tmp = NULL; -+ -+ while (node != NULL) { -+ tmp = node; -+ node = node->next; -+ free(tmp); -+ } -+ -+ queue->head = NULL; -+ queue->tail = NULL; -+ queue->size = 0; ++ if (queue == NULL) ++ return; ++ ++ struct queue_node *node = queue->head; ++ struct queue_node *tmp = NULL; ++ ++ while (node != NULL) { ++ tmp = node; ++ node = node->next; ++ free(tmp); ++ } ++ ++ queue->head = NULL; ++ queue->tail = NULL; ++ queue->size = 0; +} + -+void free_queue(struct link_queue *queue) { -+ clear_queue(queue); ++void free_queue(struct link_queue *queue) ++{ ++ clear_queue(queue); + -+ if (queue) { -+ free(queue); -+ } ++ if (queue) ++ free(queue); +} + +/* It should be guranteed that the param is not NULL */ +void push(struct link_queue *queue, struct queue_node *node) +{ -+ /* there is no element in the queue */ -+ if (queue->head == NULL) { -+ queue->head = node; -+ } -+ else { -+ node->next = queue->tail->next; -+ queue->tail->next = node; -+ } -+ -+ queue->tail = node; -+ (queue->size)++; ++ /* there is no element in the queue */ ++ if (queue->head == NULL) ++ queue->head = node; ++ else ++ queue->tail->next = node; ++ ++ queue->tail = node; ++ (queue->size)++; +} + +int pop(struct link_queue *queue) +{ -+ if (queue == NULL || is_empty(queue)) { -+ return -1; -+ } ++ struct queue_node *tmp = NULL; ++ ++ if (queue == NULL || is_empty(queue)) ++ return -1; + -+ struct queue_node *tmp = NULL; -+ tmp = queue->head; -+ queue->head = queue->head->next; -+ free(tmp); -+ (queue->size)--; ++ tmp = queue->head; ++ queue->head = queue->head->next; ++ free(tmp); ++ (queue->size)--; + -+ return 0; ++ return 0; +} + -+struct queue_node* front(struct link_queue *queue) ++struct queue_node *front(struct link_queue *queue) +{ -+ if (queue == NULL) { -+ return NULL; -+ } ++ if (queue == NULL) ++ return NULL; + -+ return queue->head; ++ return queue->head; +} + -+struct queue_node* node_create(time_t time, unsigned value) ++struct queue_node *node_create(time_t time, unsigned int value) +{ -+ struct queue_node *node = NULL; -+ node = (struct queue_node*) malloc(sizeof(struct queue_node)); ++ struct queue_node *node = NULL; + -+ if (node != NULL) { -+ node->time = time; -+ node->value = value; -+ node->next = NULL; -+ } ++ node = (struct queue_node *)malloc(sizeof(struct queue_node)); + -+ return node; ++ if (node != NULL) { ++ node->time = time; ++ node->value = value; ++ node->next = NULL; ++ } ++ ++ return node; +} diff --git a/queue.h b/queue.h new file mode 100644 -index 0000000..9684c58 +index 0000000..5459f40 --- /dev/null +++ b/queue.h -@@ -0,0 +1,43 @@ +@@ -0,0 +1,39 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -249,122 +245,135 @@ index 0000000..9684c58 + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. -+*/ ++ */ + +#ifndef __RAS_QUEUE_H +#define __RAS_QUEUE_H + -+ -+struct queue_node -+{ -+ time_t time; -+ unsigned value; -+ struct queue_node *next; ++struct queue_node { ++ time_t time; ++ unsigned int value; ++ struct queue_node *next; +}; + -+struct link_queue -+{ -+ struct queue_node *head; -+ struct queue_node *tail; -+ int size; ++struct link_queue { ++ struct queue_node *head; ++ struct queue_node *tail; ++ int size; +}; + +int is_empty(struct link_queue *queue); -+struct link_queue* init_queue(void); ++struct link_queue *init_queue(void); +void clear_queue(struct link_queue *queue); +void free_queue(struct link_queue *queue); +void push(struct link_queue *queue, struct queue_node *node); +int pop(struct link_queue *queue); -+struct queue_node* front(struct link_queue *queue); -+struct queue_node* node_create(time_t time, unsigned value); -+ ++struct queue_node *front(struct link_queue *queue); ++struct queue_node *node_create(time_t time, unsigned int value); + +#endif -\ No newline at end of file diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 1149dc6..a64f20b 100644 +index 1149dc6..c9ef2fd 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c -@@ -22,6 +22,44 @@ +@@ -22,6 +22,10 @@ #include "ras-report.h" #include "ras-non-standard-handler.h" #include "non-standard-ampere.h" +#include "ras-cpu-isolation.h" + ++#define ARM_ERR_VALID_ERROR_COUNT BIT(0) ++#define ARM_ERR_VALID_FLAGS BIT(1) + + void display_raw_data(struct trace_seq *s, + const uint8_t *buf, +@@ -42,6 +46,44 @@ void display_raw_data(struct trace_seq *s, + } + } + +#ifdef HAVE_CPU_FAULT_ISOLATION -+static int is_core_failure(unsigned long value) ++static int count_errors(struct ras_arm_event *ev) +{ -+ /* -+ * core failure: -+ * Bit 0\1\3: (at lease 1) -+ * Bit 2: 0 -+ */ -+ return (value & 0xf) && !(value & (0x1 << 2)); -+} -+ -+static int count_errors(struct event_format *event, const uint8_t *data, int len) -+{ -+ /* -+ * According to UEFI_2_9_2021_03_18 specification chapter N2.4.4, -+ * the length of struct processor error information is 32, the byte -+ * length of the Flags field is 1, and the byte offset is 7 in the struct. -+ */ -+ int cur_offset = 7; -+ unsigned long value; ++ struct ras_arm_err_info *err_info; ++ int num_pei; ++ int err_info_size = sizeof(struct ras_arm_err_info); + int num = 0; -+ if (len % PEI_ERR_SIZE != 0) { -+ log(TERM, LOG_ERR, "the event data does not match to the ARM Processor Error Information Structure\n"); ++ int i; ++ int error_count; ++ ++ if (ev->pei_len % err_info_size != 0) { ++ log(TERM, LOG_ERR, ++ "The event data does not match to the ARM Processor Error Information Structure\n"); + return num; + } -+ while (cur_offset < len) { -+ value = pevent_read_number(event->pevent, data+cur_offset, FLAGS_SIZE); -+ if (is_core_failure(value)) { -+ num++; -+ log(TERM, LOG_INFO, "Error in cpu core catched\n"); ++ num_pei = ev->pei_len / err_info_size; ++ err_info = (struct ras_arm_err_info *)(ev->pei_error); ++ ++ for (i = 0; i < num_pei; ++i) { ++ error_count = 1; ++ if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) { ++ /* ++ * The value of this field is defined as follows: ++ * 0: Single Error ++ * 1: Multiple Errors ++ * 2-65535: Error Count ++ */ ++ error_count = err_info->multiple_error + 1; + } -+ cur_offset += PEI_ERR_SIZE; ++ ++ num += error_count; ++ err_info += 1; + } ++ log(TERM, LOG_INFO, "%d error in cpu core catched\n", num); + return num; +} +#endif - - void display_raw_data(struct trace_seq *s, - const uint8_t *buf, -@@ -139,6 +177,41 @@ int ras_arm_event_handler(struct trace_seq *s, ++ + int ras_arm_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context) +@@ -139,6 +181,48 @@ int ras_arm_event_handler(struct trace_seq *s, display_raw_data(s, ev.vsei_error, ev.oem_len); #endif +#ifdef HAVE_CPU_FAULT_ISOLATION ++ int cpu; ++ int nums; ++ char *severity; ++ struct error_info err_info; ++ ++ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0) ++ return -1; ++ cpu = val; ++ trace_seq_printf(s, "\n cpu: %d", cpu); ++ + /* record cpu error */ + if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) + return -1; -+ /* refer to UEFI_2_9_2021_03_18 specification chapter N2.2 Table N-5 */ ++ /* refer to UEFI_2_9 specification chapter N2.2 Table N-5 */ + switch (val) { + case GHES_SEV_NO: -+ ev.severity = "Informational"; ++ severity = "Informational"; + break; + case GHES_SEV_CORRECTED: -+ ev.severity = "Corrected"; ++ severity = "Corrected"; + break; + case GHES_SEV_RECOVERABLE: -+ ev.severity = "Recoverable"; ++ severity = "Recoverable"; + break; + default: + case GHES_SEV_PANIC: -+ ev.severity = "Fatal"; ++ severity = "Fatal"; + } ++ trace_seq_printf(s, "\n severity: %s", severity); + -+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { -+ int len, nums; -+ ev.error_info = pevent_get_field_raw(s, event, "buf", record, &len, 1); -+ if (!ev.error_info) -+ return -1; -+ ev.length = len; -+ /* relate to enum error_type */ -+ nums = count_errors(event, ev.error_info, len); ++ if (val == GHES_SEV_CORRECTED) { ++ nums = count_errors(&ev); + if (nums > 0) { -+ struct error_info err_info = {nums, now, val}; -+ ras_record_cpu_error(&err_info, ev.mpidr); ++ err_info.nums = nums; ++ err_info.time = now; ++ err_info.err_type = val; ++ ras_record_cpu_error(&err_info, cpu); + } + } +#endif @@ -372,12 +381,41 @@ index 1149dc6..a64f20b 100644 /* Insert data into the SGBD */ #ifdef HAVE_SQLITE3 ras_store_arm_record(ras, &ev); +diff --git a/ras-arm-handler.h b/ras-arm-handler.h +index 563a2d3..52813e7 100644 +--- a/ras-arm-handler.h ++++ b/ras-arm-handler.h +@@ -17,6 +17,24 @@ + #include "ras-events.h" + #include "libtrace/event-parse.h" + ++/* ++ * ARM Processor Error Information Structure, According to ++ * UEFI_2_9 specification chapter N2.4.4. ++ */ ++#pragma pack(1) ++struct ras_arm_err_info { ++ uint8_t version; ++ uint8_t length; ++ uint16_t validation_bits; ++ uint8_t type; ++ uint16_t multiple_error; ++ uint8_t flags; ++ uint64_t error_info; ++ uint64_t virt_fault_addr; ++ uint64_t physical_fault_addr; ++}; ++#pragma pack() ++ + int ras_arm_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..6dcff70 +index 0000000..8c0cdf9 --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,499 @@ +@@ -0,0 +1,378 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -390,7 +428,7 @@ index 0000000..6dcff70 + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. -+*/ ++ */ + +#include +#include @@ -403,486 +441,365 @@ index 0000000..6dcff70 +#include "ras-logger.h" +#include "ras-cpu-isolation.h" + -+static struct cpu_info *cpu_infos = NULL; -+static unsigned int ncores, cores_per_socket, cores_per_die; -+static unsigned int sockets, dies = 1; ++static struct cpu_info *cpu_infos; ++static unsigned int ncores; +static unsigned int enabled = 1; +static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; -+static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; -+static const char *node_path = "/sys/devices/system/node/possible"; + +static const struct param normal_units[] = { -+ { "", 1 }, -+ {} ++ {"", 1}, ++ {} +}; + +static const struct param cycle_units[] = { -+ { "d", 24 * 60 * 60 }, -+ { "h", 60 * 60 }, -+ { "m", 60 }, -+ { "s", 1 }, -+ {} ++ {"d", 24 * 60 * 60}, ++ {"h", 60 * 60}, ++ {"m", 60}, ++ {"s", 1}, ++ {} +}; + +static struct isolation_param threshold = { -+ .name = "CPU_CE_THRESHOLD", -+ .units = normal_units, -+ .value = 18, -+ .limit = 10000 ++ .name = "CPU_CE_THRESHOLD", ++ .units = normal_units, ++ .value = 18, ++ .limit = 10000 +}; + +static struct isolation_param cpu_limit = { -+ .name = "CPU_ISOLATION_LIMIT", -+ .units = normal_units ++ .name = "CPU_ISOLATION_LIMIT", ++ .units = normal_units +}; + +static struct isolation_param cycle = { -+ .name = "CPU_ISOLATION_CYCLE", -+ .units = cycle_units, -+ .value = 24 * 60 * 60, -+ .limit = 30 * 24 * 60 * 60 ++ .name = "CPU_ISOLATION_CYCLE", ++ .units = cycle_units, ++ .value = 24 * 60 * 60, ++ .limit = 30 * 24 * 60 * 60 +}; + -+static const char *cpu_state[] = { -+ [CPU_OFFLINE] = "offline", -+ [CPU_ONLINE] = "online", -+ [CPU_OFFLINE_FAILED] = "offline-failed", -+ [CPU_UNKNOWN] = "unknown" ++static const char * const cpu_state[] = { ++ [CPU_OFFLINE] = "offline", ++ [CPU_ONLINE] = "online", ++ [CPU_OFFLINE_FAILED] = "offline-failed", ++ [CPU_UNKNOWN] = "unknown" +}; + -+static int open_sys_file(unsigned cpu, int __oflag, const char *format) ++static int open_sys_file(unsigned int cpu, int __oflag, const char *format) +{ -+ int fd; -+ char buf[MAX_PATH_LEN] = ""; -+ snprintf(buf, sizeof(buf), format, cpu); -+ fd = open(buf, __oflag); -+ -+ if (fd == -1) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf); -+ return -1; -+ } ++ int fd; ++ char buf[MAX_PATH_LEN] = ""; + -+ return fd; -+} ++ snprintf(buf, sizeof(buf), format, cpu); ++ fd = open(buf, __oflag); + -+static int get_sockets(void) -+{ -+ int fd, j; -+ char buf[MAX_BUF_LEN] = ""; -+ cores_per_socket = ncores; -+ struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores); -+ -+ if (!cpu_sets) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__); -+ return -1; -+ } -+ -+ for (int i = 0; i < ncores; ++i) { -+ fd = open_sys_file(i, O_RDONLY, core_siblings_list_path); -+ if (fd == -1) { -+ continue; -+ } -+ memset(buf, '\0', strlen(buf)); -+ if (read(fd, buf, sizeof(buf)) <= 0) { -+ close(fd); -+ continue; -+ } -+ for (j = 0; j < sockets; ++j) { -+ if (strcmp(cpu_sets[j].buf, buf) == 0) { -+ break; -+ } -+ } -+ if (j == sockets) { -+ strcpy(cpu_sets[sockets].buf, buf); -+ sockets++; -+ } -+ close(fd); -+ } -+ -+ free(cpu_sets); -+ cores_per_socket = sockets > 0 ? ncores / sockets : ncores; -+ -+ return 0; ++ if (fd == -1) { ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf); ++ return -1; ++ } ++ ++ return fd; +} + -+static int get_dies(void) ++static int get_cpu_status(unsigned int cpu) +{ -+ int fd, begin, end; -+ char buf[20] = ""; -+ cores_per_die = ncores; -+ fd = open(node_path, O_RDONLY); ++ int fd, num; ++ char buf[2] = ""; + -+ if (fd == -1) { -+ return -1; -+ } ++ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); ++ if (fd == -1) ++ return CPU_UNKNOWN; + -+ if (read(fd, buf, sizeof(buf))) { -+ if (sscanf(buf, "%d-%d", &begin, &end) == 2) { -+ dies = end > begin ? end - begin + 1 : 1; -+ } -+ } ++ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1) ++ num = CPU_UNKNOWN; + -+ close(fd); -+ cores_per_die = ncores / dies; ++ close(fd); + -+ return 0; ++ return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num; +} + -+static int get_cpu_status(unsigned cpu) ++static int init_cpu_info(unsigned int cpus) +{ -+ int fd, num; -+ char buf[2] = ""; -+ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); -+ -+ if (fd == -1) { -+ return CPU_UNKNOWN; -+ } ++ ncores = cpus; ++ cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus); + -+ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1) { -+ num = CPU_UNKNOWN; -+ } ++ if (!cpu_infos) { ++ log(TERM, LOG_ERR, ++ "Failed to allocate memory for cpu infos in %s.\n", __func__); ++ return -1; ++ } + -+ close(fd); ++ for (unsigned int i = 0; i < cpus; ++i) { ++ cpu_infos[i].ce_nums = 0; ++ cpu_infos[i].state = get_cpu_status(i); ++ cpu_infos[i].ce_queue = init_queue(); + -+ return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num; -+} ++ if (cpu_infos[i].ce_queue == NULL) { ++ log(TERM, LOG_ERR, ++ "Failed to allocate memory for cpu ce queue in %s.\n", __func__); ++ return -1; ++ } ++ } ++ /* set limit of offlined cpu limit according to number of cpu */ ++ cpu_limit.limit = cpus - 1; ++ cpu_limit.value = 0; + -+static int init_cpu_info(unsigned cpus) -+{ -+ ncores = cpus; -+ cpu_infos = (struct cpu_info *) malloc(sizeof(*cpu_infos) * cpus); -+ -+ if (!cpu_infos) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for cpu infos in %s.\n", __func__); -+ return -1; -+ } -+ -+ for (unsigned int i = 0; i < cpus; ++i) { -+ cpu_infos[i].state = get_cpu_status(i); -+ cpu_infos[i].ce_queue = init_queue(); -+ if (cpu_infos[i].ce_queue == NULL) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for cpu ce queue in %s.\n", __func__); -+ return -1; -+ } -+ } -+ /* set limit of offlined cpu limit according to number of cpu */ -+ cpu_limit.limit = cpus - 1; -+ cpu_limit.value = 0; -+ -+ if (get_sockets() < 0 || get_dies() < 0) { -+ log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n"); -+ return -1; -+ } -+ -+ return 0; ++ return 0; +} + +static void check_config(struct isolation_param *config) +{ -+ if (config->value > config->limit) { -+ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", -+ config->value, config->limit); -+ config->value = config->limit; -+ } ++ if (config->value > config->limit) { ++ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", ++ config->value, config->limit); ++ config->value = config->limit; ++ } +} + +static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value) +{ -+ int env_size, has_unit = 0; -+ -+ if (!env || strlen(env) == 0) { -+ return -1; -+ } -+ -+ env_size = strlen(env); -+ char *unit = NULL; -+ unit = env + env_size - 1; -+ -+ if (isalpha(*unit)) { -+ has_unit = 1; -+ env_size--; -+ if (env_size <= 0) { -+ return -1; -+ } -+ } -+ -+ for (int i = 0; i < env_size; ++i) { -+ if (isdigit(env[i])) { -+ if (*value > ULONG_MAX / 10 || (*value == ULONG_MAX / 10 && env[i] - '0' > 5)) { -+ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = 10 * (*value) + (env[i] - '0'); -+ } -+ else { -+ return -1; -+ } -+ } -+ -+ if (has_unit) { -+ for (const struct param *units = config->units; units->name; units++) { -+ /* value character and unit character are both valid */ -+ if (!strcasecmp(unit, units->name)) { -+ if (*value > (ULONG_MAX / units->value)) { -+ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = (*value) * units->value; -+ return 0; -+ } -+ } -+ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); -+ return -1; -+ } -+ -+ return 0; ++ char *unit = NULL; ++ int env_size, has_unit = 0; ++ ++ if (!env || strlen(env) == 0) ++ return -1; ++ ++ env_size = strlen(env); ++ unit = env + env_size - 1; ++ ++ if (isalpha(*unit)) { ++ has_unit = 1; ++ env_size--; ++ if (env_size <= 0) ++ return -1; ++ } ++ ++ for (int i = 0; i < env_size; ++i) { ++ if (isdigit(env[i])) { ++ if (*value > ULONG_MAX / 10 || ++ (*value == ULONG_MAX / 10 && env[i] - '0' > 5)) { ++ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; ++ } ++ *value = 10 * (*value) + (env[i] - '0'); ++ } else ++ return -1; ++ } ++ ++ if (has_unit) { ++ for (const struct param *units = config->units; units->name; units++) { ++ /* value character and unit character are both valid */ ++ if (!strcasecmp(unit, units->name)) { ++ if (*value > (ULONG_MAX / units->value)) { ++ log(TERM, LOG_ERR, ++ "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; ++ } ++ *value = (*value) * units->value; ++ return 0; ++ } ++ } ++ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); ++ return -1; ++ } ++ ++ return 0; +} + +static void init_config(struct isolation_param *config) +{ -+ char *env = getenv(config->name); -+ unsigned long value = 0; ++ char *env = getenv(config->name); ++ unsigned long value = 0; + -+ if (parse_ul_config(config, env, &value) < 0) { -+ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %ld.\n", -+ config->name, env, config->value); -+ return; -+ } ++ if (parse_ul_config(config, env, &value) < 0) { ++ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %ld.\n", ++ config->name, env, config->value); ++ return; ++ } + -+ config->value = value; -+ check_config(config); ++ config->value = value; ++ check_config(config); +} + +static int check_config_status(void) +{ -+ char *env = getenv("CPU_ISOLATION_ENABLE"); ++ char *env = getenv("CPU_ISOLATION_ENABLE"); + -+ if (env == NULL || strcasecmp(env, "yes")) { -+ return -1; -+ } ++ if (env == NULL || strcasecmp(env, "yes")) ++ return -1; + -+ return 0; ++ return 0; +} + -+void ras_error_count_init(unsigned cpus) ++void ras_cpu_isolation_init(unsigned int cpus) +{ -+ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { -+ enabled = 0; -+ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n"); -+ return; -+ } -+ -+ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n"); -+ init_config(&threshold); -+ init_config(&cpu_limit); -+ init_config(&cycle); ++ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { ++ enabled = 0; ++ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n"); ++ return; ++ } ++ ++ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n"); ++ init_config(&threshold); ++ init_config(&cpu_limit); ++ init_config(&cycle); +} + +void cpu_infos_free(void) +{ -+ if (cpu_infos) { -+ for (int i = 0; i < ncores; ++i) { -+ free_queue(cpu_infos[i].ce_queue); -+ } -+ free(cpu_infos); -+ } ++ if (cpu_infos) { ++ for (int i = 0; i < ncores; ++i) ++ free_queue(cpu_infos[i].ce_queue); ++ ++ free(cpu_infos); ++ } +} + -+static int do_cpu_offline(unsigned cpu) ++static int do_cpu_offline(unsigned int cpu) +{ -+ int fd, rc; -+ char buf[2] = ""; -+ cpu_infos[cpu].state = CPU_OFFLINE_FAILED; -+ fd = open_sys_file(cpu, O_RDWR, cpu_path_format); ++ int fd, rc; ++ char buf[2] = ""; + -+ if (fd == -1) { -+ return HANDLE_FAILED; -+ } ++ cpu_infos[cpu].state = CPU_OFFLINE_FAILED; ++ fd = open_sys_file(cpu, O_RDWR, cpu_path_format); ++ if (fd == -1) ++ return HANDLE_FAILED; + -+ strcpy(buf, "0"); -+ rc = write(fd, buf, strlen(buf)); ++ strcpy(buf, "0"); ++ rc = write(fd, buf, strlen(buf)); + -+ if (rc < 0) { -+ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); -+ close(fd); -+ return HANDLE_FAILED; -+ } ++ if (rc < 0) { ++ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); ++ close(fd); ++ return HANDLE_FAILED; ++ } + -+ close(fd); -+ /* check wthether the cpu is isolated successfully */ -+ cpu_infos[cpu].state = get_cpu_status(cpu); ++ close(fd); ++ /* check wthether the cpu is isolated successfully */ ++ cpu_infos[cpu].state = get_cpu_status(cpu); + -+ if (cpu_infos[cpu].state == CPU_OFFLINE) { -+ return HANDLE_SUCCEED; -+ } ++ if (cpu_infos[cpu].state == CPU_OFFLINE) ++ return HANDLE_SUCCEED; + -+ return HANDLE_FAILED; ++ return HANDLE_FAILED; +} + -+static int do_ce_handler(unsigned cpu) ++static int do_ce_handler(unsigned int cpu) +{ -+ struct link_queue *queue = cpu_infos[cpu].ce_queue; -+ unsigned tmp; -+ /* -+ * Since we just count all error numbers in setted cycle, we store the time -+ * and error numbers from current event to the queue, then everytime we -+ * calculate the period from beginning time to ending time, if the period -+ * exceeds setted cycle, we pop the beginning time and error until the period -+ * from new beginning time to ending time is less than cycle. -+ */ -+ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { -+ tmp = queue->head->value; -+ if (pop(queue) == 0) { -+ cpu_infos[cpu].ce_nums -= tmp; -+ } -+ } -+ -+ if (cpu_infos[cpu].ce_nums >= threshold.value) { -+ log(TERM, LOG_INFO, "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", -+ threshold.value, cpu); -+ return do_cpu_offline(cpu); -+ } -+ return HANDLE_NOTHING; ++ struct link_queue *queue = cpu_infos[cpu].ce_queue; ++ unsigned int tmp; ++ /* ++ * Since we just count all error numbers in setted cycle, we store the time ++ * and error numbers from current event to the queue, then everytime we ++ * calculate the period from beginning time to ending time, if the period ++ * exceeds setted cycle, we pop the beginning time and error until the period ++ * from new beginning time to ending time is less than cycle. ++ */ ++ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { ++ tmp = queue->head->value; ++ if (pop(queue) == 0) ++ cpu_infos[cpu].ce_nums -= tmp; ++ } ++ log(TERM, LOG_INFO, ++ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n", ++ cpu, cpu_infos[cpu].ce_nums); ++ ++ if (cpu_infos[cpu].ce_nums >= threshold.value) { ++ log(TERM, LOG_INFO, ++ "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", ++ threshold.value, cpu); ++ return do_cpu_offline(cpu); ++ } ++ return HANDLE_NOTHING; +} + -+static int do_uce_handler(unsigned cpu) ++static int error_handler(unsigned int cpu, struct error_info *err_info) +{ -+ if (cpu_infos[cpu].uce_nums > 0) { -+ log(TERM, LOG_INFO, "Uncorrected Errors occured, try to offline cpu%d\n", cpu); -+ return do_cpu_offline(cpu); -+ } -+ return HANDLE_NOTHING; -+} ++ int ret = HANDLE_NOTHING; + -+static int error_handler(unsigned cpu, struct error_info *err_info) -+{ -+ int ret = HANDLE_NOTHING; -+ -+ switch (err_info->err_type) -+ { -+ case CE: -+ ret = do_ce_handler(cpu); -+ break; -+ case UCE: -+ ret = do_uce_handler(cpu); -+ break; -+ default: -+ break; -+ } -+ -+ return ret; ++ switch (err_info->err_type) { ++ case CE: ++ ret = do_ce_handler(cpu); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; +} + -+static void record_error_info(unsigned cpu, struct error_info *err_info) ++static void record_error_info(unsigned int cpu, struct error_info *err_info) +{ -+ switch (err_info->err_type) -+ { -+ case CE: -+ { -+ struct queue_node *node = NULL; -+ node = node_create(err_info->time, err_info->nums); -+ if (node == NULL) { -+ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); -+ return; -+ } -+ push(cpu_infos[cpu].ce_queue, node); -+ cpu_infos[cpu].ce_nums += err_info->nums; -+ break; -+ } -+ case UCE: -+ cpu_infos[cpu].uce_nums++; -+ break; -+ default: -+ break; -+ } ++ switch (err_info->err_type) { ++ case CE: ++ { ++ struct queue_node *node = node_create(err_info->time, err_info->nums); ++ ++ if (node == NULL) { ++ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); ++ return; ++ } ++ push(cpu_infos[cpu].ce_queue, node); ++ cpu_infos[cpu].ce_nums += err_info->nums; ++ break; ++ } ++ default: ++ break; ++ } +} + -+static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size) ++void ras_record_cpu_error(struct error_info *err_info, int cpu) +{ -+ value >>= offset; -+ unsigned long res = 0; -+ int i = 0; ++ int ret; + -+ while (i < size) { -+ res |= (value & (0x1 << (i++))); -+ } ++ if (enabled == 0) ++ return; + -+ return res; -+} ++ if (cpu >= ncores || cpu < 0) { ++ log(TERM, LOG_ERR, ++ "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); ++ return; ++ } + -+static unsigned get_cpu_index(int64_t mpidr) -+{ -+ unsigned core_id, socket_id, die_id, cpu; -+ /* -+ * Adapt to certain BIOS -+ * In the MPIDR: -+ * bit 8:15: core id -+ * bit 19:20: die_id -+ * bit 21:22: socket_id -+ */ -+ core_id = get_bit_value(mpidr, 8, 8); -+ socket_id = get_bit_value(mpidr, 21, 2); -+ die_id = get_bit_value(mpidr, 19, 2); -+ cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die; -+ -+ return cpu; -+} ++ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); ++ cpu_infos[cpu].state = get_cpu_status(cpu); + -+void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr) -+{ -+ unsigned cpu; -+ int ret; -+ -+ if (enabled == 0) { -+ return; -+ } -+ -+ cpu = get_cpu_index(mpidr); -+ -+ if (cpu >= ncores) { -+ log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); -+ return; -+ } -+ -+ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); -+ cpu_infos[cpu].state = get_cpu_status(cpu); -+ -+ if (cpu_infos[cpu].state != CPU_ONLINE) { -+ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu); -+ return; -+ } -+ -+ record_error_info(cpu, err_info); -+ /* Since user may change cpu state, we get current offlined cpu numbers every recording time. */ -+ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { -+ log(TERM, LOG_WARNING, "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", -+ cpu_limit.value); -+ return; -+ } -+ -+ ret = error_handler(cpu, err_info); -+ -+ if (ret == HANDLE_NOTHING) { -+ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); -+ } -+ else if (ret == HANDLE_SUCCEED) { -+ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); -+ clear_queue(cpu_infos[cpu].ce_queue); -+ } -+ else { -+ log(TERM, LOG_INFO, "Offline cpu%d fail, the state is %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); -+ } -+ -+ return; ++ if (cpu_infos[cpu].state != CPU_ONLINE) { ++ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu); ++ return; ++ } ++ ++ record_error_info(cpu, err_info); ++ /* ++ * Since user may change cpu state, we get current offlined ++ * cpu numbers every recording time. ++ */ ++ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { ++ log(TERM, LOG_WARNING, ++ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", ++ cpu_limit.value); ++ return; ++ } ++ ++ ret = error_handler(cpu, err_info); ++ ++ if (ret == HANDLE_NOTHING) ++ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); ++ else if (ret == HANDLE_SUCCEED) { ++ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); ++ clear_queue(cpu_infos[cpu].ce_queue); ++ cpu_infos[cpu].ce_nums = 0; ++ } else ++ log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); +} diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h new file mode 100644 -index 0000000..a7d3fdb +index 0000000..1159853 --- /dev/null +++ b/ras-cpu-isolation.h -@@ -0,0 +1,76 @@ +@@ -0,0 +1,68 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -895,7 +812,7 @@ index 0000000..a7d3fdb + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. -+*/ ++ */ + +#ifndef __RAS_CPU_ISOLATION_H +#define __RAS_CPU_ISOLATION_H @@ -904,64 +821,55 @@ index 0000000..a7d3fdb + +#define MAX_PATH_LEN 100 +#define MAX_BUF_LEN 1024 -+#define PEI_ERR_SIZE 32 -+#define FLAGS_SIZE 1 + +struct param { -+ char *name; -+ unsigned long value; ++ char *name; ++ unsigned long value; +}; + +struct isolation_param { -+ char *name; -+ const struct param *units; -+ unsigned long value; -+ unsigned long limit; ++ char *name; ++ const struct param *units; ++ unsigned long value; ++ unsigned long limit; +}; + +enum cpu_state { -+ CPU_OFFLINE, -+ CPU_ONLINE, -+ CPU_OFFLINE_FAILED, -+ CPU_UNKNOWN, ++ CPU_OFFLINE, ++ CPU_ONLINE, ++ CPU_OFFLINE_FAILED, ++ CPU_UNKNOWN, +}; + +enum error_handle_result { -+ HANDLE_FAILED = -1, -+ HANDLE_SUCCEED, -+ HANDLE_NOTHING, ++ HANDLE_FAILED = -1, ++ HANDLE_SUCCEED, ++ HANDLE_NOTHING, +}; + +enum error_type { -+ CE = 1, -+ UCE ++ CE = 1 +}; + +struct cpu_info { -+ unsigned long uce_nums; -+ unsigned long ce_nums; -+ struct link_queue *ce_queue; -+ enum cpu_state state; ++ unsigned long ce_nums; ++ struct link_queue *ce_queue; ++ enum cpu_state state; +}; + +struct error_info { -+ unsigned long nums; -+ time_t time; -+ enum error_type err_type; ++ unsigned long nums; ++ time_t time; ++ enum error_type err_type; +}; + -+struct cpu_set { -+ char buf[MAX_BUF_LEN]; -+}; -+ -+void ras_error_count_init(unsigned cpus); -+void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); ++void ras_cpu_isolation_init(unsigned int cpus); ++void ras_record_cpu_error(struct error_info *err_info, int cpu); +void cpu_infos_free(void); + +#endif -\ No newline at end of file diff --git a/ras-events.c b/ras-events.c -index ba769d1..00938e6 100644 +index ba769d1..491c17a 100644 --- a/ras-events.c +++ b/ras-events.c @@ -41,6 +41,7 @@ @@ -977,38 +885,22 @@ index ba769d1..00938e6 100644 cpus = get_num_cpus(ras); +#ifdef HAVE_CPU_FAULT_ISOLATION -+ ras_error_count_init(cpus); ++ ras_cpu_isolation_init(cpus); +#endif + #ifdef HAVE_MCE rc = register_mce_handler(ras, cpus); if (rc) -@@ -1005,6 +1010,9 @@ err: +@@ -1005,6 +1010,8 @@ err: } free(ras); } +- +#ifdef HAVE_CPU_FAULT_ISOLATION + cpu_infos_free(); +#endif - return rc; } -diff --git a/ras-record.h b/ras-record.h -index d9f7733..efaffa5 100644 ---- a/ras-record.h -+++ b/ras-record.h -@@ -83,6 +83,11 @@ struct ras_arm_event { - uint32_t ctx_len; - const uint8_t *vsei_error; - uint32_t oem_len; -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ const char *severity; -+ const uint8_t *error_info; -+ uint32_t length; -+#endif - }; - - struct devlink_event { -- 2.27.0 diff --git a/0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch b/0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch new file mode 100644 index 0000000..d15a714 --- /dev/null +++ b/0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch @@ -0,0 +1,224 @@ +From 62218a9c3aec44330ce3b77f3634c788b6e6f60c Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Wed, 2 Mar 2022 12:20:40 +0000 +Subject: [PATCH 1/6] rasdaemon: Modify recording Hisilicon common error data + +The error statistics for the Hisilicon common +error need to do based on module, error severity etc. + +Modify recording Hisilicon common error data as separate fields +in the sql db table instead of the combined single field. + +Signed-off-by: Shiju Jose +--- + non-standard-hisilicon.c | 122 ++++++++++++++++++++++++++++++++------- + 1 file changed, 102 insertions(+), 20 deletions(-) + +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 1432163..dc69d46 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -17,6 +17,7 @@ + #include "non-standard-hisilicon.h" + + #define HISI_BUF_LEN 2048 ++#define HISI_PCIE_INFO_BUF_LEN 256 + + struct hisi_common_error_section { + uint32_t val_bits; +@@ -63,12 +64,25 @@ enum { + enum { + HISI_COMMON_FIELD_ID, + HISI_COMMON_FIELD_TIMESTAMP, +- HISI_COMMON_FIELD_ERR_INFO, ++ HISI_COMMON_FIELD_VERSION, ++ HISI_COMMON_FIELD_SOC_ID, ++ HISI_COMMON_FIELD_SOCKET_ID, ++ HISI_COMMON_FIELD_TOTEM_ID, ++ HISI_COMMON_FIELD_NIMBUS_ID, ++ HISI_COMMON_FIELD_SUB_SYSTEM_ID, ++ HISI_COMMON_FIELD_MODULE_ID, ++ HISI_COMMON_FIELD_SUB_MODULE_ID, ++ HISI_COMMON_FIELD_CORE_ID, ++ HISI_COMMON_FIELD_PORT_ID, ++ HISI_COMMON_FIELD_ERR_TYPE, ++ HISI_COMMON_FIELD_PCIE_INFO, ++ HISI_COMMON_FIELD_ERR_SEVERITY, + HISI_COMMON_FIELD_REGS_DUMP, + }; + + struct hisi_event { + char error_msg[HISI_BUF_LEN]; ++ char pcie_info[HISI_PCIE_INFO_BUF_LEN]; + char reg_msg[HISI_BUF_LEN]; + }; + +@@ -134,12 +148,24 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) + static const struct db_fields hisi_common_section_fields[] = { + { .name = "id", .type = "INTEGER PRIMARY KEY" }, + { .name = "timestamp", .type = "TEXT" }, +- { .name = "err_info", .type = "TEXT" }, ++ { .name = "version", .type = "INTEGER" }, ++ { .name = "soc_id", .type = "INTEGER" }, ++ { .name = "socket_id", .type = "INTEGER" }, ++ { .name = "totem_id", .type = "INTEGER" }, ++ { .name = "nimbus_id", .type = "INTEGER" }, ++ { .name = "sub_system_id", .type = "INTEGER" }, ++ { .name = "module_id", .type = "TEXT" }, ++ { .name = "sub_module_id", .type = "INTEGER" }, ++ { .name = "core_id", .type = "INTEGER" }, ++ { .name = "port_id", .type = "INTEGER" }, ++ { .name = "err_type", .type = "INTEGER" }, ++ { .name = "pcie_info", .type = "TEXT" }, ++ { .name = "err_severity", .type = "TEXT" }, + { .name = "regs_dump", .type = "TEXT" }, + }; + + static const struct db_table_descriptor hisi_common_section_tab = { +- .name = "hisi_common_section", ++ .name = "hisi_common_section_v2", + .fields = hisi_common_section_fields, + .num_fields = ARRAY_SIZE(hisi_common_section_fields), + }; +@@ -199,12 +225,20 @@ static const char* get_soc_desc(uint8_t soc_id) + return soc_desc[soc_id]; + } + +-static void decode_module(struct hisi_event *event, uint8_t module_id) ++static void decode_module(struct ras_ns_ev_decoder *ev_decoder, ++ struct hisi_event *event, uint8_t module_id) + { +- if (module_id >= sizeof(module_name)/sizeof(char *)) ++ if (module_id >= sizeof(module_name)/sizeof(char *)) { + HISI_SNPRINTF(event->error_msg, "module=unknown(id=%hhu) ", module_id); +- else ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, ++ HISI_COMMON_FIELD_MODULE_ID, ++ 0, "unknown"); ++ } else { + HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, ++ HISI_COMMON_FIELD_MODULE_ID, ++ 0, module_name[module_id]); ++ } + } + + static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, +@@ -212,43 +246,93 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, + struct hisi_event *event) + { + HISI_SNPRINTF(event->error_msg, "[ table_version=%hhu", err->version); +- if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_VERSION, ++ err->version, NULL); ++ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) { + HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id)); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_SOC_ID, ++ err->soc_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) { + HISI_SNPRINTF(event->error_msg, "socket_id=%hhu", err->socket_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_SOCKET_ID, ++ err->socket_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) { + HISI_SNPRINTF(event->error_msg, "totem_id=%hhu", err->totem_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_TOTEM_ID, ++ err->totem_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) { + HISI_SNPRINTF(event->error_msg, "nimbus_id=%hhu", err->nimbus_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_NIMBUS_ID, ++ err->nimbus_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) { + HISI_SNPRINTF(event->error_msg, "subsystem_id=%hhu", err->subsystem_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_SUB_SYSTEM_ID, ++ err->subsystem_id, NULL); ++ } + + if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID)) +- decode_module(event, err->module_id); ++ decode_module(ev_decoder, event, err->module_id); + +- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) { + HISI_SNPRINTF(event->error_msg, "submodule_id=%hhu", err->submodule_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_SUB_MODULE_ID, ++ err->submodule_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) { + HISI_SNPRINTF(event->error_msg, "core_id=%hhu", err->core_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_CORE_ID, ++ err->core_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) { + HISI_SNPRINTF(event->error_msg, "port_id=%hhu", err->port_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_PORT_ID, ++ err->port_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) { + HISI_SNPRINTF(event->error_msg, "err_type=%hu", err->err_type); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_ERR_TYPE, ++ err->err_type, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) { + HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x", + err->pcie_info.segment, err->pcie_info.bus, + err->pcie_info.device, err->pcie_info.function); ++ HISI_SNPRINTF(event->pcie_info, "%04x:%02x:%02x.%x", ++ err->pcie_info.segment, err->pcie_info.bus, ++ err->pcie_info.device, err->pcie_info.function); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, ++ HISI_COMMON_FIELD_PCIE_INFO, ++ 0, event->pcie_info); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) { + HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity)); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, ++ HISI_COMMON_FIELD_ERR_SEVERITY, ++ 0, err_severity(err->err_severity)); ++ } + + HISI_SNPRINTF(event->error_msg, "]"); + } +@@ -293,8 +377,6 @@ static int decode_hisi_common_section(struct ras_events *ras, + record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, + HISI_COMMON_FIELD_TIMESTAMP, + 0, event->timestamp); +- record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, +- HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg); + record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, + HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg); + step_vendor_data_tab(ev_decoder, "hisi_common_section_tab"); +-- +2.25.1 + diff --git a/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch b/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch new file mode 100644 index 0000000..aa1b251 --- /dev/null +++ b/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch @@ -0,0 +1,138 @@ +From e0101e59c6887a98d3a5a1b622c75f5307e8ec19 Mon Sep 17 00:00:00 2001 +From: Shengwei Luo +Date: Wed, 23 Feb 2022 17:23:27 +0800 +Subject: [PATCH 2/2] Support cpu fault isolation for recoverable errors + +When the recoverable errors in cpu core occurred, try to offline +the related cpu core. + +Signed-off-by: Shengwei Luo +--- + ras-arm-handler.c | 21 ++++++++++++++++++--- + ras-cpu-isolation.c | 17 +++++++++++++++++ + ras-cpu-isolation.h | 4 +++- + 3 files changed, 38 insertions(+), 4 deletions(-) + +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index c9ef2fd..dae5ad6 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -47,7 +47,20 @@ void display_raw_data(struct trace_seq *s, + } + + #ifdef HAVE_CPU_FAULT_ISOLATION +-static int count_errors(struct ras_arm_event *ev) ++static int is_core_failure(struct ras_arm_err_info *err_info) ++{ ++ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) { ++ /* ++ * core failure: ++ * Bit 0\1\3: (at lease 1) ++ * Bit 2: 0 ++ */ ++ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << 2)); ++ } ++ return 0; ++} ++ ++static int count_errors(struct ras_arm_event *ev, int sev) + { + struct ras_arm_err_info *err_info; + int num_pei; +@@ -75,6 +88,8 @@ static int count_errors(struct ras_arm_event *ev) + */ + error_count = err_info->multiple_error + 1; + } ++ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info)) ++ error_count = 0; + + num += error_count; + err_info += 1; +@@ -212,8 +227,8 @@ int ras_arm_event_handler(struct trace_seq *s, + } + trace_seq_printf(s, "\n severity: %s", severity); + +- if (val == GHES_SEV_CORRECTED) { +- nums = count_errors(&ev); ++ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { ++ nums = count_errors(&ev, val); + if (nums > 0) { + err_info.nums = nums; + err_info.time = now; +diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c +index 8c0cdf9..e650022 100644 +--- a/ras-cpu-isolation.c ++++ b/ras-cpu-isolation.c +@@ -113,6 +113,7 @@ static int init_cpu_info(unsigned int cpus) + + for (unsigned int i = 0; i < cpus; ++i) { + cpu_infos[i].ce_nums = 0; ++ cpu_infos[i].uce_nums = 0; + cpu_infos[i].state = get_cpu_status(i); + cpu_infos[i].ce_queue = init_queue(); + +@@ -295,6 +296,15 @@ static int do_ce_handler(unsigned int cpu) + return HANDLE_NOTHING; + } + ++static int do_uce_handler(unsigned int cpu) ++{ ++ if (cpu_infos[cpu].uce_nums > 0) { ++ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%d\n", cpu); ++ return do_cpu_offline(cpu); ++ } ++ return HANDLE_NOTHING; ++} ++ + static int error_handler(unsigned int cpu, struct error_info *err_info) + { + int ret = HANDLE_NOTHING; +@@ -303,6 +313,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) + case CE: + ret = do_ce_handler(cpu); + break; ++ case UCE: ++ ret = do_uce_handler(cpu); ++ break; + default: + break; + } +@@ -325,6 +338,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) + cpu_infos[cpu].ce_nums += err_info->nums; + break; + } ++ case UCE: ++ cpu_infos[cpu].uce_nums++; ++ break; + default: + break; + } +@@ -372,6 +388,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) + cpu, cpu_state[cpu_infos[cpu].state]); + clear_queue(cpu_infos[cpu].ce_queue); + cpu_infos[cpu].ce_nums = 0; ++ cpu_infos[cpu].uce_nums = 0; + } else + log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", + cpu, cpu_state[cpu_infos[cpu].state]); +diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h +index 1159853..024a68b 100644 +--- a/ras-cpu-isolation.h ++++ b/ras-cpu-isolation.h +@@ -46,10 +46,12 @@ enum error_handle_result { + }; + + enum error_type { +- CE = 1 ++ CE = 1, ++ UCE + }; + + struct cpu_info { ++ unsigned long uce_nums; + unsigned long ce_nums; + struct link_queue *ce_queue; + enum cpu_state state; +-- +2.27.0 + diff --git a/0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch b/0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch new file mode 100644 index 0000000..7f7eb24 --- /dev/null +++ b/0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch @@ -0,0 +1,97 @@ +From 4d9f297028ce3116eaf574b2570d71a4ed666b7d Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 24 Feb 2022 18:02:14 +0000 +Subject: [PATCH 2/6] rasdaemon: ras-mc-ctl: Modify error statistics for + HiSilicon Kunpeng9xx common errors + +Modify the error statistics for the HiSilicon Kunpeng9xx platforms common errors +to display the statistics and error info based on the module and the error severity. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 40 +++++++++++++++++++++++++++++----------- + 1 file changed, 29 insertions(+), 11 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 1e3aeb7..22ba1fd 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1535,7 +1535,7 @@ sub vendor_errors_summary + require DBI; + my ($num_args, $platform_id); + my ($query, $query_handle, $count, $out); +- my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info); ++ my ($module_id, $sub_module_id, $err_severity, $err_sev); + + $num_args = $#ARGV + 1; + $platform_id = 0; +@@ -1612,13 +1612,18 @@ sub vendor_errors_summary + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $query = "select err_info, count(*) from hisi_common_section"; ++ $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +- $query_handle->bind_columns(\($err_info, $count)); ++ $query_handle->bind_columns(\($err_severity, $module_id, $count)); + $out = ""; ++ $err_sev = ""; + while($query_handle->fetch()) { +- $out .= "\terrors: $count\n"; ++ if ($err_severity ne $err_sev) { ++ $out .= "$err_severity errors:\n"; ++ $err_sev = $err_severity; ++ } ++ $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; +@@ -1636,8 +1641,8 @@ sub vendor_errors + require DBI; + my ($num_args, $platform_id); + my ($query, $query_handle, $id, $timestamp, $out); +- my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id); +- my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs); ++ my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); ++ my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); + + $num_args = $#ARGV + 1; + $platform_id = 0; +@@ -1725,15 +1730,28 @@ sub vendor_errors + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $query = "select id, timestamp, err_info, regs_dump from hisi_common_section order by id"; ++ $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +- $query_handle->bind_columns(\($id, $timestamp, $err_info, $regs)); ++ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. $timestamp "; +- $out .= "Error Info:$err_info \n" if ($err_info); +- $out .= "Error Registers: $regs\n\n" if ($regs); ++ $out .= "$id. $timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "totem_id=$totem_id, " if ($totem_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); ++ $out .= "module_id=$module_id, " if ($module_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "core_id=$core_id, " if ($core_id); ++ $out .= "port_id=$port_id, " if ($port_id); ++ $out .= "err_type=$err_type, " if ($err_type); ++ $out .= "pcie_info=$pcie_info, " if ($pcie_info); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs" if ($regs); ++ $out .= "\n\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error events:\n$out\n"; +-- +2.25.1 + diff --git a/0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch b/0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch new file mode 100644 index 0000000..7600b58 --- /dev/null +++ b/0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch @@ -0,0 +1,56 @@ +From eb93d77b417b58cba27799ae85747b8a193cf063 Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Sat, 5 Mar 2022 16:18:55 +0000 +Subject: [PATCH 3/6] rasdaemon: ras-mc-ctl: Reformat error info of the + HiSilicon Kunpeng920 + +Reformat the code to display the error info of HiSilicon Kunpeng920. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 22ba1fd..eeaf885 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1669,8 +1669,9 @@ sub vendor_errors + $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); + $out .= "module_id=$module_id, " if ($module_id); + $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, \n" if ($err_severity); +- $out .= "Error Registers: $regs\n\n" if ($regs); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; +@@ -1692,8 +1693,9 @@ sub vendor_errors + $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); + $out .= "module_id=$module_id, " if ($module_id); + $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, \n" if ($err_severity); +- $out .= "Error Registers: $regs\n\n" if ($regs); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; +@@ -1717,8 +1719,9 @@ sub vendor_errors + $out .= "core_id=$core_id, " if ($core_id); + $out .= "port_id=$port_id, " if ($port_id); + $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "err_type=$err_type, \n" if ($err_type); +- $out .= "Error Registers: $regs\n\n" if ($regs); ++ $out .= "err_type=$err_type, " if ($err_type); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; +-- +2.25.1 + diff --git a/0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch b/0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch new file mode 100644 index 0000000..15ab710 --- /dev/null +++ b/0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch @@ -0,0 +1,36 @@ +From 623e85c07ab21ccc89ffe2bb444eb000a2664a9d Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Sat, 5 Mar 2022 17:01:35 +0000 +Subject: [PATCH 4/6] rasdaemon: ras-mc-ctl: Add printing usage if necessary + parameters are not passed for the HiSilicon vendor-errors options + +Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options of the ras-mc-ctl. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index eeaf885..0e32cb1 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1542,6 +1542,7 @@ sub vendor_errors_summary + if ($num_args ne 0) { + $platform_id = $ARGV[0]; + } else { ++ usage(1); + return; + } + +@@ -1649,6 +1650,7 @@ sub vendor_errors + if ($num_args ne 0) { + $platform_id = $ARGV[0]; + } else { ++ usage(1); + return; + } + +-- +2.25.1 + diff --git a/0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch b/0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch new file mode 100644 index 0000000..6153a85 --- /dev/null +++ b/0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch @@ -0,0 +1,198 @@ +From 4007c95f8a8d570542ffc11676b619ea5649d0e7 Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Sat, 5 Mar 2022 18:19:38 +0000 +Subject: [PATCH 5/6] rasdaemon: ras-mc-ctl: Add support to display the + HiSilicon vendor errors for a specified module + +Add support to display the HiSilicon vendor errors for a specified module. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 119 ++++++++++++++++++++++++--------------------- + 1 file changed, 63 insertions(+), 56 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 0e32cb1..d728300 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -96,7 +96,8 @@ Usage: $prog [OPTIONS...] + --errors Shows the errors stored at the error database. + --error-count Shows the corrected and uncorrected error counts using sysfs. + --vendor-errors-summary Presents a summary of the vendor-specific logged errors. +- --vendor-errors Shows the vendor-specific errors stored in the error database. ++ --vendor-errors Shows the vendor-specific errors stored in the error database. ++ --vendor-errors Shows the vendor-specific errors for a specific module stored in the error database. + --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors. + --help This help message. + EOF +@@ -1640,15 +1641,19 @@ sub vendor_errors_summary + sub vendor_errors + { + require DBI; +- my ($num_args, $platform_id); ++ my ($num_args, $platform_id, $module); + my ($query, $query_handle, $id, $timestamp, $out); + my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); + my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); + + $num_args = $#ARGV + 1; + $platform_id = 0; ++ $module = 0; + if ($num_args ne 0) { + $platform_id = $ARGV[0]; ++ if ($num_args gt 1) { ++ $module = $ARGV[1]; ++ } + } else { + usage(1); + return; +@@ -1664,21 +1669,21 @@ sub vendor_errors + $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. $timestamp Error Info: "; +- $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "module_id=$module_id, " if ($module_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "Error Registers: $regs " if ($regs); +- $out .= "\n\n"; ++ if ($module eq 0 || ($module_id && ($module eq $module_id))) { ++ $out .= "$id. $timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "module_id=$module_id, " if ($module_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; ++ } + } + if ($out ne "") { + print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 OEM type1 errors.\n"; + } + $query_handle->finish; + +@@ -1688,21 +1693,21 @@ sub vendor_errors + $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. $timestamp Error Info: "; +- $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "module_id=$module_id, " if ($module_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "Error Registers: $regs " if ($regs); +- $out .= "\n\n"; ++ if ($module eq 0 || ($module_id && ($module eq $module_id))) { ++ $out .= "$id. $timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "module_id=$module_id, " if ($module_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; ++ } + } + if ($out ne "") { + print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 OEM type2 errors.\n"; + } + $query_handle->finish; + +@@ -1712,23 +1717,23 @@ sub vendor_errors + $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. $timestamp Error Info: "; +- $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "core_id=$core_id, " if ($core_id); +- $out .= "port_id=$port_id, " if ($port_id); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "err_type=$err_type, " if ($err_type); +- $out .= "Error Registers: $regs " if ($regs); +- $out .= "\n\n"; ++ if ($module eq 0 || ($sub_module_id && ($module eq $sub_module_id))) { ++ $out .= "$id. $timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "core_id=$core_id, " if ($core_id); ++ $out .= "port_id=$port_id, " if ($port_id); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "err_type=$err_type, " if ($err_type); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; ++ } + } + if ($out ne "") { + print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 PCIe controller errors.\n"; + } + $query_handle->finish; + } +@@ -1741,22 +1746,24 @@ sub vendor_errors + $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. $timestamp Error Info: "; +- $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "totem_id=$totem_id, " if ($totem_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); +- $out .= "module_id=$module_id, " if ($module_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "core_id=$core_id, " if ($core_id); +- $out .= "port_id=$port_id, " if ($port_id); +- $out .= "err_type=$err_type, " if ($err_type); +- $out .= "pcie_info=$pcie_info, " if ($pcie_info); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "Error Registers: $regs" if ($regs); +- $out .= "\n\n"; ++ if ($module eq 0 || ($module_id && ($module eq $module_id))) { ++ $out .= "$id. $timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "totem_id=$totem_id, " if ($totem_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); ++ $out .= "module_id=$module_id, " if ($module_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "core_id=$core_id, " if ($core_id); ++ $out .= "port_id=$port_id, " if ($port_id); ++ $out .= "err_type=$err_type, " if ($err_type); ++ $out .= "pcie_info=$pcie_info, " if ($pcie_info); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs" if ($regs); ++ $out .= "\n\n"; ++ } + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error events:\n$out\n"; +-- +2.25.1 + diff --git a/0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch b/0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch new file mode 100644 index 0000000..073d335 --- /dev/null +++ b/0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch @@ -0,0 +1,148 @@ +From 88bf3126312645843152c6c3215b54b120bcc1ec Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Mon, 7 Mar 2022 12:38:45 +0000 +Subject: [PATCH 6/6] rasdaemon: ras-mc-ctl: Relocate reading and display + Kunpeng920 errors to under Kunpeng9xx + +Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 38 ++++++++++---------------------------- + 1 file changed, 10 insertions(+), 28 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index d728300..2ab9602 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1527,7 +1527,6 @@ sub errors + + # Definitions of the vendor platform IDs. + use constant { +- HISILICON_KUNPENG_920 => "Kunpeng920", + HISILICON_KUNPENG_9XX => "Kunpeng9xx", + }; + +@@ -1549,8 +1548,8 @@ sub vendor_errors_summary + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng920 errors +- if ($platform_id eq HISILICON_KUNPENG_920) { ++ # HiSilicon Kunpeng9xx common errors ++ if ($platform_id eq HISILICON_KUNPENG_9XX) { + $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1565,9 +1564,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 OEM type1 error events summary:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 OEM type1 errors.\n\n"; ++ print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1585,9 +1582,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 OEM type2 error events summary:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 OEM type2 errors.\n\n"; ++ print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1605,15 +1600,10 @@ sub vendor_errors_summary + $out .= "\t$sub_module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 PCIe controller error events summary:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 PCIe controller errors.\n\n"; ++ print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n"; + } + $query_handle->finish; +- } + +- # HiSilicon Kunpeng9xx common errors +- if ($platform_id eq HISILICON_KUNPENG_9XX) { + $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1629,8 +1619,6 @@ sub vendor_errors_summary + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng9xx common errors.\n\n"; + } + $query_handle->finish; + } +@@ -1661,8 +1649,8 @@ sub vendor_errors + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng920 errors +- if ($platform_id eq HISILICON_KUNPENG_920) { ++ # HiSilicon Kunpeng9xx common errors ++ if ($platform_id eq HISILICON_KUNPENG_9XX) { + $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1683,7 +1671,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; ++ print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1707,7 +1695,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; ++ print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1733,13 +1721,10 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; ++ print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n"; + } + $query_handle->finish; +- } + +- # HiSilicon Kunpeng9xx common errors +- if ($platform_id eq HISILICON_KUNPENG_9XX) { + $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1767,8 +1752,6 @@ sub vendor_errors + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error events:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng9xx common errors.\n"; + } + $query_handle->finish; + } +@@ -1779,7 +1762,6 @@ sub vendor_errors + sub vendor_platforms + { + print "\nSupported platforms for the vendor-specific errors:\n"; +- print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n"; + print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; + print "\n"; + } +-- +2.25.1 + diff --git a/0007-add-trace-print-and-add-sqlite-store.patch b/0007-add-trace-print-and-add-sqlite-store.patch deleted file mode 100644 index ac031b3..0000000 --- a/0007-add-trace-print-and-add-sqlite-store.patch +++ /dev/null @@ -1,78 +0,0 @@ -From 57640072aead2e00037749d66f05fc26e3fe3071 Mon Sep 17 00:00:00 2001 -From: Lostwayzxc -Date: Tue, 25 May 2021 20:07:26 +0800 -Subject: [PATCH 2/2] add trace print of new information and add it to sqilte - -Since we add new information of the event, we add trace print and store it to -Sqlite. - -Signed-off-by: Luo Shengwei ---- - ras-arm-handler.c | 10 ++++++++++ - ras-record.c | 8 ++++++++ - 2 files changed, 18 insertions(+) - -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 10d0099..23ad470 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -23,6 +23,13 @@ - #include "ras-cpu-isolation.h" - - #ifdef HAVE_CPU_FAULT_ISOLATION -+static void trace_print_hex(struct trace_seq *s, const uint8_t *buf, int buf_len) -+{ -+ for (int i = 0; i < buf_len; ++i) { -+ trace_seq_printf(s, "%2.2x", buf[i]); -+ } -+} -+ - static int is_core_failure(unsigned long value) - { - /* -@@ -135,6 +142,7 @@ int ras_arm_event_handler(struct trace_seq *s, - case GHES_SEV_PANIC: - ev.severity = "Fatal"; - } -+ trace_seq_printf(s, "\n severity: %s", ev.severity); - - if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { - int len, nums; -@@ -142,6 +150,8 @@ int ras_arm_event_handler(struct trace_seq *s, - if (!ev.error_info) - return -1; - ev.length = len; -+ trace_seq_printf(s, "\n processor_err_info: "); -+ trace_print_hex(s, ev.error_info, len); - /* relate to enum error_type */ - nums = count_errors(event, ev.error_info, len); - if (nums > 0) { -diff --git a/ras-record.c b/ras-record.c -index 549c494..33d4741 100644 ---- a/ras-record.c -+++ b/ras-record.c -@@ -210,6 +210,10 @@ static const struct db_fields arm_event_fields[] = { - { .name="err_info", .type="BLOB" }, - { .name="context_info", .type="BLOB" }, - { .name="vendor_info", .type="BLOB" }, -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ { .name="severity", .type="TEXT" }, -+ { .name="error_info", .type="BLOB" }, -+#endif - }; - - static const struct db_table_descriptor arm_event_tab = { -@@ -233,6 +237,10 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) - ev->ctx_error, ev->ctx_len, NULL); - sqlite3_bind_blob (priv->stmt_arm_record, 9, - ev->vsei_error, ev->oem_len, NULL); -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ sqlite3_bind_text (priv->stmt_arm_record, 7, ev->severity, -1, NULL); -+ sqlite3_bind_blob (priv->stmt_arm_record, 8, ev->error_info, ev->length, NULL); -+#endif - - rc = sqlite3_step(priv->stmt_arm_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) --- -2.27.0 - diff --git a/0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch b/0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch deleted file mode 100644 index 38ef9ac..0000000 --- a/0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 6b767a2fce615384f062ecb392cd332452bf4482 Mon Sep 17 00:00:00 2001 -From: Lostwayzxc -Date: Wed, 1 Sep 2021 21:00:16 +0800 -Subject: [PATCH] modify cpu parse for adapting to new bios version - ---- - ras-cpu-isolation.c | 20 ++++++++++++++++++-- - 1 file changed, 18 insertions(+), 2 deletions(-) - -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index 6dcff70..b1643c4 100644 ---- a/ras-cpu-isolation.c -+++ b/ras-cpu-isolation.c -@@ -25,6 +25,7 @@ - - static struct cpu_info *cpu_infos = NULL; - static unsigned int ncores, cores_per_socket, cores_per_die; -+static unsigned int cores_per_cluster = 4; - static unsigned int sockets, dies = 1; - static unsigned int enabled = 1; - static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; -@@ -432,18 +433,33 @@ static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size - - static unsigned get_cpu_index(int64_t mpidr) - { -- unsigned core_id, socket_id, die_id, cpu; -+ unsigned core_id, cluster_id, socket_id, die_id, cpu; - /* - * Adapt to certain BIOS - * In the MPIDR: - * bit 8:15: core id -+ * bit 16:18: cluster id - * bit 19:20: die_id - * bit 21:22: socket_id - */ - core_id = get_bit_value(mpidr, 8, 8); -+ cluster_id = get_bit_value(mpidr, 16, 3); - socket_id = get_bit_value(mpidr, 21, 2); - die_id = get_bit_value(mpidr, 19, 2); -- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die; -+ -+ /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3, -+ * it means TotemB. When cores per die equal to cores per socket, it means -+ * that there is only one die in the socket, in case that the only die is -+ * TotemB in CPU 1620s, we set die id to 0 directly. -+ */ -+ if (cores_per_die == cores_per_socket) { -+ die_id = 0; -+ } -+ else { -+ die_id = (die_id == 1 ? 0:1); -+ } -+ cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die + -+ cluster_id * cores_per_cluster; - - return cpu; - } --- -2.27.0 - diff --git a/rasdaemon.spec b/rasdaemon.spec index 0a21c3f..62576b1 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.7 -Release: 1 +Release: 4 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -23,13 +23,18 @@ Patch1: bugfix-rasdaemon-wait-for-file-access.patch Patch2: bugfix-fix-fd-check.patch Patch3: bugfix-fix-disk-error-log-storm.patch Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch -Patch5: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch -Patch6: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch -Patch7: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch -Patch8: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch -Patch9: 0006-add-cpu-online-fault-isolation.patch -Patch10: 0007-add-trace-print-and-add-sqlite-store.patch -Patch11: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch +Patch5: 0001-Support-cpu-fault-isolation-for-corrected-errors.patch +Patch6: 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch +Patch7: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch +Patch8: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch +Patch9: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch +Patch10: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch +Patch11: 0001-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch +Patch12: 0002-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch +Patch13: 0003-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch +Patch14: 0004-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch +Patch15: 0005-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch +Patch16: 0006-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch %description The rasdaemon program is a daemon which monitors the platform @@ -75,41 +80,43 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || : %changelog -* Mon Jan 17 2022 xujing - 0.6.7-1 -- DESC: Update software to v0.6.7 - -* Thu Dec 9 2021 tanxiaofei - 0.6.6-10 +* Mon Mar 07 2022 Shiju Jose - 0.6.7-4 - Type:feature - ID:NA - SUG:NA -- DESC: Enable compilation of the feature memory fault prediction based on - corrected error. - -* Thu Dec 2 2021 tanxiaofei - 0.6.6-9 -- Type:feature +- DESC: + 1. Modify recording Hisilicon common error data in the rasdaemon and + 2. In the ras-mc-ctl, + 2.1. Improve Hisilicon common error statistics. + 2.2. Add support to display the HiSilicon vendor-errors for a specified module. + 2.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options. + 2.4. Reformat error info of the HiSilicon Kunpeng920. + 2.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. + +* Wed Mar 2 2022 tanxiaofei - 0.6.7-3 +- Type:bugfix - ID:NA - SUG:NA -- DESC: Backport memory failure feature, one patch. +- DESC: + 1. Backport 4 patches from openEuler master branch. + 1) Fix the issue of sprintf data type mismatch in uuid_le() + 2) Fix the issue of command option -r for hip08 + 3) Fix some print format issues for hisi common error section + 4) Add some modules supported by hisi common error section + 2.Enable compilation of the feature memory fault prediction based on + corrected error. + 3.Fix changelog date error of this spec file. -* Wed Oct 27 2021 luoshengwei - 0.6.6-8 +* Wed Feb 23 2022 luoshengwei - 0.6.7-2 - Type:feature - ID:NA - SUG:NA -- DESC: Sync three patches, add cpu online fault isolation. +- DESC: Add cpu online fault isolation for arm event. -* Wed Oct 20 2021 tanxiaofei - 0.6.6-7 -- Type:Bugfix -- ID:NA -- SUG:NA -- DESC: Backport one patch, and some little fixes and add some modules - support for kunpeng series: - 1. Modify non-standard error decoding interface using linked list - 2. Fix the issue of sprintf data type mismatch in uuid_le() - 3. Fix the issue of command option -r for hip08 - 4. Fix some print format issues for hisi common error section - 5. Add some modules supported by hisi common error section - -* Sat July 29 2021 tanxiaofei - 0.6.6-6 +* Wed Dec 8 2021 xujing - 0.6.7-1 +- Update software to v0.6.7 + +* Thu Jul 29 2021 tanxiaofei - 0.6.6-6 - Type:feature - ID:NA - SUG:NA -- Gitee