diff --git a/0001-rasdaemon-Fix-for-regression-in-ras_mc_create_table-.patch b/0001-rasdaemon-Fix-for-regression-in-ras_mc_create_table-.patch new file mode 100644 index 0000000000000000000000000000000000000000..64f60f6b8f315fc823b2cc72aa427e0d094d0649 --- /dev/null +++ b/0001-rasdaemon-Fix-for-regression-in-ras_mc_create_table-.patch @@ -0,0 +1,165 @@ +From e53389e7d7bd805900386b979fb3d48f1e79a7bc Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Sun, 5 Mar 2023 23:14:42 +0000 +Subject: [PATCH] rasdaemon: Fix for regression in ras_mc_create_table() if + some cpus are offline at the system start + +Issues: +Regression in the ras_mc_create_table() if some of the cpus are offline +at the system start when rasdaemon is run. This issue is +reproducible in ras_mc_create_table() with decode and record +non-standard events and reproducible sometimes with +ras_mc_create_table() for the standard events. +Also in the multi thread way, there is a memory leak in ras_mc_event_opendb() +as struct sqlite3_priv *priv and sqlite3 *db are allocated/initialized per +thread, but stored in the common struct ras_events ras in pthread data, +which is shared across the threads. + +Reason: +When the system starts with some of the cpus offline and rasdaemon is +then run, read_ras_event_all_cpus() exits with an error and switches to +the multi thread way. However, read() in read_ras_event() returns an error in +the threads for each of the offline CPUs, and each such thread does its +clean up, including calling ras_mc_event_closedb(). +Since the 'struct ras_events ras' passed in the pthread_data to each of the +threads is common, struct sqlite3_priv *priv and sqlite3 *db allocated/ +initialized per thread and stored in the common 'struct ras_events ras', +are getting overwritten in each ras_mc_event_opendb() (which is called from +the pthread per cpu), resulting in a memory leak. 
Also, when ras_mc_event_closedb() +is called in the above error case from the threads corresponding to the +offline cpus, it closes the sqlite3 *db and frees the sqlite3_priv *priv stored +in the common 'struct ras_events ras', resulting in a regression when accessing +priv->db in the ras_mc_create_table() from another context later. + +Proposed solution: +In ras_mc_event_opendb(), allocate struct sqlite3_priv *priv, +init sqlite3 *db and create tables common for the threads with shared +'struct ras_events ras' based on a reference count and free them in the +same way. +Also protect the critical code in ras_mc_event_opendb() and ras_mc_event_closedb() +using a mutex in the multi thread case from any regression caused by +thread pre-emption. + +Reported-by: Lei Feng +Signed-off-by: Shiju Jose +--- + ras-events.c | 16 +++++++++++++++- + ras-events.h | 4 +++- + ras-record.c | 12 ++++++++++++ + 3 files changed, 30 insertions(+), 2 deletions(-) + +diff --git a/ras-events.c b/ras-events.c +index 49e4f9a..5fe8e19 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -625,19 +625,25 @@ static void *handle_ras_events_cpu(void *priv) + + log(TERM, LOG_INFO, "Listening to events on cpu %d\n", pdata->cpu); + if (pdata->ras->record_events) { ++ pthread_mutex_lock(&pdata->ras->db_lock); + if (ras_mc_event_opendb(pdata->cpu, pdata->ras)) { ++ pthread_mutex_unlock(&pdata->ras->db_lock); + log(TERM, LOG_ERR, "Can't open database\n"); + close(fd); + kbuffer_free(kbuf); + free(page); + return 0; + } ++ pthread_mutex_unlock(&pdata->ras->db_lock); + } + + read_ras_event(fd, pdata, kbuf, page); + +- if (pdata->ras->record_events) ++ if (pdata->ras->record_events) { ++ pthread_mutex_lock(&pdata->ras->db_lock); + ras_mc_event_closedb(pdata->cpu, pdata->ras); ++ pthread_mutex_unlock(&pdata->ras->db_lock); ++ } + + close(fd); + kbuffer_free(kbuf); +@@ -993,6 +999,11 @@ int handle_ras_events(int record_events) + + /* Poll doesn't work on this kernel. 
Fallback to pthread way */ + if (rc == -255) { ++ if (pthread_mutex_init(&ras->db_lock, NULL) != 0) { ++ log(SYSLOG, LOG_INFO, "sqlite db lock init has failed\n"); ++ goto err; ++ } ++ + log(SYSLOG, LOG_INFO, + "Opening one thread per cpu (%d threads)\n", cpus); + for (i = 0; i < cpus; i++) { +@@ -1005,6 +1016,8 @@ int handle_ras_events(int record_events) + i); + while (--i) + pthread_cancel(data[i].thread); ++ ++ pthread_mutex_destroy(&ras->db_lock); + goto err; + } + } +@@ -1012,6 +1025,7 @@ int handle_ras_events(int record_events) + /* Wait for all threads to complete */ + for (i = 0; i < cpus; i++) + pthread_join(data[i].thread, NULL); ++ pthread_mutex_destroy(&ras->db_lock); + } + + log(SYSLOG, LOG_INFO, "Huh! something got wrong. Aborting.\n"); +diff --git a/ras-events.h b/ras-events.h +index 6c9f507..649b0c0 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -56,7 +56,9 @@ struct ras_events { + time_t uptime_diff; + + /* For ras-record */ +- void *db_priv; ++ void *db_priv; ++ int db_ref_count; ++ pthread_mutex_t db_lock; + + /* For the mce handler */ + struct mce_priv *mce_priv; +diff --git a/ras-record.c b/ras-record.c +index a367939..adc97a4 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -763,6 +763,10 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + + printf("Calling %s()\n", __FUNCTION__); + ++ ras->db_ref_count++; ++ if (ras->db_ref_count > 1) ++ return 0; ++ + ras->db_priv = NULL; + + priv = calloc(1, sizeof(*priv)); +@@ -912,6 +916,13 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + + printf("Calling %s()\n", __func__); + ++ if (ras->db_ref_count > 0) ++ ras->db_ref_count--; ++ else ++ return -1; ++ if (ras->db_ref_count > 0) ++ return 0; ++ + if (!priv) + return -1; + +@@ -1018,6 +1029,7 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + log(TERM, LOG_ERR, + "cpu %u: Failed to shutdown sqlite: error = %d\n", cpu, rc); + free(priv); ++ ras->db_priv = NULL; + + return 0; + } +-- +2.25.1 
+ diff --git a/0006-add-cpu-online-fault-isolation.patch b/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch similarity index 31% rename from 0006-add-cpu-online-fault-isolation.patch rename to 0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch index b796a48845963d1c62298e5f372bc1045b4496a5..d17fb219aa2ea0c9f854ef889e2064511c981b27 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch @@ -1,50 +1,40 @@ -From 94f9581a6b398f178fcabf0fde2cce7eebb15ea7 Mon Sep 17 00:00:00 2001 -From: Lostwayzxc -Date: Tue, 25 May 2021 20:05:49 +0800 -Subject: [PATCH 1/2] add cpu online fault isolation +From b9999d40d73dfff8b1cfb515f3b81b2c2891f6a7 Mon Sep 17 00:00:00 2001 +From: Shengwei Luo +Date: Wed, 23 Feb 2022 17:21:58 +0800 +Subject: [PATCH 01/10] rasdaemon: Support cpu fault isolation for corrected + errors -Add cpu online fault isolation, when CE/UCE occurs, we choose to offline -the error cpu according to threshold algorithm. +When the corrected errors exceed the set limit in cycle, try to +offline the related cpu core. 
-Signed-off-by: Luo Shengwei +Signed-off-by: Shengwei Luo +Signed-off-by: Junchong Pan +Signed-off-by: Lei Feng +Signed-off-by: Shiju Jose --- - .travis.yml | 2 +- Makefile.am | 6 +- - configure.ac | 11 + + configure.ac | 11 ++ misc/rasdaemon.env | 17 ++ - queue.c | 126 +++++++++++ - queue.h | 43 ++++ - ras-arm-handler.c | 73 +++++++ - ras-cpu-isolation.c | 499 ++++++++++++++++++++++++++++++++++++++++++++ - ras-cpu-isolation.h | 76 +++++++ - ras-events.c | 8 + - ras-record.h | 5 + - 11 files changed, 864 insertions(+), 2 deletions(-) + queue.c | 119 ++++++++++++++ + queue.h | 39 +++++ + ras-arm-handler.c | 97 +++++++++++ + ras-arm-handler.h | 18 ++ + ras-cpu-isolation.c | 388 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.h | 68 ++++++++ + ras-events.c | 9 +- + 10 files changed, 770 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c create mode 100644 ras-cpu-isolation.h -diff --git a/.travis.yml b/.travis.yml -index 79cf4ca..5ab3957 100644 ---- a/.travis.yml -+++ b/.travis.yml -@@ -20,7 +20,7 @@ before_install: - - sudo apt-get install -y sqlite3 - install: - - autoreconf -vfi --- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa -+- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa --enable-cpu-fault-isolation - - script: - - make && sudo make install diff --git a/Makefile.am b/Makefile.am -index f4822b9..6431dd3 100644 +index a322b9a..36e7d4e 100644 --- a/Makefile.am +++ b/Makefile.am -@@ -57,12 +57,16 @@ endif - if WITH_MEMORY_CE_PFA - rasdaemon_SOURCES += rbtree.c ras-page-isolation.c +@@ -69,13 +69,17 @@ endif + if WITH_AMP_NS_DECODE + rasdaemon_SOURCES += 
non-standard-ampere.c endif +if WITH_CPU_FAULT_ISOLATION + rasdaemon_SOURCES += ras-cpu-isolation.c queue.c @@ -54,19 +44,20 @@ index f4822b9..6431dd3 100644 include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ -- ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h -+ ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ +- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h ++ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cpu-isolation.h queue.h # This rule can't be called with more than one Makefile job (like make -j8) # I can't figure out a way to fix that diff --git a/configure.ac b/configure.ac -index 2d6c59c..a682bb9 100644 +index a77991f..e0ed751 100644 --- a/configure.ac +++ b/configure.ac -@@ -141,6 +141,16 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" == "xyes"], - AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes || test x$enable_all == xyes]) - AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"]) +@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], + AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"]) +AC_ARG_ENABLE([cpu_fault_isolation], + AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation])) @@ -81,14 +72,14 @@ index 2d6c59c..a682bb9 100644 test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" -@@ -173,4 +183,5 @@ compile time options summary - 
DEVLINK : $USE_DEVLINK - Disk I/O errors : $USE_DISKERROR +@@ -201,4 +211,5 @@ compile time options summary + Memory Failure : $USE_MEMORY_FAILURE Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE + CPU fault isolation : $USE_CPU_FAULT_ISOLATION EOF diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..3191d03 100644 +index 12fd766..7cb18e8 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env @@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50" @@ -112,12 +103,13 @@ index 12fd766..3191d03 100644 + +# Prevent excessive isolation from causing an avalanche effect +CPU_ISOLATION_LIMIT="10" +\ No newline at end of file diff --git a/queue.c b/queue.c new file mode 100644 -index 0000000..92f3d3c +index 0000000..65b6fb8 --- /dev/null +++ b/queue.c -@@ -0,0 +1,126 @@ +@@ -0,0 +1,119 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -130,126 +122,119 @@ index 0000000..92f3d3c + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
-+*/ ++ */ +#include +#include +#include "queue.h" +#include "ras-logger.h" + -+ +int is_empty(struct link_queue *queue) +{ -+ if (queue) { -+ return queue->size == 0; -+ } ++ if (queue) ++ return queue->size == 0; + -+ return 1; ++ return 1; +} + -+struct link_queue* init_queue(void) ++struct link_queue *init_queue(void) +{ -+ struct link_queue* queue; -+ queue = (struct link_queue*) malloc(sizeof(struct link_queue)); ++ struct link_queue *queue = NULL; + -+ if (queue == NULL) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); -+ return NULL; -+ } ++ queue = (struct link_queue *)malloc(sizeof(struct link_queue)); ++ if (queue == NULL) { ++ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); ++ return NULL; ++ } + -+ queue->size = 0; -+ queue->head = NULL; -+ queue->tail = NULL; ++ queue->size = 0; ++ queue->head = NULL; ++ queue->tail = NULL; + -+ return queue; ++ return queue; +} + +void clear_queue(struct link_queue *queue) +{ -+ if (queue == NULL) { -+ return; -+ } -+ -+ struct queue_node *node = queue->head; -+ struct queue_node *tmp = NULL; -+ -+ while (node != NULL) { -+ tmp = node; -+ node = node->next; -+ free(tmp); -+ } -+ -+ queue->head = NULL; -+ queue->tail = NULL; -+ queue->size = 0; ++ if (queue == NULL) ++ return; ++ ++ struct queue_node *node = queue->head; ++ struct queue_node *tmp = NULL; ++ ++ while (node != NULL) { ++ tmp = node; ++ node = node->next; ++ free(tmp); ++ } ++ ++ queue->head = NULL; ++ queue->tail = NULL; ++ queue->size = 0; +} + -+void free_queue(struct link_queue *queue) { -+ clear_queue(queue); ++void free_queue(struct link_queue *queue) ++{ ++ clear_queue(queue); + -+ if (queue) { -+ free(queue); -+ } ++ if (queue) ++ free(queue); +} + +/* It should be guranteed that the param is not NULL */ +void push(struct link_queue *queue, struct queue_node *node) +{ -+ /* there is no element in the queue */ -+ if (queue->head == NULL) { -+ queue->head = node; -+ } -+ else { -+ node->next = queue->tail->next; -+ 
queue->tail->next = node; -+ } -+ -+ queue->tail = node; -+ (queue->size)++; ++ /* there is no element in the queue */ ++ if (queue->head == NULL) ++ queue->head = node; ++ else ++ queue->tail->next = node; ++ ++ queue->tail = node; ++ (queue->size)++; +} + +int pop(struct link_queue *queue) +{ -+ if (queue == NULL || is_empty(queue)) { -+ return -1; -+ } ++ struct queue_node *tmp = NULL; + -+ struct queue_node *tmp = NULL; -+ tmp = queue->head; -+ queue->head = queue->head->next; -+ free(tmp); -+ (queue->size)--; ++ if (queue == NULL || is_empty(queue)) ++ return -1; ++ ++ tmp = queue->head; ++ queue->head = queue->head->next; ++ free(tmp); ++ (queue->size)--; + -+ return 0; ++ return 0; +} + -+struct queue_node* front(struct link_queue *queue) ++struct queue_node *front(struct link_queue *queue) +{ -+ if (queue == NULL) { -+ return NULL; -+ } ++ if (queue == NULL) ++ return NULL; + -+ return queue->head; ++ return queue->head; +} + -+struct queue_node* node_create(time_t time, unsigned value) ++struct queue_node *node_create(time_t time, unsigned int value) +{ -+ struct queue_node *node = NULL; -+ node = (struct queue_node*) malloc(sizeof(struct queue_node)); ++ struct queue_node *node = NULL; + -+ if (node != NULL) { -+ node->time = time; -+ node->value = value; -+ node->next = NULL; -+ } ++ node = (struct queue_node *)malloc(sizeof(struct queue_node)); ++ if (node != NULL) { ++ node->time = time; ++ node->value = value; ++ node->next = NULL; ++ } + -+ return node; ++ return node; +} diff --git a/queue.h b/queue.h new file mode 100644 -index 0000000..9684c58 +index 0000000..5459f40 --- /dev/null +++ b/queue.h -@@ -0,0 +1,43 @@ +@@ -0,0 +1,39 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -262,135 +247,197 @@ index 0000000..9684c58 + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
-+*/ ++ */ + +#ifndef __RAS_QUEUE_H +#define __RAS_QUEUE_H + -+ -+struct queue_node -+{ -+ time_t time; -+ unsigned value; -+ struct queue_node *next; ++struct queue_node { ++ time_t time; ++ unsigned int value; ++ struct queue_node *next; +}; + -+struct link_queue -+{ -+ struct queue_node *head; -+ struct queue_node *tail; -+ int size; ++struct link_queue { ++ struct queue_node *head; ++ struct queue_node *tail; ++ int size; +}; + +int is_empty(struct link_queue *queue); -+struct link_queue* init_queue(void); ++struct link_queue *init_queue(void); +void clear_queue(struct link_queue *queue); +void free_queue(struct link_queue *queue); +void push(struct link_queue *queue, struct queue_node *node); +int pop(struct link_queue *queue); -+struct queue_node* front(struct link_queue *queue); -+struct queue_node* node_create(time_t time, unsigned value); -+ ++struct queue_node *front(struct link_queue *queue); ++struct queue_node *node_create(time_t time, unsigned int value); + +#endif -\ No newline at end of file diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 2f170e2..10d0099 100644 +index 1149dc6..9c7a3c3 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c -@@ -20,6 +20,44 @@ - #include "ras-record.h" - #include "ras-logger.h" +@@ -22,6 +22,10 @@ #include "ras-report.h" + #include "ras-non-standard-handler.h" + #include "non-standard-ampere.h" +#include "ras-cpu-isolation.h" + ++#define ARM_ERR_VALID_ERROR_COUNT BIT(0) ++#define ARM_ERR_VALID_FLAGS BIT(1) + + void display_raw_data(struct trace_seq *s, + const uint8_t *buf, +@@ -42,6 +46,93 @@ void display_raw_data(struct trace_seq *s, + } + } + +#ifdef HAVE_CPU_FAULT_ISOLATION -+static int is_core_failure(unsigned long value) -+{ -+ /* -+ * core failure: -+ * Bit 0\1\3: (at lease 1) -+ * Bit 2: 0 -+ */ -+ return (value & 0xf) && !(value & (0x1 << 2)); -+} -+ -+static int count_errors(struct event_format *event, const uint8_t *data, int len) ++static int count_errors(struct ras_arm_event *ev) +{ -+ /* -+ * 
According to UEFI_2_9_2021_03_18 specification chapter N2.4.4, -+ * the length of struct processor error information is 32, the byte -+ * length of the Flags field is 1, and the byte offset is 7 in the struct. -+ */ -+ int cur_offset = 7; -+ unsigned long value; ++ struct ras_arm_err_info *err_info; ++ int num_pei; ++ int err_info_size = sizeof(struct ras_arm_err_info); + int num = 0; -+ if (len % PEI_ERR_SIZE != 0) { -+ log(TERM, LOG_ERR, "the event data does not match to the ARM Processor Error Information Structure\n"); ++ int i; ++ int error_count; ++ ++ if (ev->pei_len % err_info_size != 0) { ++ log(TERM, LOG_ERR, ++ "The event data does not match to the ARM Processor Error Information Structure\n"); + return num; + } -+ while (cur_offset < len) { -+ value = pevent_read_number(event->pevent, data+cur_offset, FLAGS_SIZE); -+ if (is_core_failure(value)) { -+ num++; -+ log(TERM, LOG_INFO, "Error in cpu core catched\n"); ++ num_pei = ev->pei_len / err_info_size; ++ err_info = (struct ras_arm_err_info *)(ev->pei_error); ++ ++ for (i = 0; i < num_pei; ++i) { ++ error_count = 1; ++ if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) { ++ /* ++ * The value of this field is defined as follows: ++ * 0: Single Error ++ * 1: Multiple Errors ++ * 2-65535: Error Count ++ */ ++ error_count = err_info->multiple_error + 1; + } -+ cur_offset += PEI_ERR_SIZE; ++ ++ num += error_count; ++ err_info += 1; + } ++ log(TERM, LOG_INFO, "%d error in cpu core catched\n", num); + return num; +} -+#endif - - int ras_arm_event_handler(struct trace_seq *s, - struct pevent_record *record, -@@ -78,6 +116,41 @@ int ras_arm_event_handler(struct trace_seq *s, - ev.psci_state = val; - trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); - -+#ifdef HAVE_CPU_FAULT_ISOLATION ++ ++static int ras_handle_cpu_error(struct trace_seq *s, ++ struct pevent_record *record, ++ struct event_format *event, ++ struct ras_arm_event *ev, time_t now) ++{ ++ unsigned long long val; ++ int cpu; ++ char 
*severity; ++ struct error_info err_info; ++ ++ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0) ++ return -1; ++ cpu = val; ++ trace_seq_printf(s, "\n cpu: %d", cpu); ++ + /* record cpu error */ + if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) + return -1; -+ /* refer to UEFI_2_9_2021_03_18 specification chapter N2.2 Table N-5 */ ++ /* refer to UEFI_2_9 specification chapter N2.2 Table N-5 */ + switch (val) { + case GHES_SEV_NO: -+ ev.severity = "Informational"; ++ severity = "Informational"; + break; + case GHES_SEV_CORRECTED: -+ ev.severity = "Corrected"; ++ severity = "Corrected"; + break; + case GHES_SEV_RECOVERABLE: -+ ev.severity = "Recoverable"; ++ severity = "Recoverable"; + break; + default: + case GHES_SEV_PANIC: -+ ev.severity = "Fatal"; ++ severity = "Fatal"; + } ++ trace_seq_printf(s, "\n severity: %s", severity); ++ ++ if (val == GHES_SEV_CORRECTED) { ++ int nums = count_errors(ev); + -+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { -+ int len, nums; -+ ev.error_info = pevent_get_field_raw(s, event, "buf", record, &len, 1); -+ if (!ev.error_info) -+ return -1; -+ ev.length = len; -+ /* relate to enum error_type */ -+ nums = count_errors(event, ev.error_info, len); + if (nums > 0) { -+ struct error_info err_info = {nums, now, val}; -+ ras_record_cpu_error(&err_info, ev.mpidr); ++ err_info.nums = nums; ++ err_info.time = now; ++ err_info.err_type = val; ++ ras_record_cpu_error(&err_info, cpu); + } + } ++ ++ return 0; ++} ++#endif ++ + int ras_arm_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context) +@@ -52,6 +143,7 @@ int ras_arm_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_arm_event ev; + int len = 0; ++ + memset(&ev, 0, sizeof(ev)); + + /* +@@ -139,6 +231,11 @@ int ras_arm_event_handler(struct trace_seq *s, + display_raw_data(s, ev.vsei_error, ev.oem_len); + #endif + ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ if 
(ras_handle_cpu_error(s, record, event, &ev, now) < 0) ++ return -1; +#endif + /* Insert data into the SGBD */ #ifdef HAVE_SQLITE3 ras_store_arm_record(ras, &ev); +diff --git a/ras-arm-handler.h b/ras-arm-handler.h +index 563a2d3..52813e7 100644 +--- a/ras-arm-handler.h ++++ b/ras-arm-handler.h +@@ -17,6 +17,24 @@ + #include "ras-events.h" + #include "libtrace/event-parse.h" + ++/* ++ * ARM Processor Error Information Structure, According to ++ * UEFI_2_9 specification chapter N2.4.4. ++ */ ++#pragma pack(1) ++struct ras_arm_err_info { ++ uint8_t version; ++ uint8_t length; ++ uint16_t validation_bits; ++ uint8_t type; ++ uint16_t multiple_error; ++ uint8_t flags; ++ uint64_t error_info; ++ uint64_t virt_fault_addr; ++ uint64_t physical_fault_addr; ++}; ++#pragma pack() ++ + int ras_arm_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..a809f91 +index 0000000..abcf451 --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,499 @@ +@@ -0,0 +1,388 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -403,7 +450,7 @@ index 0000000..a809f91 + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
-+*/ ++ */ + +#include +#include @@ -416,486 +463,375 @@ index 0000000..a809f91 +#include "ras-logger.h" +#include "ras-cpu-isolation.h" + -+static struct cpu_info *cpu_infos = NULL; -+static unsigned int ncores, cores_per_socket, cores_per_die; -+static unsigned int sockets, dies = 1; ++#define SECOND_OF_MON (30 * 24 * 60 * 60) ++#define SECOND_OF_DAY (24 * 60 * 60) ++#define SECOND_OF_HOU (60 * 60) ++#define SECOND_OF_MIN (60) ++ ++#define LIMIT_OF_CPU_THRESHOLD 10000 ++#define INIT_OF_CPU_THRESHOLD 18 ++#define DEC_CHECK 10 ++#define LAST_BIT_OF_UL 5 ++ ++static struct cpu_info *cpu_infos; ++static unsigned int ncores; +static unsigned int enabled = 1; +static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; -+static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; -+static const char *node_path = "/sys/devices/system/node/possible"; + +static const struct param normal_units[] = { -+ { "", 1 }, -+ {} ++ {"", 1}, ++ {} +}; + +static const struct param cycle_units[] = { -+ { "d", 24 * 60 * 60 }, -+ { "h", 60 * 60 }, -+ { "m", 60 }, -+ { "s", 1 }, -+ {} ++ {"d", SECOND_OF_DAY}, ++ {"h", SECOND_OF_HOU}, ++ {"m", SECOND_OF_MIN}, ++ {"s", 1}, ++ {} +}; + +static struct isolation_param threshold = { -+ .name = "CPU_CE_THRESHOLD", -+ .units = normal_units, -+ .value = 18, -+ .limit = 10000 ++ .name = "CPU_CE_THRESHOLD", ++ .units = normal_units, ++ .value = INIT_OF_CPU_THRESHOLD, ++ .limit = LIMIT_OF_CPU_THRESHOLD +}; + +static struct isolation_param cpu_limit = { -+ .name = "CPU_ISOLATION_LIMIT", -+ .units = normal_units ++ .name = "CPU_ISOLATION_LIMIT", ++ .units = normal_units +}; + +static struct isolation_param cycle = { -+ .name = "CPU_ISOLATION_CYCLE", -+ .units = cycle_units, -+ .value = 24 * 60 * 60, -+ .limit = 30 * 24 * 60 * 60 ++ .name = "CPU_ISOLATION_CYCLE", ++ .units = cycle_units, ++ .value = SECOND_OF_DAY, ++ .limit = SECOND_OF_MON +}; + -+static const char *cpu_state[] = { -+ 
[CPU_OFFLINE] = "offline", -+ [CPU_ONLINE] = "online", -+ [CPU_OFFLINE_FAILED] = "offline-failed", -+ [CPU_UNKNOWN] = "unknown" ++static const char * const cpu_state[] = { ++ [CPU_OFFLINE] = "offline", ++ [CPU_ONLINE] = "online", ++ [CPU_OFFLINE_FAILED] = "offline-failed", ++ [CPU_UNKNOWN] = "unknown" +}; + -+static int open_sys_file(unsigned cpu, int __oflag, const char *format) ++static int open_sys_file(unsigned int cpu, int __oflag, const char *format) +{ -+ int fd; -+ char buf[MAX_PATH_LEN] = ""; -+ snprintf(buf, sizeof(buf), format, cpu); -+ fd = open(buf, __oflag); -+ -+ if (fd == -1) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf); -+ return -1; -+ } ++ int fd; ++ char path[MAX_PATH_LEN + 1] = ""; ++ char real_path[MAX_PATH_LEN + 1] = ""; + -+ return fd; -+} ++ snprintf(path, sizeof(path), format, cpu); ++ if (strlen(path) > MAX_PATH_LEN || realpath(path, real_path) == NULL) { ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path); ++ return -1; ++ } ++ fd = open(real_path, __oflag); ++ if (fd == -1) { ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, real_path); ++ return -1; ++ } + -+static int get_sockets(void) -+{ -+ int fd, j; -+ char buf[MAX_BUF_LEN] = ""; -+ cores_per_socket = ncores; -+ struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores); -+ -+ if (!cpu_sets) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__); -+ return -1; -+ } -+ -+ for (int i = 0; i < ncores; ++i) { -+ fd = open_sys_file(i, O_RDONLY, core_siblings_list_path); -+ if (fd == -1) { -+ continue; -+ } -+ memset(buf, '\0', strlen(buf)); -+ if (read(fd, buf, sizeof(buf)) <= 0) { -+ close(fd); -+ continue; -+ } -+ for (j = 0; j < sockets; ++j) { -+ if (strcmp(cpu_sets[j].buf, buf) == 0) { -+ break; -+ } -+ } -+ if (j == sockets) { -+ strcpy(cpu_sets[sockets].buf, buf); -+ sockets++; -+ } -+ close(fd); -+ } -+ -+ free(cpu_sets); -+ cores_per_socket = sockets > 0 ? 
ncores / sockets : ncores; -+ -+ return 0; ++ return fd; +} + -+static int get_dies(void) ++static int get_cpu_status(unsigned int cpu) +{ -+ int fd, begin, end; -+ char buf[20] = ""; -+ cores_per_die = ncores; -+ fd = open(node_path, O_RDONLY); ++ int fd, num; ++ char buf[2] = ""; + -+ if (fd == -1) { -+ return -1; -+ } ++ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); ++ if (fd == -1) ++ return CPU_UNKNOWN; + -+ if (read(fd, buf, sizeof(buf))) { -+ if (sscanf(buf, "%d-%d", &begin, &end) == 2) { -+ dies = end > begin ? end - begin + 1 : 1; -+ } -+ } ++ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1) ++ num = CPU_UNKNOWN; + -+ close(fd); -+ cores_per_die = ncores / dies; ++ close(fd); + -+ return 0; ++ return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num; +} + -+static int get_cpu_status(unsigned cpu) ++static int init_cpu_info(unsigned int cpus) +{ -+ int fd, num; -+ char buf[2] = ""; -+ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); -+ -+ if (fd == -1) { -+ return CPU_UNKNOWN; -+ } -+ -+ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1) { -+ num = CPU_UNKNOWN; -+ } ++ ncores = cpus; ++ cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus); ++ if (!cpu_infos) { ++ log(TERM, LOG_ERR, ++ "Failed to allocate memory for cpu infos in %s.\n", __func__); ++ return -1; ++ } + -+ close(fd); ++ for (unsigned int i = 0; i < cpus; ++i) { ++ cpu_infos[i].ce_nums = 0; ++ cpu_infos[i].state = get_cpu_status(i); ++ cpu_infos[i].ce_queue = init_queue(); + -+ return (num < 0 || num > CPU_UNKNOWN) ? 
CPU_UNKNOWN : num; -+} ++ if (cpu_infos[i].ce_queue == NULL) { ++ log(TERM, LOG_ERR, ++ "Failed to allocate memory for cpu ce queue in %s.\n", __func__); ++ return -1; ++ } ++ } ++ /* set limit of offlined cpu limit according to number of cpu */ ++ cpu_limit.limit = cpus - 1; ++ cpu_limit.value = 0; + -+static int init_cpu_info(unsigned cpus) -+{ -+ ncores = cpus; -+ cpu_infos = (struct cpu_info *) malloc(sizeof(*cpu_infos) * cpus); -+ -+ if (!cpu_infos) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for cpu infos in %s.\n", __func__); -+ return -1; -+ } -+ -+ for (unsigned int i = 0; i < cpus; ++i) { -+ cpu_infos[i].state = get_cpu_status(i); -+ cpu_infos[i].ce_queue = init_queue(); -+ if (cpu_infos[i].ce_queue == NULL) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for cpu ce queue in %s.\n", __func__); -+ return -1; -+ } -+ } -+ /* set limit of offlined cpu limit according to number of cpu */ -+ cpu_limit.limit = cpus - 1; -+ cpu_limit.value = 0; -+ -+ if (get_sockets() < 0 || get_dies() < 0) { -+ log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n"); -+ return -1; -+ } -+ -+ return 0; ++ return 0; +} + +static void check_config(struct isolation_param *config) +{ -+ if (config->value > config->limit) { -+ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", -+ config->value, config->limit); -+ config->value = config->limit; -+ } ++ if (config->value > config->limit) { ++ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", ++ config->value, config->limit); ++ config->value = config->limit; ++ } +} + +static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value) +{ -+ int env_size, has_unit = 0; -+ -+ if (!env || strlen(env) == 0) { -+ return -1; -+ } -+ -+ env_size = strlen(env); -+ char *unit = NULL; -+ unit = env + env_size - 1; -+ -+ if (isalpha(*unit)) { -+ has_unit = 1; -+ env_size--; -+ if (env_size <= 0) { -+ return -1; -+ } -+ } -+ -+ for (int i = 0; i < env_size; 
++i) { -+ if (isdigit(env[i])) { -+ if (*value > ULONG_MAX / 10 || (*value == ULONG_MAX / 10 && env[i] - '0' > 5)) { -+ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = 10 * (*value) + (env[i] - '0'); -+ } -+ else { -+ return -1; -+ } -+ } -+ -+ if (has_unit) { -+ for (const struct param *units = config->units; units->name; units++) { -+ /* value character and unit character are both valid */ -+ if (!strcasecmp(unit, units->name)) { -+ if (*value > (ULONG_MAX / units->value)) { -+ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = (*value) * units->value; -+ return 0; -+ } -+ } -+ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); -+ return -1; -+ } -+ -+ return 0; ++ char *unit = NULL; ++ int env_size, has_unit = 0; ++ ++ if (!env || strlen(env) == 0) ++ return -1; ++ ++ env_size = strlen(env); ++ unit = env + env_size - 1; ++ ++ if (isalpha(*unit)) { ++ has_unit = 1; ++ env_size--; ++ if (env_size <= 0) ++ return -1; ++ } ++ ++ for (int i = 0; i < env_size; ++i) { ++ if (isdigit(env[i])) { ++ if (*value > ULONG_MAX / DEC_CHECK || ++ (*value == ULONG_MAX / DEC_CHECK && env[i] - '0' > LAST_BIT_OF_UL)) { ++ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; ++ } ++ *value = DEC_CHECK * (*value) + (env[i] - '0'); ++ } else ++ return -1; ++ } ++ ++ if (!has_unit) ++ return 0; ++ ++ for (const struct param *units = config->units; units->name; units++) { ++ /* value character and unit character are both valid */ ++ if (!strcasecmp(unit, units->name)) { ++ if (*value > (ULONG_MAX / units->value)) { ++ log(TERM, LOG_ERR, ++ "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; ++ } ++ *value = (*value) * units->value; ++ return 0; ++ } ++ } ++ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); ++ return -1; +} + +static void init_config(struct isolation_param *config) +{ -+ char *env = getenv(config->name); -+ unsigned long value = 0; ++ char *env = 
getenv(config->name); ++ unsigned long value = 0; + -+ if (parse_ul_config(config, env, &value) < 0) { -+ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %ld.\n", -+ config->name, env, config->value); -+ return; -+ } ++ if (parse_ul_config(config, env, &value) < 0) { ++ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %lu.\n", ++ config->name, env, config->value); ++ return; ++ } + -+ config->value = value; -+ check_config(config); ++ config->value = value; ++ check_config(config); +} + +static int check_config_status(void) +{ -+ char *env = getenv("CPU_ISOLATION_ENABLE"); ++ char *env = getenv("CPU_ISOLATION_ENABLE"); + -+ if (env == NULL || strcasecmp(env, "yes")) { -+ return -1; -+ } ++ if (env == NULL || strcasecmp(env, "yes")) ++ return -1; + -+ return 0; ++ return 0; +} + -+void ras_error_count_init(unsigned cpus) ++void ras_cpu_isolation_init(unsigned int cpus) +{ -+ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { -+ enabled = 0; -+ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n"); -+ return; -+ } -+ -+ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n"); -+ init_config(&threshold); -+ init_config(&cpu_limit); -+ init_config(&cycle); ++ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { ++ enabled = 0; ++ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n"); ++ return; ++ } ++ ++ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n"); ++ init_config(&threshold); ++ init_config(&cpu_limit); ++ init_config(&cycle); +} + +void cpu_infos_free(void) +{ -+ if (cpu_infos) { -+ for (int i = 0; i < ncores; ++i) { -+ free_queue(cpu_infos[i].ce_queue); -+ } -+ free(cpu_infos); -+ } ++ if (cpu_infos) { ++ for (int i = 0; i < ncores; ++i) ++ free_queue(cpu_infos[i].ce_queue); ++ ++ free(cpu_infos); ++ } +} + -+static int do_cpu_offline(unsigned cpu) ++static int do_cpu_offline(unsigned int cpu) +{ -+ int fd, rc; -+ char buf[2] = ""; -+ cpu_infos[cpu].state = CPU_OFFLINE_FAILED; -+ fd = open_sys_file(cpu, O_RDWR, 
cpu_path_format); -+ -+ if (fd == -1) { -+ return HANDLE_FAILED; -+ } -+ -+ strcpy(buf, "0"); -+ rc = write(fd, buf, strlen(buf)); -+ -+ if (rc < 0) { -+ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); -+ close(fd); -+ return HANDLE_FAILED; -+ } ++ int fd, rc; ++ char buf[2] = ""; ++ ++ cpu_infos[cpu].state = CPU_OFFLINE_FAILED; ++ fd = open_sys_file(cpu, O_RDWR, cpu_path_format); ++ if (fd == -1) ++ return HANDLE_FAILED; ++ ++ strcpy(buf, "0"); ++ rc = write(fd, buf, strlen(buf)); ++ if (rc < 0) { ++ log(TERM, LOG_ERR, "cpu%u offline failed, errno:%d\n", cpu, errno); ++ close(fd); ++ return HANDLE_FAILED; ++ } + -+ close(fd); -+ /* check wthether the cpu is isolated successfully */ -+ cpu_infos[cpu].state = get_cpu_status(cpu); ++ close(fd); ++ /* check wthether the cpu is isolated successfully */ ++ cpu_infos[cpu].state = get_cpu_status(cpu); + -+ if (cpu_infos[cpu].state == CPU_OFFLINE) { -+ return HANDLE_SUCCEED; -+ } ++ if (cpu_infos[cpu].state == CPU_OFFLINE) ++ return HANDLE_SUCCEED; + -+ return HANDLE_FAILED; ++ return HANDLE_FAILED; +} + -+static int do_ce_handler(unsigned cpu) ++static int do_ce_handler(unsigned int cpu) +{ -+ struct link_queue *queue = cpu_infos[cpu].ce_queue; -+ unsigned tmp; -+ /* -+ * Since we just count all error numbers in setted cycle, we store the time -+ * and error numbers from current event to the queue, then everytime we -+ * calculate the period from beginning time to ending time, if the period -+ * exceeds setted cycle, we pop the beginning time and error until the period -+ * from new beginning time to ending time is less than cycle. 
-+ */ -+ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { -+ tmp = queue->head->value; -+ if (pop(queue) == 0) { -+ cpu_infos[cpu].ce_nums -= tmp; -+ } -+ } -+ -+ if (cpu_infos[cpu].ce_nums >= threshold.value) { -+ log(TERM, LOG_INFO, "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", -+ threshold.value, cpu); -+ return do_cpu_offline(cpu); -+ } -+ return HANDLE_NOTHING; ++ struct link_queue *queue = cpu_infos[cpu].ce_queue; ++ unsigned int tmp; ++ /* ++ * Since we just count all error numbers in setted cycle, we store the time ++ * and error numbers from current event to the queue, then everytime we ++ * calculate the period from beginning time to ending time, if the period ++ * exceeds setted cycle, we pop the beginning time and error until the period ++ * from new beginning time to ending time is less than cycle. ++ */ ++ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { ++ tmp = queue->head->value; ++ if (pop(queue) == 0) ++ cpu_infos[cpu].ce_nums -= tmp; ++ } ++ log(TERM, LOG_INFO, ++ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n", ++ cpu, cpu_infos[cpu].ce_nums); ++ ++ if (cpu_infos[cpu].ce_nums >= threshold.value) { ++ log(TERM, LOG_INFO, ++ "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n", ++ threshold.value, cpu); ++ return do_cpu_offline(cpu); ++ } ++ return HANDLE_NOTHING; +} + -+static int do_uce_handler(unsigned cpu) ++static int error_handler(unsigned int cpu, struct error_info *err_info) +{ -+ if (cpu_infos[cpu].uce_nums > 0) { -+ log(TERM, LOG_INFO, "Uncorrected Errors occured, try to offline cpu%d\n", cpu); -+ return do_cpu_offline(cpu); -+ } -+ return HANDLE_NOTHING; -+} ++ int ret = HANDLE_NOTHING; + -+static int error_handler(unsigned cpu, struct error_info *err_info) -+{ -+ int ret = HANDLE_NOTHING; -+ -+ switch (err_info->err_type) -+ { -+ case CE: -+ ret = do_ce_handler(cpu); -+ break; -+ case UCE: -+ ret 
= do_uce_handler(cpu); -+ break; -+ default: -+ break; -+ } -+ -+ return ret; ++ switch (err_info->err_type) { ++ case CE: ++ ret = do_ce_handler(cpu); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; +} + -+static void record_error_info(unsigned cpu, struct error_info *err_info) ++static void record_error_info(unsigned int cpu, struct error_info *err_info) +{ -+ switch (err_info->err_type) -+ { -+ case CE: -+ { -+ struct queue_node *node = NULL; -+ node = node_create(err_info->time, err_info->nums); -+ if (node == NULL) { -+ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); -+ return; -+ } -+ push(cpu_infos[cpu].ce_queue, node); -+ cpu_infos[cpu].ce_nums += err_info->nums; -+ break; -+ } -+ case UCE: -+ cpu_infos[cpu].uce_nums++; -+ break; -+ default: -+ break; -+ } ++ switch (err_info->err_type) { ++ case CE: ++ { ++ struct queue_node *node = node_create(err_info->time, err_info->nums); ++ ++ if (node == NULL) { ++ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); ++ return; ++ } ++ push(cpu_infos[cpu].ce_queue, node); ++ cpu_infos[cpu].ce_nums += err_info->nums; ++ break; ++ } ++ default: ++ break; ++ } +} + -+static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size) ++void ras_record_cpu_error(struct error_info *err_info, int cpu) +{ -+ value >>= offset; -+ unsigned long res = 0; -+ int i = 0; ++ int ret; + -+ while (i < size) { -+ res |= (value & (0x1 << (i++))); -+ } ++ if (enabled == 0) ++ return; + -+ return res; -+} ++ if (cpu >= ncores || cpu < 0) { ++ log(TERM, LOG_ERR, ++ "The current cpu %d has exceed the total number of cpu:%u\n", cpu, ncores); ++ return; ++ } + -+static unsigned get_cpu_index(int64_t mpidr) -+{ -+ unsigned core_id, socket_id, die_id, cpu; -+ /* -+ * Adapt to certain BIOS -+ * In the MPIDR: -+ * bit 8:15: core id -+ * bit 19:20: die_id -+ * bit 21:22: socket_id -+ */ -+ core_id = get_bit_value(mpidr, 8, 8); -+ socket_id = get_bit_value(mpidr, 21, 2); -+ die_id = 
get_bit_value(mpidr, 19, 2); -+ cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die; -+ -+ return cpu; -+} ++ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); ++ cpu_infos[cpu].state = get_cpu_status(cpu); + -+void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr) -+{ -+ unsigned cpu; -+ int ret; -+ -+ if (enabled == 0) { -+ return; -+ } -+ -+ cpu = get_cpu_index(mpidr); -+ -+ if (cpu >= ncores) { -+ log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); -+ return; -+ } -+ -+ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); -+ cpu_infos[cpu].state = get_cpu_status(cpu); -+ -+ if (cpu_infos[cpu].state != CPU_ONLINE) { -+ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu); -+ return; -+ } -+ -+ record_error_info(cpu, err_info); -+ /* Since user may change cpu state, we get current offlined cpu numbers every recording time. */ -+ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { -+ log(TERM, LOG_WARNING, "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", -+ cpu_limit.value); -+ return; -+ } -+ -+ ret = error_handler(cpu, err_info); -+ -+ if (ret == HANDLE_NOTHING) { -+ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); -+ } -+ else if (ret == HANDLE_SUCCEED) { -+ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); -+ clear_queue(cpu_infos[cpu].ce_queue); -+ } -+ else { -+ log(TERM, LOG_INFO, "Offline cpu%d fail, the state is %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); -+ } -+ -+ return; ++ if (cpu_infos[cpu].state != CPU_ONLINE) { ++ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu); ++ return; ++ } ++ ++ record_error_info(cpu, err_info); ++ /* ++ * Since user may change cpu state, we get current offlined ++ * cpu numbers every recording time. 
++ */ ++ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { ++ log(TERM, LOG_WARNING, ++ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", ++ cpu_limit.value); ++ return; ++ } ++ ++ ret = error_handler(cpu, err_info); ++ if (ret == HANDLE_NOTHING) ++ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); ++ else if (ret == HANDLE_SUCCEED) { ++ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); ++ clear_queue(cpu_infos[cpu].ce_queue); ++ cpu_infos[cpu].ce_nums = 0; ++ } else ++ log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); +} diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h new file mode 100644 -index 0000000..a7d3fdb +index 0000000..1159853 --- /dev/null +++ b/ras-cpu-isolation.h -@@ -0,0 +1,76 @@ +@@ -0,0 +1,68 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -908,7 +844,7 @@ index 0000000..a7d3fdb + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
-+*/ ++ */ + +#ifndef __RAS_CPU_ISOLATION_H +#define __RAS_CPU_ISOLATION_H @@ -917,67 +853,58 @@ index 0000000..a7d3fdb + +#define MAX_PATH_LEN 100 +#define MAX_BUF_LEN 1024 -+#define PEI_ERR_SIZE 32 -+#define FLAGS_SIZE 1 + +struct param { -+ char *name; -+ unsigned long value; ++ char *name; ++ unsigned long value; +}; + +struct isolation_param { -+ char *name; -+ const struct param *units; -+ unsigned long value; -+ unsigned long limit; ++ char *name; ++ const struct param *units; ++ unsigned long value; ++ unsigned long limit; +}; + +enum cpu_state { -+ CPU_OFFLINE, -+ CPU_ONLINE, -+ CPU_OFFLINE_FAILED, -+ CPU_UNKNOWN, ++ CPU_OFFLINE, ++ CPU_ONLINE, ++ CPU_OFFLINE_FAILED, ++ CPU_UNKNOWN, +}; + +enum error_handle_result { -+ HANDLE_FAILED = -1, -+ HANDLE_SUCCEED, -+ HANDLE_NOTHING, ++ HANDLE_FAILED = -1, ++ HANDLE_SUCCEED, ++ HANDLE_NOTHING, +}; + +enum error_type { -+ CE = 1, -+ UCE ++ CE = 1 +}; + +struct cpu_info { -+ unsigned long uce_nums; -+ unsigned long ce_nums; -+ struct link_queue *ce_queue; -+ enum cpu_state state; ++ unsigned long ce_nums; ++ struct link_queue *ce_queue; ++ enum cpu_state state; +}; + +struct error_info { -+ unsigned long nums; -+ time_t time; -+ enum error_type err_type; -+}; -+ -+struct cpu_set { -+ char buf[MAX_BUF_LEN]; ++ unsigned long nums; ++ time_t time; ++ enum error_type err_type; +}; + -+void ras_error_count_init(unsigned cpus); -+void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); ++void ras_cpu_isolation_init(unsigned int cpus); ++void ras_record_cpu_error(struct error_info *err_info, int cpu); +void cpu_infos_free(void); + +#endif -\ No newline at end of file diff --git a/ras-events.c b/ras-events.c -index 471d25d..31c4170 100644 +index 39cab20..beda655 100644 --- a/ras-events.c +++ b/ras-events.c -@@ -40,6 +40,7 @@ +@@ -42,6 +42,7 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-page-isolation.h" @@ -985,43 +912,27 @@ index 471d25d..31c4170 100644 /* * Polling time, if read() doesn't 
block. Currently, trace_pipe_raw never -@@ -874,6 +875,10 @@ int handle_ras_events(int record_events) +@@ -856,6 +857,10 @@ int handle_ras_events(int record_events) cpus = get_num_cpus(ras); +#ifdef HAVE_CPU_FAULT_ISOLATION -+ ras_error_count_init(cpus); ++ ras_cpu_isolation_init(cpus); +#endif + #ifdef HAVE_MCE rc = register_mce_handler(ras, cpus); if (rc) -@@ -990,6 +995,9 @@ err: +@@ -982,6 +987,8 @@ err: } free(ras); } +- +#ifdef HAVE_CPU_FAULT_ISOLATION + cpu_infos_free(); +#endif - return rc; } -diff --git a/ras-record.h b/ras-record.h -index cc217a9..b453f83 100644 ---- a/ras-record.h -+++ b/ras-record.h -@@ -77,6 +77,11 @@ struct ras_arm_event { - int64_t midr; - int32_t running_state; - int32_t psci_state; -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ const char *severity; -+ const uint8_t *error_info; -+ uint32_t length; -+#endif - }; - - struct devlink_event { -- -2.27.0 +2.25.1 diff --git a/0001-rasdaemon-fix-return-value-type-issue-of-read-write-.patch b/0001-rasdaemon-fix-return-value-type-issue-of-read-write-.patch new file mode 100644 index 0000000000000000000000000000000000000000..3012f6b81487491d54d7420e8f580d890e34bcde --- /dev/null +++ b/0001-rasdaemon-fix-return-value-type-issue-of-read-write-.patch @@ -0,0 +1,95 @@ +From 2eea64bc7437b0a5dabff52632a372446ddc4765 Mon Sep 17 00:00:00 2001 +From: Xiaofei Tan +Date: Thu, 11 May 2023 10:54:26 +0800 +Subject: [PATCH 1/3] rasdaemon: fix return value type issue of read/write + function from unistd.h + +The return value type of read/write function from unistd.h is ssize_t. +It's signed normally, and return -1 on error. Fix incorrect use in the +function read_ras_event_all_cpus(). + +BTW, make setting buffer_percent as a separate function. 
+ +Fixes: 94750bcf9309 ("rasdaemon: Fix poll() on per_cpu trace_pipe_raw blocks indefinitely") +Signed-off-by: Xiaofei Tan +Signed-off-by: Shiju Jose +--- + ras-events.c | 45 ++++++++++++++++++++++++++++++--------------- + 1 file changed, 30 insertions(+), 15 deletions(-) + +diff --git a/ras-events.c b/ras-events.c +index 6e928a3..d08bf37 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -376,10 +376,37 @@ static int get_num_cpus(struct ras_events *ras) + #endif + } + ++static int set_buffer_percent(struct ras_events *ras, int percent) ++{ ++ char buf[16]; ++ ssize_t size; ++ int res = 0; ++ int fd; ++ ++ fd = open_trace(ras, "buffer_percent", O_WRONLY); ++ if (fd >= 0) { ++ /* For the backward compatibility to the old kernels, do not return ++ * if fail to set the buffer_percent. ++ */ ++ snprintf(buf, sizeof(buf), "%d", percent); ++ size = write(fd, buf, strlen(buf)); ++ if (size <= 0) { ++ log(TERM, LOG_WARNING, "can't write to buffer_percent\n"); ++ res = -1; ++ } ++ close(fd); ++ } else { ++ log(TERM, LOG_WARNING, "Can't open buffer_percent\n"); ++ res = -1; ++ } ++ ++ return res; ++} ++ + static int read_ras_event_all_cpus(struct pthread_data *pdata, + unsigned n_cpus) + { +- unsigned size; ++ ssize_t size; + unsigned long long time_stamp; + void *data; + int ready, i, count_nready; +@@ -391,8 +418,6 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + int warnonce[n_cpus]; + char pipe_raw[PATH_MAX]; + int legacy_kernel = 0; +- int fd; +- char buf[16]; + #if 0 + int need_sleep = 0; + #endif +@@ -419,18 +444,8 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + * Set buffer_percent to 0 so that poll() will return immediately + * when the trace data is available in the ras per_cpu trace pipe_raw + */ +- fd = open_trace(pdata[0].ras, "buffer_percent", O_WRONLY); +- if (fd >= 0) { +- /* For the backward compatibility to the old kernels, do not return +- * if fail to set the buffer_percent. 
+- */ +- snprintf(buf, sizeof(buf), "0"); +- size = write(fd, buf, strlen(buf)); +- if (size <= 0) +- log(TERM, LOG_WARNING, "can't write to buffer_percent\n"); +- close(fd); +- } else +- log(TERM, LOG_WARNING, "Can't open buffer_percent\n"); ++ if (set_buffer_percent(pdata[0].ras, 0)) ++ log(TERM, LOG_WARNING, "Set buffer_percent failed\n"); + + for (i = 0; i < (n_cpus + 1); i++) + fds[i].fd = -1; +-- +2.25.1 + diff --git a/0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch b/0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch new file mode 100644 index 0000000000000000000000000000000000000000..2409a5161a77d6c2f8ec984121541b1b6394d535 --- /dev/null +++ b/0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch @@ -0,0 +1,46 @@ +From: Xiaofei Tan +Date: Sat, 20 Aug 2022 09:49:25 +0000 +Subject: [PATCH] rasdaemon: use standard length PATH_MAX for path name + +Use standard length PATH_MAX for path name space allocation +to replace the macro MAX_PATH_LEN. + +Signed-off-by: Xiaofei Tan +--- + ras-cpu-isolation.c | 6 +++--- + ras-cpu-isolation.h | 1 - + 2 files changed, 3 insertions(+), 4 deletions(-) + +diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c +index ba5ccd1..24c07e9 100644 +--- a/ras-cpu-isolation.c ++++ b/ras-cpu-isolation.c +@@ -80,11 +80,11 @@ static const char * const cpu_state[] = { + static int open_sys_file(unsigned int cpu, int __oflag, const char *format) + { + int fd; +- char path[MAX_PATH_LEN + 1] = ""; +- char real_path[MAX_PATH_LEN + 1] = ""; ++ char path[PATH_MAX] = ""; ++ char real_path[PATH_MAX] = ""; + + snprintf(path, sizeof(path), format, cpu); +- if (strlen(path) > MAX_PATH_LEN || realpath(path, real_path) == NULL) { ++ if (strlen(path) > PATH_MAX || realpath(path, real_path) == NULL) { + log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path); + return -1; + } +diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h +index 024a68b..5682106 100644 +--- a/ras-cpu-isolation.h ++++ 
b/ras-cpu-isolation.h +@@ -17,7 +17,6 @@ + + #include "queue.h" + +-#define MAX_PATH_LEN 100 + #define MAX_BUF_LEN 1024 + + struct param { +-- +2.17.1 diff --git a/0002-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch b/0002-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch new file mode 100644 index 0000000000000000000000000000000000000000..85ae07d551d3fad6b7e8f52059f689b1e03cf7c6 --- /dev/null +++ b/0002-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch @@ -0,0 +1,85 @@ +From 6986d818e6d2c846c001fc7211b5a4153e5ecd11 Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Sat, 4 Feb 2023 19:15:55 +0000 +Subject: [PATCH] rasdaemon: Fix poll() on per_cpu trace_pipe_raw blocks + indefinitely + +The error events are not received in the rasdaemon since kernel 6.1-rc6. +This issue is firstly detected and reported, when testing the CXL error +events in the rasdaemon. + +Debugging showed, poll() on trace_pipe_raw in the ras-events.c do not +return and this issue is seen after the commit +42fb0a1e84ff525ebe560e2baf9451ab69127e2b ("tracing/ring-buffer: Have +polling block on watermark"). + +This issue is also verified using a test application for poll() +and select() on per_cpu trace_pipe_raw. + +There is also a bug reported on this issue, +https://lore.kernel.org/all/31eb3b12-3350-90a4-a0d9-d1494db7cf74@oracle.com/ + +This issue occurs for the per_cpu case, which calls the ring_buffer_poll_wait(), +in kernel/trace/ring_buffer.c, with the buffer_percent > 0 and then wait until +the percentage of pages are available. The default value set for the +buffer_percent is 50 in the kernel/trace/trace.c. However poll() does not return +even met the percentage of pages condition. + +As a fix, rasdaemon set buffer_percent as 0 through the +/sys/kernel/debug/tracing/instances/rasdaemon/buffer_percent, then the +task will wake up as soon as data is added to any of the specific cpu +buffer and poll() on per_cpu/cpuX/trace_pipe_raw does not block +indefinitely. 
+ +Dependency on the kernel fix commit +3e46d910d8acf94e5360126593b68bf4fee4c4a1("tracing: Fix poll() and select() +do not work on per_cpu trace_pipe and trace_pipe_raw") + +Signed-off-by: Shiju Jose +--- + ras-events.c | 22 ++++++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/ras-events.c b/ras-events.c +index 39f9ce2..49e4f9a 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -376,6 +376,8 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + int warnonce[n_cpus]; + char pipe_raw[PATH_MAX]; + int legacy_kernel = 0; ++ int fd; ++ char buf[16]; + #if 0 + int need_sleep = 0; + #endif +@@ -395,6 +397,26 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + return -ENOMEM; + } + ++ /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks ++ * indefinitely with the default buffer_percent in the kernel trace system, ++ * which is introduced by the following change in the kernel. ++ * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u. ++ * Set buffer_percent to 0 so that poll() will return immediately ++ * when the trace data is available in the ras per_cpu trace pipe_raw ++ */ ++ fd = open_trace(pdata[0].ras, "buffer_percent", O_WRONLY); ++ if (fd >= 0) { ++ /* For the backward compatibility to the old kernels, do not return ++ * if fail to set the buffer_percent. 
++ */ ++ snprintf(buf, sizeof(buf), "0"); ++ size = write(fd, buf, strlen(buf)); ++ if (size <= 0) ++ log(TERM, LOG_WARNING, "can't write to buffer_percent\n"); ++ close(fd); ++ } else ++ log(TERM, LOG_WARNING, "Can't open buffer_percent\n"); ++ + for (i = 0; i < (n_cpus + 1); i++) + fds[i].fd = -1; + +-- +2.25.1 + diff --git a/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch b/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch new file mode 100644 index 0000000000000000000000000000000000000000..e401fa99e6f3dc0f7e2a021a14e4f8dc57643ae9 --- /dev/null +++ b/0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch @@ -0,0 +1,150 @@ +From fefa2d689f96302e64ad2375695703039e2ca951 Mon Sep 17 00:00:00 2001 +From: Shengwei Luo +Date: Wed, 23 Feb 2022 17:23:27 +0800 +Subject: [PATCH 02/10] rasdaemon: Support cpu fault isolation for recoverable + errors + +When the recoverable errors in cpu core occurred, try to offline +the related cpu core. + +Signed-off-by: Shengwei Luo +Signed-off-by: Junchong Pan +Signed-off-by: Lei Feng +Signed-off-by: Shiju Jose +--- + ras-arm-handler.c | 22 +++++++++++++++++++--- + ras-cpu-isolation.c | 17 +++++++++++++++++ + ras-cpu-isolation.h | 4 +++- + 3 files changed, 39 insertions(+), 4 deletions(-) + +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index 9c7a3c3..a0dfc51 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -26,6 +26,7 @@ + + #define ARM_ERR_VALID_ERROR_COUNT BIT(0) + #define ARM_ERR_VALID_FLAGS BIT(1) ++#define BIT2 2 + + void display_raw_data(struct trace_seq *s, + const uint8_t *buf, +@@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s, + } + + #ifdef HAVE_CPU_FAULT_ISOLATION +-static int count_errors(struct ras_arm_event *ev) ++static int is_core_failure(struct ras_arm_err_info *err_info) ++{ ++ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) { ++ /* ++ * core failure: ++ * Bit 0\1\3: (at lease 1) ++ * Bit 2: 0 ++ */ ++ return (err_info->flags & 0xf) && 
!(err_info->flags & (0x1 << BIT2)); ++ } ++ return 0; ++} ++ ++static int count_errors(struct ras_arm_event *ev, int sev) + { + struct ras_arm_err_info *err_info; + int num_pei; +@@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev) + */ + error_count = err_info->multiple_error + 1; + } ++ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info)) ++ error_count = 0; + + num += error_count; + err_info += 1; +@@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s, + } + trace_seq_printf(s, "\n severity: %s", severity); + +- if (val == GHES_SEV_CORRECTED) { +- int nums = count_errors(ev); ++ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { ++ int nums = count_errors(ev, val); + + if (nums > 0) { + err_info.nums = nums; +diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c +index abcf451..fd23e4e 100644 +--- a/ras-cpu-isolation.c ++++ b/ras-cpu-isolation.c +@@ -126,6 +126,7 @@ static int init_cpu_info(unsigned int cpus) + + for (unsigned int i = 0; i < cpus; ++i) { + cpu_infos[i].ce_nums = 0; ++ cpu_infos[i].uce_nums = 0; + cpu_infos[i].state = get_cpu_status(i); + cpu_infos[i].ce_queue = init_queue(); + +@@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu) + return HANDLE_NOTHING; + } + ++static int do_uce_handler(unsigned int cpu) ++{ ++ if (cpu_infos[cpu].uce_nums > 0) { ++ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu); ++ return do_cpu_offline(cpu); ++ } ++ return HANDLE_NOTHING; ++} ++ + static int error_handler(unsigned int cpu, struct error_info *err_info) + { + int ret = HANDLE_NOTHING; +@@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) + case CE: + ret = do_ce_handler(cpu); + break; ++ case UCE: ++ ret = do_uce_handler(cpu); ++ break; + default: + break; + } +@@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) + cpu_infos[cpu].ce_nums += err_info->nums; + break; + } ++ case UCE: 
++ cpu_infos[cpu].uce_nums++; ++ break; + default: + break; + } +@@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) + cpu, cpu_state[cpu_infos[cpu].state]); + clear_queue(cpu_infos[cpu].ce_queue); + cpu_infos[cpu].ce_nums = 0; ++ cpu_infos[cpu].uce_nums = 0; + } else + log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", + cpu, cpu_state[cpu_infos[cpu].state]); +diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h +index 1159853..024a68b 100644 +--- a/ras-cpu-isolation.h ++++ b/ras-cpu-isolation.h +@@ -46,10 +46,12 @@ enum error_handle_result { + }; + + enum error_type { +- CE = 1 ++ CE = 1, ++ UCE + }; + + struct cpu_info { ++ unsigned long uce_nums; + unsigned long ce_nums; + struct link_queue *ce_queue; + enum cpu_state state; +-- +2.25.1 + diff --git a/0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch b/0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch new file mode 100644 index 0000000000000000000000000000000000000000..89ecf80b7fdbf92329711d56540a72d2f6423046 --- /dev/null +++ b/0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch @@ -0,0 +1,114 @@ +From ad9c1bc8ea907d6faebfb916916b5f898a8e0518 Mon Sep 17 00:00:00 2001 +From: Xiaofei Tan +Date: Tue, 30 May 2023 11:44:12 +0100 +Subject: [PATCH 2/3] rasdaemon: fix issue of signed and unsigned integer + comparison and remove redundant header file + +1. The return value of ARRAY_SIZE() is unsigned integer. It isn't right to +compare it with a signed integer. This patch fix them. + +2. Remove redundant header file and adjust the header files sequence. 
+ +Signed-off-by: Xiaofei Tan +Signed-off-by: Shiju Jose +--- + non-standard-hisi_hip08.c | 2 +- + non-standard-hisilicon.c | 8 ++++---- + ras-diskerror-handler.c | 2 +- + ras-memory-failure-handler.c | 7 +++---- + 4 files changed, 9 insertions(+), 10 deletions(-) + +diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c +index 4ef47ea..61f12eb 100644 +--- a/non-standard-hisi_hip08.c ++++ b/non-standard-hisi_hip08.c +@@ -1029,7 +1029,7 @@ static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { + + static void __attribute__((constructor)) hip08_init(void) + { +- int i; ++ unsigned int i; + + for (i = 0; i < ARRAY_SIZE(hip08_ns_ev_decoder); i++) + register_ns_ev_decoder(&hip08_ns_ev_decoder[i]); +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 6ee9271..0d5fe6b 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -362,13 +362,13 @@ static int decode_hisi_common_section(struct ras_events *ras, + trace_seq_printf(s, "%s\n", hevent.error_msg); + + if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) { +- int i; ++ unsigned int i; + + trace_seq_printf(s, "Register Dump:\n"); + for (i = 0; i < err->reg_array_size / sizeof(uint32_t); i++) { +- trace_seq_printf(s, "reg%02d=0x%08x\n", i, ++ trace_seq_printf(s, "reg%02u=0x%08x\n", i, + err->reg_array[i]); +- HISI_SNPRINTF(hevent.reg_msg, "reg%02d=0x%08x", ++ HISI_SNPRINTF(hevent.reg_msg, "reg%02u=0x%08x", + i, err->reg_array[i]); + } + } +@@ -394,7 +394,7 @@ static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { + + static void __attribute__((constructor)) hisi_ns_init(void) + { +- int i; ++ unsigned int i; + + for (i = 0; i < ARRAY_SIZE(hisi_section_ns_ev_decoder); i++) + register_ns_ev_decoder(&hisi_section_ns_ev_decoder[i]); +diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c +index b16319f..b46f859 100644 +--- a/ras-diskerror-handler.c ++++ b/ras-diskerror-handler.c +@@ -52,7 +52,7 @@ static const struct { + + 
static const char *get_blk_error(int err) + { +- int i; ++ unsigned int i; + + for (i = 0; i < ARRAY_SIZE(blk_errors); i++) + if (blk_errors[i].error == err) +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 9941e68..8fd7117 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -15,11 +15,10 @@ + #include + #include + #include +-#include "libtrace/kbuffer.h" +-#include "ras-memory-failure-handler.h" + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-report.h" ++#include "ras-memory-failure-handler.h" + + /* Memory failure - various types of pages */ + enum mf_action_page_type { +@@ -99,7 +98,7 @@ static const struct { + + static const char *get_page_type(int page_type) + { +- int i; ++ unsigned int i; + + for (i = 0; i < ARRAY_SIZE(mf_page_type); i++) + if (mf_page_type[i].type == page_type) +@@ -110,7 +109,7 @@ static const char *get_page_type(int page_type) + + static const char *get_action_result(int result) + { +- int i; ++ unsigned int i; + + for (i = 0; i < ARRAY_SIZE(mf_action_result); i++) + if (mf_action_result[i].result == result) +-- +2.25.1 + diff --git a/0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch b/0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch new file mode 100644 index 0000000000000000000000000000000000000000..4efb348cd7f617014a6364868987ff816efafef8 --- /dev/null +++ b/0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch @@ -0,0 +1,325 @@ +From 9fd84965e70b6d245699d36f8ac4f260d87013cb Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 1 Jun 2023 15:34:53 +0100 +Subject: [PATCH 3/3] rasdaemon: Add support for creating the vendor error + tables at startup + +1. Support for create/open the vendor error tables at rasdaemon startup. +2. Make changes in the HiSilicon error handling code for the same. 
+ +Signed-off-by: Shiju Jose +--- + non-standard-hisi_hip08.c | 66 ++++++++++++++++++++++++++------------ + non-standard-hisilicon.c | 28 ++++++++++------ + ras-events.c | 17 +++++++++- + ras-non-standard-handler.c | 35 +++++++++++++++++++- + ras-non-standard-handler.h | 3 ++ + 5 files changed, 116 insertions(+), 33 deletions(-) + +diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c +index 61f12eb..be84c22 100644 +--- a/non-standard-hisi_hip08.c ++++ b/non-standard-hisi_hip08.c +@@ -654,6 +654,20 @@ static void decode_oem_type1_err_regs(struct ras_ns_ev_decoder *ev_decoder, + step_vendor_data_tab(ev_decoder, "hip08_oem_type1_event_tab"); + } + ++static int add_hip08_oem_type1_table(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder) ++{ ++#ifdef HAVE_SQLITE3 ++ if (ras->record_events && !ev_decoder->stmt_dec_record) { ++ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, ++ &hip08_oem_type1_event_tab) != SQLITE_OK) { ++ log(TERM, LOG_WARNING, "Failed to create sql hip08_oem_type1_event_tab\n"); ++ return -1; ++ } ++ } ++#endif ++ return 0; ++} ++ + /* error data decoding functions */ + static int decode_hip08_oem_type1_error(struct ras_events *ras, + struct ras_ns_ev_decoder *ev_decoder, +@@ -669,17 +683,6 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras, + return -1; + } + +-#ifdef HAVE_SQLITE3 +- if (ras->record_events && !ev_decoder->stmt_dec_record) { +- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, +- &hip08_oem_type1_event_tab) +- != SQLITE_OK) { +- trace_seq_printf(s, +- "create sql hip08_oem_type1_event_tab fail\n"); +- return -1; +- } +- } +-#endif + record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, + HIP08_OEM_TYPE1_FIELD_TIMESTAMP, + 0, event->timestamp); +@@ -827,6 +830,20 @@ static void decode_oem_type2_err_regs(struct ras_ns_ev_decoder *ev_decoder, + step_vendor_data_tab(ev_decoder, "hip08_oem_type2_event_tab"); + } + ++static int add_hip08_oem_type2_table(struct ras_events 
*ras, struct ras_ns_ev_decoder *ev_decoder) ++{ ++#ifdef HAVE_SQLITE3 ++ if (ras->record_events && !ev_decoder->stmt_dec_record) { ++ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, ++ &hip08_oem_type2_event_tab) != SQLITE_OK) { ++ log(TERM, LOG_WARNING, "Failed to create sql hip08_oem_type2_event_tab\n"); ++ return -1; ++ } ++ } ++#endif ++ return 0; ++} ++ + static int decode_hip08_oem_type2_error(struct ras_events *ras, + struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, +@@ -977,6 +994,20 @@ static void decode_pcie_local_err_regs(struct ras_ns_ev_decoder *ev_decoder, + step_vendor_data_tab(ev_decoder, "hip08_pcie_local_event_tab"); + } + ++static int add_hip08_pcie_local_table(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder) ++{ ++#ifdef HAVE_SQLITE3 ++ if (ras->record_events && !ev_decoder->stmt_dec_record) { ++ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, ++ &hip08_pcie_local_event_tab) != SQLITE_OK) { ++ log(TERM, LOG_WARNING, "Failed to create sql hip08_pcie_local_event_tab\n"); ++ return -1; ++ } ++ } ++#endif ++ return 0; ++} ++ + static int decode_hip08_pcie_local_error(struct ras_events *ras, + struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, +@@ -991,16 +1022,6 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, + return -1; + } + +-#ifdef HAVE_SQLITE3 +- if (ras->record_events && !ev_decoder->stmt_dec_record) { +- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, +- &hip08_pcie_local_event_tab) != SQLITE_OK) { +- trace_seq_printf(s, +- "create sql hip08_pcie_local_event_tab fail\n"); +- return -1; +- } +- } +-#endif + record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, + HIP08_PCIE_LOCAL_FIELD_TIMESTAMP, + 0, event->timestamp); +@@ -1015,14 +1036,17 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, + static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { + { + .sec_type = "1f8161e1-55d6-41e6-bd10-7afd1dc5f7c5", ++ .add_table = 
add_hip08_oem_type1_table, + .decode = decode_hip08_oem_type1_error, + }, + { + .sec_type = "45534ea6-ce23-4115-8535-e07ab3aef91d", ++ .add_table = add_hip08_oem_type2_table, + .decode = decode_hip08_oem_type2_error, + }, + { + .sec_type = "b2889fc9-e7d7-4f9d-a867-af42e98be772", ++ .add_table = add_hip08_pcie_local_table, + .decode = decode_hip08_pcie_local_error, + }, + }; +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 0d5fe6b..0ddb5ec 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -337,6 +337,23 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, + HISI_SNPRINTF(event->error_msg, "]"); + } + ++static int add_hisi_common_table(struct ras_events *ras, ++ struct ras_ns_ev_decoder *ev_decoder) ++{ ++#ifdef HAVE_SQLITE3 ++ if (ras->record_events && ++ !ev_decoder->stmt_dec_record) { ++ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, ++ &hisi_common_section_tab) != SQLITE_OK) { ++ log(TERM, LOG_WARNING, "Failed to create sql hisi_common_section_tab\n"); ++ return -1; ++ } ++ } ++#endif ++ ++ return 0; ++} ++ + static int decode_hisi_common_section(struct ras_events *ras, + struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, +@@ -346,16 +363,6 @@ static int decode_hisi_common_section(struct ras_events *ras, + (struct hisi_common_error_section *)event->error; + struct hisi_event hevent; + +-#ifdef HAVE_SQLITE3 +- if (ras->record_events && !ev_decoder->stmt_dec_record) { +- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, +- &hisi_common_section_tab) != SQLITE_OK) { +- trace_seq_printf(s, "create sql hisi_common_section_tab fail\n"); +- return -1; +- } +- } +-#endif +- + memset(&hevent, 0, sizeof(struct hisi_event)); + trace_seq_printf(s, "\nHisilicon Common Error Section:\n"); + decode_hisi_common_section_hdr(ev_decoder, err, &hevent); +@@ -388,6 +395,7 @@ static int decode_hisi_common_section(struct ras_events *ras, + static struct 
ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { + { + .sec_type = "c8b328a8-9917-4af6-9a13-2e08ab2e7586", ++ .add_table = add_hisi_common_table, + .decode = decode_hisi_common_section, + }, + }; +diff --git a/ras-events.c b/ras-events.c +index d08bf37..fc54325 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -482,6 +482,10 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + if (pdata[0].ras->record_events) { + if (ras_mc_event_opendb(pdata[0].cpu, pdata[0].ras)) + goto error; ++#ifdef HAVE_NON_STANDARD ++ if (ras_ns_add_vendor_tables(pdata[0].ras)) ++ log(TERM, LOG_ERR, "Can't add vendor table\n"); ++#endif + } + + do { +@@ -566,8 +570,12 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + "Old kernel detected. Stop listening and fall back to pthread way.\n"); + + cleanup: +- if (pdata[0].ras->record_events) ++ if (pdata[0].ras->record_events) { ++#ifdef HAVE_NON_STANDARD ++ ras_ns_finalize_vendor_tables(); ++#endif + ras_mc_event_closedb(pdata[0].cpu, pdata[0].ras); ++ } + + error: + kbuffer_free(kbuf); +@@ -664,6 +672,10 @@ static void *handle_ras_events_cpu(void *priv) + free(page); + return 0; + } ++#ifdef HAVE_NON_STANDARD ++ if (ras_ns_add_vendor_tables(pdata->ras)) ++ log(TERM, LOG_ERR, "Can't add vendor table\n"); ++#endif + pthread_mutex_unlock(&pdata->ras->db_lock); + } + +@@ -671,6 +683,9 @@ static void *handle_ras_events_cpu(void *priv) + + if (pdata->ras->record_events) { + pthread_mutex_lock(&pdata->ras->db_lock); ++#ifdef HAVE_NON_STANDARD ++ ras_ns_finalize_vendor_tables(); ++#endif + ras_mc_event_closedb(pdata->cpu, pdata->ras); + pthread_mutex_unlock(&pdata->ras->db_lock); + } +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +index 6932e58..20d514b 100644 +--- a/ras-non-standard-handler.c ++++ b/ras-non-standard-handler.c +@@ -75,6 +75,32 @@ int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) + return 0; + } + ++int ras_ns_add_vendor_tables(struct ras_events *ras) ++{ ++ 
struct ras_ns_ev_decoder *ns_ev_decoder; ++ int error = 0; ++ ++#ifdef HAVE_SQLITE3 ++ if (!ras) ++ return -1; ++ ++ ns_ev_decoder = ras_ns_ev_dec_list; ++ while (ns_ev_decoder) { ++ if (ns_ev_decoder->add_table && !ns_ev_decoder->stmt_dec_record) { ++ error = ns_ev_decoder->add_table(ras, ns_ev_decoder); ++ if (error) ++ break; ++ } ++ ns_ev_decoder = ns_ev_decoder->next; ++ } ++ ++ if (error) ++ return -1; ++#endif ++ ++ return 0; ++} ++ + static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p_ns_ev_dec) + { + struct ras_ns_ev_decoder *ns_ev_decoder; +@@ -96,7 +122,7 @@ static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p + return 0; + } + +-static void unregister_ns_ev_decoder(void) ++void ras_ns_finalize_vendor_tables(void) + { + #ifdef HAVE_SQLITE3 + struct ras_ns_ev_decoder *ns_ev_decoder = ras_ns_ev_dec_list; +@@ -108,6 +134,13 @@ static void unregister_ns_ev_decoder(void) + } + ns_ev_decoder = ns_ev_decoder->next; + } ++#endif ++} ++ ++static void unregister_ns_ev_decoder(void) ++{ ++#ifdef HAVE_SQLITE3 ++ ras_ns_finalize_vendor_tables(); + #endif + ras_ns_ev_dec_list = NULL; + } +diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h +index 57d4cb5..341206a 100644 +--- a/ras-non-standard-handler.h ++++ b/ras-non-standard-handler.h +@@ -23,6 +23,7 @@ + struct ras_ns_ev_decoder { + struct ras_ns_ev_decoder *next; + const char *sec_type; ++ int (*add_table)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder); + int (*decode)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, struct ras_non_standard_event *event); + #ifdef HAVE_SQLITE3 +@@ -39,6 +40,8 @@ void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index); + + #ifdef HAVE_NON_STANDARD + int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder); ++int ras_ns_add_vendor_tables(struct ras_events *ras); ++void ras_ns_finalize_vendor_tables(void); + #else + static inline int 
register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; }; + #endif +-- +2.25.1 + diff --git a/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch b/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch new file mode 100644 index 0000000000000000000000000000000000000000..c51e35a16f2335b969b642c59fdf7165eaf987f2 --- /dev/null +++ b/0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch @@ -0,0 +1,228 @@ +From 9c4665f33c39ea84db7d69079ab27205d2fbd07e Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Wed, 2 Mar 2022 12:20:40 +0000 +Subject: [PATCH 03/10] rasdaemon: Modify recording Hisilicon common error data + +The error statistics for the Hisilicon common +error need to do based on module, error severity etc. + +Modify recording Hisilicon common error data as separate fields +in the sql db table instead of the combined single field. + +Signed-off-by: Shiju Jose +--- + non-standard-hisilicon.c | 126 ++++++++++++++++++++++++++++++++------- + 1 file changed, 104 insertions(+), 22 deletions(-) + +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 1432163..d1e1774 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -17,6 +17,7 @@ + #include "non-standard-hisilicon.h" + + #define HISI_BUF_LEN 2048 ++#define HISI_PCIE_INFO_BUF_LEN 256 + + struct hisi_common_error_section { + uint32_t val_bits; +@@ -63,12 +64,25 @@ enum { + enum { + HISI_COMMON_FIELD_ID, + HISI_COMMON_FIELD_TIMESTAMP, +- HISI_COMMON_FIELD_ERR_INFO, ++ HISI_COMMON_FIELD_VERSION, ++ HISI_COMMON_FIELD_SOC_ID, ++ HISI_COMMON_FIELD_SOCKET_ID, ++ HISI_COMMON_FIELD_TOTEM_ID, ++ HISI_COMMON_FIELD_NIMBUS_ID, ++ HISI_COMMON_FIELD_SUB_SYSTEM_ID, ++ HISI_COMMON_FIELD_MODULE_ID, ++ HISI_COMMON_FIELD_SUB_MODULE_ID, ++ HISI_COMMON_FIELD_CORE_ID, ++ HISI_COMMON_FIELD_PORT_ID, ++ HISI_COMMON_FIELD_ERR_TYPE, ++ HISI_COMMON_FIELD_PCIE_INFO, ++ HISI_COMMON_FIELD_ERR_SEVERITY, + HISI_COMMON_FIELD_REGS_DUMP, + }; + + struct hisi_event 
{ + char error_msg[HISI_BUF_LEN]; ++ char pcie_info[HISI_PCIE_INFO_BUF_LEN]; + char reg_msg[HISI_BUF_LEN]; + }; + +@@ -132,14 +146,26 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) + + #ifdef HAVE_SQLITE3 + static const struct db_fields hisi_common_section_fields[] = { +- { .name = "id", .type = "INTEGER PRIMARY KEY" }, +- { .name = "timestamp", .type = "TEXT" }, +- { .name = "err_info", .type = "TEXT" }, ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "version", .type = "INTEGER" }, ++ { .name = "soc_id", .type = "INTEGER" }, ++ { .name = "socket_id", .type = "INTEGER" }, ++ { .name = "totem_id", .type = "INTEGER" }, ++ { .name = "nimbus_id", .type = "INTEGER" }, ++ { .name = "sub_system_id", .type = "INTEGER" }, ++ { .name = "module_id", .type = "TEXT" }, ++ { .name = "sub_module_id", .type = "INTEGER" }, ++ { .name = "core_id", .type = "INTEGER" }, ++ { .name = "port_id", .type = "INTEGER" }, ++ { .name = "err_type", .type = "INTEGER" }, ++ { .name = "pcie_info", .type = "TEXT" }, ++ { .name = "err_severity", .type = "TEXT" }, + { .name = "regs_dump", .type = "TEXT" }, + }; + + static const struct db_table_descriptor hisi_common_section_tab = { +- .name = "hisi_common_section", ++ .name = "hisi_common_section_v2", + .fields = hisi_common_section_fields, + .num_fields = ARRAY_SIZE(hisi_common_section_fields), + }; +@@ -199,12 +225,20 @@ static const char* get_soc_desc(uint8_t soc_id) + return soc_desc[soc_id]; + } + +-static void decode_module(struct hisi_event *event, uint8_t module_id) ++static void decode_module(struct ras_ns_ev_decoder *ev_decoder, ++ struct hisi_event *event, uint8_t module_id) + { +- if (module_id >= sizeof(module_name)/sizeof(char *)) ++ if (module_id >= sizeof(module_name)/sizeof(char *)) { + HISI_SNPRINTF(event->error_msg, "module=unknown(id=%hhu) ", module_id); +- else ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, ++ 
HISI_COMMON_FIELD_MODULE_ID, ++ 0, "unknown"); ++ } else { + HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, ++ HISI_COMMON_FIELD_MODULE_ID, ++ 0, module_name[module_id]); ++ } + } + + static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, +@@ -212,43 +246,93 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, + struct hisi_event *event) + { + HISI_SNPRINTF(event->error_msg, "[ table_version=%hhu", err->version); +- if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_VERSION, ++ err->version, NULL); ++ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) { + HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id)); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_SOC_ID, ++ err->soc_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) { + HISI_SNPRINTF(event->error_msg, "socket_id=%hhu", err->socket_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_SOCKET_ID, ++ err->socket_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) { + HISI_SNPRINTF(event->error_msg, "totem_id=%hhu", err->totem_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_TOTEM_ID, ++ err->totem_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) { + HISI_SNPRINTF(event->error_msg, "nimbus_id=%hhu", err->nimbus_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_NIMBUS_ID, ++ err->nimbus_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) ++ if (err->val_bits & 
BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) { + HISI_SNPRINTF(event->error_msg, "subsystem_id=%hhu", err->subsystem_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_SUB_SYSTEM_ID, ++ err->subsystem_id, NULL); ++ } + + if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID)) +- decode_module(event, err->module_id); ++ decode_module(ev_decoder, event, err->module_id); + +- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) { + HISI_SNPRINTF(event->error_msg, "submodule_id=%hhu", err->submodule_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_SUB_MODULE_ID, ++ err->submodule_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) { + HISI_SNPRINTF(event->error_msg, "core_id=%hhu", err->core_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_CORE_ID, ++ err->core_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) { + HISI_SNPRINTF(event->error_msg, "port_id=%hhu", err->port_id); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_PORT_ID, ++ err->port_id, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) { + HISI_SNPRINTF(event->error_msg, "err_type=%hu", err->err_type); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, ++ HISI_COMMON_FIELD_ERR_TYPE, ++ err->err_type, NULL); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) { + HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x", + err->pcie_info.segment, err->pcie_info.bus, + err->pcie_info.device, err->pcie_info.function); ++ HISI_SNPRINTF(event->pcie_info, "%04x:%02x:%02x.%x", ++ err->pcie_info.segment, 
err->pcie_info.bus, ++ err->pcie_info.device, err->pcie_info.function); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, ++ HISI_COMMON_FIELD_PCIE_INFO, ++ 0, event->pcie_info); ++ } + +- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) ++ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) { + HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity)); ++ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, ++ HISI_COMMON_FIELD_ERR_SEVERITY, ++ 0, err_severity(err->err_severity)); ++ } + + HISI_SNPRINTF(event->error_msg, "]"); + } +@@ -293,8 +377,6 @@ static int decode_hisi_common_section(struct ras_events *ras, + record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, + HISI_COMMON_FIELD_TIMESTAMP, + 0, event->timestamp); +- record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, +- HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg); + record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, + HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg); + step_vendor_data_tab(ev_decoder, "hisi_common_section_tab"); +-- +2.25.1 + diff --git a/0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch b/0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch new file mode 100644 index 0000000000000000000000000000000000000000..e1c86b41ff383c4bbe5950717aecae73b64dbd35 --- /dev/null +++ b/0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch @@ -0,0 +1,35 @@ +From c46f65e1315aab8585e24d24223bd56c8931202a Mon Sep 17 00:00:00 2001 +From: Xiaofei Tan +Date: Mon, 31 Oct 2022 18:36:26 +0800 +Subject: [PATCH 4/4] rasdaemon: Add four modules supported by HiSilicon common + section + +Add four modules supported by HiSilicon common error section. 
+ +Signed-off-by: Xiaofei Tan +Signed-off-by: Shiju Jose +Signed-off-by: Mauro Carvalho Chehab +--- + non-standard-hisilicon.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index 0ddb5ec..7296d28 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -214,7 +214,11 @@ static const char* module_name[] = { + "Tsensor", + "ROH", + "BTC", +- "HILINK" ++ "HILINK", ++ "STARS", ++ "SDMA", ++ "UC", ++ "HBMC", + }; + + static const char* get_soc_desc(uint8_t soc_id) +-- +2.25.1 + diff --git a/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch b/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch new file mode 100644 index 0000000000000000000000000000000000000000..8963d91b230e9059c02469875b4548a9541ec085 --- /dev/null +++ b/0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch @@ -0,0 +1,97 @@ +From 4f706ff3b1a04de3be506a309e153b99e04b3445 Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 24 Feb 2022 18:02:14 +0000 +Subject: [PATCH 04/10] rasdaemon: ras-mc-ctl: Modify error statistics for + HiSilicon KunPeng9xx common errors + +Modify the error statistics for the HiSilicon KunPeng9xx platforms common errors +to display the statistics and error info based on the module and the error severity. 
+ +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 40 +++++++++++++++++++++++++++++----------- + 1 file changed, 29 insertions(+), 11 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index b22dd60..08eb287 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1537,7 +1537,7 @@ sub vendor_errors_summary + require DBI; + my ($num_args, $platform_id); + my ($query, $query_handle, $count, $out); +- my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info); ++ my ($module_id, $sub_module_id, $err_severity, $err_sev); + + $num_args = $#ARGV + 1; + $platform_id = 0; +@@ -1614,13 +1614,18 @@ sub vendor_errors_summary + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $query = "select err_info, count(*) from hisi_common_section"; ++ $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +- $query_handle->bind_columns(\($err_info, $count)); ++ $query_handle->bind_columns(\($err_severity, $module_id, $count)); + $out = ""; ++ $err_sev = ""; + while($query_handle->fetch()) { +- $out .= "\terrors: $count\n"; ++ if ($err_severity ne $err_sev) { ++ $out .= "$err_severity errors:\n"; ++ $err_sev = $err_severity; ++ } ++ $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; +@@ -1638,8 +1643,8 @@ sub vendor_errors + require DBI; + my ($num_args, $platform_id); + my ($query, $query_handle, $id, $timestamp, $out); +- my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id); +- my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs); ++ my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); ++ my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); + + $num_args = $#ARGV + 1; + $platform_id = 0; +@@ -1727,15 
+1732,28 @@ sub vendor_errors + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $query = "select id, timestamp, err_info, regs_dump from hisi_common_section order by id"; ++ $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +- $query_handle->bind_columns(\($id, $timestamp, $err_info, $regs)); ++ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. $timestamp "; +- $out .= "Error Info:$err_info \n" if ($err_info); +- $out .= "Error Registers: $regs\n\n" if ($regs); ++ $out .= "$id. 
$timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "totem_id=$totem_id, " if ($totem_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); ++ $out .= "module_id=$module_id, " if ($module_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "core_id=$core_id, " if ($core_id); ++ $out .= "port_id=$port_id, " if ($port_id); ++ $out .= "err_type=$err_type, " if ($err_type); ++ $out .= "pcie_info=$pcie_info, " if ($pcie_info); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs" if ($regs); ++ $out .= "\n\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error events:\n$out\n"; +-- +2.25.1 + diff --git a/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch b/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch new file mode 100644 index 0000000000000000000000000000000000000000..2ff9537371c838036014fb04bfc7900ace448181 --- /dev/null +++ b/0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch @@ -0,0 +1,56 @@ +From f5c3c03039be28bb6b5bbe00e12e9586b19a1060 Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Sat, 5 Mar 2022 16:18:55 +0000 +Subject: [PATCH 05/10] rasdaemon: ras-mc-ctl: Reformat error info of the + HiSilicon Kunpeng920 + +Reformat the code to display the error info of HiSilicon Kunpeng920. 
+ +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 08eb287..8755b6f 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1671,8 +1671,9 @@ sub vendor_errors + $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); + $out .= "module_id=$module_id, " if ($module_id); + $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, \n" if ($err_severity); +- $out .= "Error Registers: $regs\n\n" if ($regs); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; +@@ -1694,8 +1695,9 @@ sub vendor_errors + $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); + $out .= "module_id=$module_id, " if ($module_id); + $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, \n" if ($err_severity); +- $out .= "Error Registers: $regs\n\n" if ($regs); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; +@@ -1719,8 +1721,9 @@ sub vendor_errors + $out .= "core_id=$core_id, " if ($core_id); + $out .= "port_id=$port_id, " if ($port_id); + $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "err_type=$err_type, \n" if ($err_type); +- $out .= "Error Registers: $regs\n\n" if ($regs); ++ $out .= "err_type=$err_type, " if ($err_type); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; + } + if ($out ne "") { + print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; +-- +2.25.1 + diff --git a/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch 
b/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch new file mode 100644 index 0000000000000000000000000000000000000000..1ff38e399c290adff15b3c29a499cbcf76d16bf7 --- /dev/null +++ b/0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch @@ -0,0 +1,37 @@ +From d595a9d61f9d8341a5e30d4d800e3237d6e0f390 Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Sat, 5 Mar 2022 17:01:35 +0000 +Subject: [PATCH 06/10] rasdaemon: ras-mc-ctl: Add printing usage if necessary + parameters are not passed for the vendor-error options + +Add printing usage if necessary parameters are not passed +for the vendor-errors options. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 8755b6f..959ea6b 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1544,6 +1544,7 @@ sub vendor_errors_summary + if ($num_args ne 0) { + $platform_id = $ARGV[0]; + } else { ++ usage(1); + return; + } + +@@ -1651,6 +1652,7 @@ sub vendor_errors + if ($num_args ne 0) { + $platform_id = $ARGV[0]; + } else { ++ usage(1); + return; + } + +-- +2.25.1 + diff --git a/0007-add-trace-print-and-add-sqlite-store.patch b/0007-add-trace-print-and-add-sqlite-store.patch deleted file mode 100644 index 08361e6cdacc20a9b5bccc5fc251e7014763b7f0..0000000000000000000000000000000000000000 --- a/0007-add-trace-print-and-add-sqlite-store.patch +++ /dev/null @@ -1,78 +0,0 @@ -From 57640072aead2e00037749d66f05fc26e3fe3071 Mon Sep 17 00:00:00 2001 -From: Lostwayzxc -Date: Tue, 25 May 2021 20:07:26 +0800 -Subject: [PATCH 2/2] add trace print of new information and add it to sqilte - -Since we add new information of the event, we add trace print and store it to -Sqlite. 
- -Signed-off-by: Luo Shengwei ---- - ras-arm-handler.c | 10 ++++++++++ - ras-record.c | 8 ++++++++ - 2 files changed, 18 insertions(+) - -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 10d0099..23ad470 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -23,6 +23,13 @@ - #include "ras-cpu-isolation.h" - - #ifdef HAVE_CPU_FAULT_ISOLATION -+static void trace_print_hex(struct trace_seq *s, const uint8_t *buf, int buf_len) -+{ -+ for (int i = 0; i < buf_len; ++i) { -+ trace_seq_printf(s, "%2.2x", buf[i]); -+ } -+} -+ - static int is_core_failure(unsigned long value) - { - /* -@@ -135,6 +142,7 @@ int ras_arm_event_handler(struct trace_seq *s, - case GHES_SEV_PANIC: - ev.severity = "Fatal"; - } -+ trace_seq_printf(s, "\n severity: %s", ev.severity); - - if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { - int len, nums; -@@ -142,6 +150,8 @@ int ras_arm_event_handler(struct trace_seq *s, - if (!ev.error_info) - return -1; - ev.length = len; -+ trace_seq_printf(s, "\n processor_err_info: "); -+ trace_print_hex(s, ev.error_info, len); - /* relate to enum error_type */ - nums = count_errors(event, ev.error_info, len); - if (nums > 0) { -diff --git a/ras-record.c b/ras-record.c -index 549c494..33d4741 100644 ---- a/ras-record.c -+++ b/ras-record.c -@@ -210,6 +210,10 @@ static const struct db_fields arm_event_fields[] = { - { .name="mpidr", .type="INTEGER" }, - { .name="running_state", .type="INTEGER" }, - { .name="psci_state", .type="INTEGER" }, -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ { .name="severity", .type="TEXT" }, -+ { .name="error_info", .type="BLOB" }, -+#endif - }; - - static const struct db_table_descriptor arm_event_tab = { -@@ -233,6 +237,10 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) - sqlite3_bind_int64 (priv->stmt_arm_record, 4, ev->mpidr); - sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state); - sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state); -+#ifdef 
HAVE_CPU_FAULT_ISOLATION -+ sqlite3_bind_text (priv->stmt_arm_record, 7, ev->severity, -1, NULL); -+ sqlite3_bind_blob (priv->stmt_arm_record, 8, ev->error_info, ev->length, NULL); -+#endif - - rc = sqlite3_step(priv->stmt_arm_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) --- -2.27.0 - diff --git a/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch b/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch new file mode 100644 index 0000000000000000000000000000000000000000..6af2ad06985067634973a73ff66663a0619489a7 --- /dev/null +++ b/0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch @@ -0,0 +1,274 @@ +From 0643011831e5fb4e81edff16ad55f9a5196ec7a9 Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Sat, 5 Mar 2022 18:19:38 +0000 +Subject: [PATCH 07/10] rasdaemon: ras-mc-ctl: Add support to display the + HiSilicon vendor errors for a specified module + +Add support to display the HiSilicon vendor errors for a specified module. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 145 +++++++++++++++++++++++++++------------------ + 1 file changed, 87 insertions(+), 58 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 959ea6b..296eb87 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -96,8 +96,9 @@ Usage: $prog [OPTIONS...] + --errors Shows the errors stored at the error database. + --error-count Shows the corrected and uncorrected error counts using sysfs. + --vendor-errors-summary Presents a summary of the vendor-specific logged errors. +- --vendor-errors Shows the vendor-specific errors stored in the error database. +- --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors. ++ --vendor-errors Shows the vendor-specific errors stored in the error database. ++ --vendor-errors Shows the vendor-specific errors for a specific module stored in the error database. 
++ --vendor-platforms List the supported platforms with platform-ids for the vendor-specific errors. + --help This help message. + EOF + +@@ -1535,12 +1536,14 @@ use constant { + sub vendor_errors_summary + { + require DBI; +- my ($num_args, $platform_id); ++ my ($num_args, $platform_id, $found_platform); + my ($query, $query_handle, $count, $out); + my ($module_id, $sub_module_id, $err_severity, $err_sev); + + $num_args = $#ARGV + 1; + $platform_id = 0; ++ $found_platform = 0; ++ + if ($num_args ne 0) { + $platform_id = $ARGV[0]; + } else { +@@ -1552,6 +1555,7 @@ sub vendor_errors_summary + + # HiSilicon Kunpeng920 errors + if ($platform_id eq HISILICON_KUNPENG_920) { ++ $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1615,6 +1619,7 @@ sub vendor_errors_summary + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { ++ $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1636,21 +1641,31 @@ sub vendor_errors_summary + $query_handle->finish; + } + ++ if ($platform_id && !($found_platform)) { ++ print "Platform ID $platform_id is not valid\n"; ++ } ++ + undef($dbh); + } + + sub vendor_errors + { + require DBI; +- my ($num_args, $platform_id); ++ my ($num_args, $platform_id, $found_platform, $module, $found_module); + my ($query, $query_handle, $id, $timestamp, $out); + my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); + my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); + + $num_args = $#ARGV + 1; + $platform_id = 0; ++ $found_platform = 0; ++ $module = 0; ++ $found_module = 0; + if ($num_args ne 0) { + $platform_id = $ARGV[0]; ++ if ($num_args gt 
1) { ++ $module = $ARGV[1]; ++ } + } else { + usage(1); + return; +@@ -1660,27 +1675,29 @@ sub vendor_errors + + # HiSilicon Kunpeng920 errors + if ($platform_id eq HISILICON_KUNPENG_920) { ++ $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); + $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. $timestamp Error Info: "; +- $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "module_id=$module_id, " if ($module_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "Error Registers: $regs " if ($regs); +- $out .= "\n\n"; ++ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { ++ $out .= "$id. 
$timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "module_id=$module_id, " if ($module_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; ++ $found_module = 1; ++ } + } + if ($out ne "") { + print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 OEM type1 errors.\n"; + } + $query_handle->finish; + +@@ -1690,21 +1707,22 @@ sub vendor_errors + $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. $timestamp Error Info: "; +- $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "module_id=$module_id, " if ($module_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "Error Registers: $regs " if ($regs); +- $out .= "\n\n"; ++ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { ++ $out .= "$id. 
$timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "module_id=$module_id, " if ($module_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; ++ $found_module = 1; ++ } + } + if ($out ne "") { + print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 OEM type2 errors.\n"; + } + $query_handle->finish; + +@@ -1714,51 +1732,56 @@ sub vendor_errors + $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. $timestamp Error Info: "; +- $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "core_id=$core_id, " if ($core_id); +- $out .= "port_id=$port_id, " if ($port_id); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "err_type=$err_type, " if ($err_type); +- $out .= "Error Registers: $regs " if ($regs); +- $out .= "\n\n"; ++ if ($module eq 0 || ($sub_module_id && uc($module) eq uc($sub_module_id))) { ++ $out .= "$id. 
$timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "core_id=$core_id, " if ($core_id); ++ $out .= "port_id=$port_id, " if ($port_id); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "err_type=$err_type, " if ($err_type); ++ $out .= "Error Registers: $regs " if ($regs); ++ $out .= "\n\n"; ++ $found_module = 1; ++ } + } + if ($out ne "") { + print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 PCIe controller errors.\n"; + } + $query_handle->finish; + } + + # HiSilicon Kunpeng9xx common errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { ++ $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); + $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); + $out = ""; + while($query_handle->fetch()) { +- $out .= "$id. 
$timestamp Error Info: "; +- $out .= "version=$version, "; +- $out .= "soc_id=$soc_id, " if ($soc_id); +- $out .= "socket_id=$socket_id, " if ($socket_id); +- $out .= "totem_id=$totem_id, " if ($totem_id); +- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); +- $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); +- $out .= "module_id=$module_id, " if ($module_id); +- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); +- $out .= "core_id=$core_id, " if ($core_id); +- $out .= "port_id=$port_id, " if ($port_id); +- $out .= "err_type=$err_type, " if ($err_type); +- $out .= "pcie_info=$pcie_info, " if ($pcie_info); +- $out .= "err_severity=$err_severity, " if ($err_severity); +- $out .= "Error Registers: $regs" if ($regs); +- $out .= "\n\n"; ++ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { ++ $out .= "$id. $timestamp Error Info: "; ++ $out .= "version=$version, "; ++ $out .= "soc_id=$soc_id, " if ($soc_id); ++ $out .= "socket_id=$socket_id, " if ($socket_id); ++ $out .= "totem_id=$totem_id, " if ($totem_id); ++ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); ++ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); ++ $out .= "module_id=$module_id, " if ($module_id); ++ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); ++ $out .= "core_id=$core_id, " if ($core_id); ++ $out .= "port_id=$port_id, " if ($port_id); ++ $out .= "err_type=$err_type, " if ($err_type); ++ $out .= "pcie_info=$pcie_info, " if ($pcie_info); ++ $out .= "err_severity=$err_severity, " if ($err_severity); ++ $out .= "Error Registers: $regs" if ($regs); ++ $out .= "\n\n"; ++ $found_module = 1; ++ } + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error events:\n$out\n"; +@@ -1768,6 +1791,12 @@ sub vendor_errors + $query_handle->finish; + } + ++ if ($platform_id && !($found_platform)) { ++ print "Platform ID $platform_id is not valid\n"; ++ } elsif ($module && !($found_module)) { ++ print "No error record for the module 
$module\n"; ++ } ++ + undef($dbh); + } + +-- +2.25.1 + diff --git a/0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch b/0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch deleted file mode 100644 index 38ef9ac9a09d3e268e744da4570884789ebd48fb..0000000000000000000000000000000000000000 --- a/0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 6b767a2fce615384f062ecb392cd332452bf4482 Mon Sep 17 00:00:00 2001 -From: Lostwayzxc -Date: Wed, 1 Sep 2021 21:00:16 +0800 -Subject: [PATCH] modify cpu parse for adapting to new bios version - ---- - ras-cpu-isolation.c | 20 ++++++++++++++++++-- - 1 file changed, 18 insertions(+), 2 deletions(-) - -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index 6dcff70..b1643c4 100644 ---- a/ras-cpu-isolation.c -+++ b/ras-cpu-isolation.c -@@ -25,6 +25,7 @@ - - static struct cpu_info *cpu_infos = NULL; - static unsigned int ncores, cores_per_socket, cores_per_die; -+static unsigned int cores_per_cluster = 4; - static unsigned int sockets, dies = 1; - static unsigned int enabled = 1; - static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; -@@ -432,18 +433,33 @@ static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size - - static unsigned get_cpu_index(int64_t mpidr) - { -- unsigned core_id, socket_id, die_id, cpu; -+ unsigned core_id, cluster_id, socket_id, die_id, cpu; - /* - * Adapt to certain BIOS - * In the MPIDR: - * bit 8:15: core id -+ * bit 16:18: cluster id - * bit 19:20: die_id - * bit 21:22: socket_id - */ - core_id = get_bit_value(mpidr, 8, 8); -+ cluster_id = get_bit_value(mpidr, 16, 3); - socket_id = get_bit_value(mpidr, 21, 2); - die_id = get_bit_value(mpidr, 19, 2); -- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die; -+ -+ /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3, -+ * it means TotemB. 
When cores per die equal to cores per socket, it means -+ * that there is only one die in the socket, in case that the only die is -+ * TotemB in CPU 1620s, we set die id to 0 directly. -+ */ -+ if (cores_per_die == cores_per_socket) { -+ die_id = 0; -+ } -+ else { -+ die_id = (die_id == 1 ? 0:1); -+ } -+ cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die + -+ cluster_id * cores_per_cluster; - - return cpu; - } --- -2.27.0 - diff --git a/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch b/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch new file mode 100644 index 0000000000000000000000000000000000000000..0453e046cc97e34d6e0bff78d16b06fff2c21402 --- /dev/null +++ b/0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch @@ -0,0 +1,150 @@ +From 2f23b5dc6e5831c8ef2e179bb936e13502f75041 Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Mon, 7 Mar 2022 12:38:45 +0000 +Subject: [PATCH 08/10] rasdaemon: ras-mc-ctl: Relocate reading and display + Kunpeng920 errors to under Kunpeng9xx + +Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 40 ++++++++++------------------------------ + 1 file changed, 10 insertions(+), 30 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 296eb87..75981a0 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1529,7 +1529,6 @@ sub errors + + # Definitions of the vendor platform IDs. 
+ use constant { +- HISILICON_KUNPENG_920 => "Kunpeng920", + HISILICON_KUNPENG_9XX => "Kunpeng9xx", + }; + +@@ -1553,8 +1552,8 @@ sub vendor_errors_summary + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng920 errors +- if ($platform_id eq HISILICON_KUNPENG_920) { ++ # HiSilicon Kunpeng9xx errors ++ if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); +@@ -1570,9 +1569,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 OEM type1 error events summary:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 OEM type1 errors.\n\n"; ++ print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1590,9 +1587,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 OEM type2 error events summary:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 OEM type2 errors.\n\n"; ++ print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1610,16 +1605,10 @@ sub vendor_errors_summary + $out .= "\t$sub_module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 PCIe controller error events summary:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng920 PCIe controller errors.\n\n"; ++ print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n"; + } + $query_handle->finish; +- } + +- # HiSilicon Kunpeng9xx common errors +- if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1635,8 +1624,6 @@ sub 
vendor_errors_summary + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng9xx common errors.\n\n"; + } + $query_handle->finish; + } +@@ -1673,8 +1660,8 @@ sub vendor_errors + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng920 errors +- if ($platform_id eq HISILICON_KUNPENG_920) { ++ # HiSilicon Kunpeng9xx errors ++ if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); +@@ -1697,7 +1684,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; ++ print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1722,7 +1709,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; ++ print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1749,14 +1736,10 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; ++ print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n"; + } + $query_handle->finish; +- } + +- # HiSilicon Kunpeng9xx common errors +- if ($platform_id eq HISILICON_KUNPENG_9XX) { +- $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +@@ -1785,8 +1768,6 @@ sub vendor_errors + } + if ($out ne "") { + print "HiSilicon Kunpeng9xx common error 
events:\n$out\n"; +- } else { +- print "No HiSilicon Kunpeng9xx common errors.\n"; + } + $query_handle->finish; + } +@@ -1803,7 +1784,6 @@ sub vendor_errors + sub vendor_platforms + { + print "\nSupported platforms for the vendor-specific errors:\n"; +- print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n"; + print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; + print "\n"; + } +-- +2.25.1 + diff --git a/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch b/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch new file mode 100644 index 0000000000000000000000000000000000000000..e34f89f21db44556fb17a663c653c37b7ad4e271 --- /dev/null +++ b/0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch @@ -0,0 +1,127 @@ +From df6011fed2bb45989f9e5c2ea30b33937b08d06c Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 28 Apr 2022 18:58:43 +0100 +Subject: [PATCH 09/10] rasdaemon: ras-mc-ctl: Updated HiSilicon platform name + +Updated the HiSilicon platform name as KunPeng9xx. + +Signed-off-by: Shiju Jose +--- + util/ras-mc-ctl.in | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 75981a0..1cc19b3 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1529,7 +1529,7 @@ sub errors + + # Definitions of the vendor platform IDs. 
+ use constant { +- HISILICON_KUNPENG_9XX => "Kunpeng9xx", ++ HISILICON_KUNPENG_9XX => "KunPeng9xx", + }; + + sub vendor_errors_summary +@@ -1552,7 +1552,7 @@ sub vendor_errors_summary + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng9xx errors ++ # HiSilicon KunPeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; + $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; +@@ -1569,7 +1569,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type1 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1587,7 +1587,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type2 error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1605,7 +1605,7 @@ sub vendor_errors_summary + $out .= "\t$sub_module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx PCIe controller error events summary:\n$out\n"; + } + $query_handle->finish; + +@@ -1623,7 +1623,7 @@ sub vendor_errors_summary + $out .= "\t$module_id: $count\n"; + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; ++ print "HiSilicon KunPeng9xx common error events summary:\n$out\n"; + } + $query_handle->finish; + } +@@ -1660,7 +1660,7 @@ sub vendor_errors + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- # HiSilicon Kunpeng9xx errors ++ # HiSilicon KunPeng9xx errors + if ($platform_id eq HISILICON_KUNPENG_9XX) { + $found_platform = 1; + $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, 
sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; +@@ -1684,7 +1684,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type1 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1709,7 +1709,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx OEM type2 error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1736,7 +1736,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx PCIe controller error events:\n$out\n"; + } + $query_handle->finish; + +@@ -1767,7 +1767,7 @@ sub vendor_errors + } + } + if ($out ne "") { +- print "HiSilicon Kunpeng9xx common error events:\n$out\n"; ++ print "HiSilicon KunPeng9xx common error events:\n$out\n"; + } + $query_handle->finish; + } +@@ -1784,7 +1784,7 @@ sub vendor_errors + sub vendor_platforms + { + print "\nSupported platforms for the vendor-specific errors:\n"; +- print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; ++ print "\tHiSilicon KunPeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; + print "\n"; + } + +-- +2.25.1 + diff --git a/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch b/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch new file mode 100644 index 0000000000000000000000000000000000000000..48a62cc0ba5d90bb57add7ed778326d5f91d10d8 --- /dev/null +++ b/0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch @@ -0,0 +1,90 @@ +From c019f2f82b7f224e95968037f2afc16f63cc1d1d Mon Sep 17 00:00:00 2001 +From: Shiju Jose +Date: Thu, 28 Apr 2022 22:59:04 +0100 +Subject: [PATCH 10/10] rasdaemon: Fix for a memory out-of-bounds issue and + optimized code to remove duplicate function. 
+ +Fixed a memory out-of-bounds issue with string pointers and +optimized code structure to remove duplicate function. + +Signed-off-by: Lei Feng +Signed-off-by: Shiju Jose +--- + non-standard-hisi_hip08.c | 6 +++--- + non-standard-hisilicon.c | 2 +- + ras-non-standard-handler.c | 16 +--------------- + 3 files changed, 5 insertions(+), 19 deletions(-) + +diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c +index 9092183..4ef47ea 100644 +--- a/non-standard-hisi_hip08.c ++++ b/non-standard-hisi_hip08.c +@@ -1014,15 +1014,15 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, + + static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { + { +- .sec_type = "1f8161e155d641e6bd107afd1dc5f7c5", ++ .sec_type = "1f8161e1-55d6-41e6-bd10-7afd1dc5f7c5", + .decode = decode_hip08_oem_type1_error, + }, + { +- .sec_type = "45534ea6ce2341158535e07ab3aef91d", ++ .sec_type = "45534ea6-ce23-4115-8535-e07ab3aef91d", + .decode = decode_hip08_oem_type2_error, + }, + { +- .sec_type = "b2889fc9e7d74f9da867af42e98be772", ++ .sec_type = "b2889fc9-e7d7-4f9d-a867-af42e98be772", + .decode = decode_hip08_pcie_local_error, + }, + }; +diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c +index d1e1774..6ee9271 100644 +--- a/non-standard-hisilicon.c ++++ b/non-standard-hisilicon.c +@@ -387,7 +387,7 @@ static int decode_hisi_common_section(struct ras_events *ras, + + static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { + { +- .sec_type = "c8b328a899174af69a132e08ab2e7586", ++ .sec_type = "c8b328a8-9917-4af6-9a13-2e08ab2e7586", + .decode = decode_hisi_common_section, + }, + }; +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +index 6d5a6f8..6932e58 100644 +--- a/ras-non-standard-handler.c ++++ b/ras-non-standard-handler.c +@@ -52,20 +52,6 @@ static char *uuid_le(const char *uu) + return uuid; + } + +-static int uuid_le_cmp(const char *sec_type, const char *uuid2) +-{ +- static char uuid1[32]; +- char *p = uuid1; +- int i; +- 
static const unsigned char le[16] = { +- 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; +- +- for (i = 0; i < 16; i++) +- p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]); +- *p = 0; +- return strncmp(uuid1, uuid2, 32); +-} +- + int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) + { + struct ras_ns_ev_decoder *list; +@@ -96,7 +82,7 @@ static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p + + ns_ev_decoder = ras_ns_ev_dec_list; + while (ns_ev_decoder) { +- if (uuid_le_cmp(sec_type, ns_ev_decoder->sec_type) == 0) { ++ if (strcmp(uuid_le(sec_type), ns_ev_decoder->sec_type) == 0) { + *p_ns_ev_dec = ns_ev_decoder; + match = 1; + break; +-- +2.25.1 + diff --git a/backport-0001-rasdaemon-Modify-non-standard-error-decoding-interfa.patch b/backport-0001-rasdaemon-Modify-non-standard-error-decoding-interfa.patch deleted file mode 100644 index 2d864086f9388c86eb15ced419dfc36115395c87..0000000000000000000000000000000000000000 --- a/backport-0001-rasdaemon-Modify-non-standard-error-decoding-interfa.patch +++ /dev/null @@ -1,785 +0,0 @@ -From 1c085f983f01ec09e5b0dd67dbb8b4afa89e7300 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Mon, 10 Aug 2020 15:42:56 +0100 -Subject: [PATCH] rasdaemon: Modify non-standard error decoding interface using - linked list - -Replace the current non-standard error decoding interface with the -interface based on the linked list to avoid using realloc and -to improve the interface. 
- -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisi_hip08.c | 114 +++++++++++++++++----------------- - non-standard-hisilicon.c | 46 +++++++------- - non-standard-hisilicon.h | 4 +- - ras-non-standard-handler.c | 122 ++++++++++++++++++++----------------- - ras-non-standard-handler.h | 13 ++-- - 5 files changed, 155 insertions(+), 144 deletions(-) - -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 2197f81..ebf03e1 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -528,7 +528,7 @@ static const struct db_table_descriptor hip08_pcie_local_event_tab = { - #endif - - #define IN_RANGE(p, start, end) ((p) >= (start) && (p) < (end)) --static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab, -+static void decode_oem_type1_err_hdr(struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - const struct hisi_oem_type1_err_sec *err) - { -@@ -537,26 +537,26 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab, - char *end = buf + HISI_BUF_LEN; - - p += snprintf(p, end - p, "[ table_version=%d ", err->version); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_OEM_TYPE1_FIELD_VERSION, err->version, NULL); - - if (err->val_bits & HISI_OEM_VALID_SOC_ID && IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_OEM_TYPE1_FIELD_SOC_ID, - err->soc_id, NULL); - } - - if (err->val_bits & HISI_OEM_VALID_SOCKET_ID && IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_OEM_TYPE1_FIELD_SOCKET_ID, - err->socket_id, NULL); - } - - if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID && IN_RANGE(p, buf, end)) { 
- p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_OEM_TYPE1_FIELD_NIMBUS_ID, - err->nimbus_id, NULL); - } -@@ -566,7 +566,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab, - err->module_id); - - p += snprintf(p, end - p, "module=%s ", str); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE1_FIELD_MODULE_ID, - 0, str); - } -@@ -578,7 +578,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab, - err->sub_module_id); - - p += snprintf(p, end - p, "submodule=%s ", str); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE1_FIELD_SUB_MODULE_ID, - 0, str); - } -@@ -587,7 +587,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab, - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "error_severity=%s ", - err_severity(err->err_severity)); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE1_FIELD_ERR_SEV, - 0, err_severity(err->err_severity)); - } -@@ -598,7 +598,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab, - trace_seq_printf(s, "%s\n", buf); - } - --static void decode_oem_type1_err_regs(struct ras_ns_dec_tab *dec_tab, -+static void decode_oem_type1_err_regs(struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - const struct hisi_oem_type1_err_sec *err) - { -@@ -649,14 +649,14 @@ static void decode_oem_type1_err_regs(struct ras_ns_dec_tab *dec_tab, - *p = '\0'; - } - -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE1_FIELD_REGS_DUMP, 0, buf); -- step_vendor_data_tab(dec_tab, "hip08_oem_type1_event_tab"); -+ 
step_vendor_data_tab(ev_decoder, "hip08_oem_type1_event_tab"); - } - - /* error data decoding functions */ - static int decode_hip08_oem_type1_error(struct ras_events *ras, -- struct ras_ns_dec_tab *dec_tab, -+ struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - struct ras_non_standard_event *event) - { -@@ -670,8 +670,8 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras, - } - - #ifdef HAVE_SQLITE3 -- if (!dec_tab->stmt_dec_record) { -- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, -+ if (!ev_decoder->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, - &hip08_oem_type1_event_tab) - != SQLITE_OK) { - trace_seq_printf(s, -@@ -680,18 +680,18 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras, - } - } - #endif -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE1_FIELD_TIMESTAMP, - 0, event->timestamp); - - trace_seq_printf(s, "\nHISI HIP08: OEM Type-1 Error\n"); -- decode_oem_type1_err_hdr(dec_tab, s, err); -- decode_oem_type1_err_regs(dec_tab, s, err); -+ decode_oem_type1_err_hdr(ev_decoder, s, err); -+ decode_oem_type1_err_regs(ev_decoder, s, err); - - return 0; - } - --static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab, -+static void decode_oem_type2_err_hdr(struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - const struct hisi_oem_type2_err_sec *err) - { -@@ -700,26 +700,26 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab, - char *end = buf + HISI_BUF_LEN; - - p += snprintf(p, end - p, "[ table_version=%d ", err->version); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_OEM_TYPE2_FIELD_VERSION, err->version, NULL); - - if (err->val_bits & HISI_OEM_VALID_SOC_ID && IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id); -- record_vendor_data(dec_tab, 
HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_OEM_TYPE2_FIELD_SOC_ID, - err->soc_id, NULL); - } - - if (err->val_bits & HISI_OEM_VALID_SOCKET_ID && IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_OEM_TYPE2_FIELD_SOCKET_ID, - err->socket_id, NULL); - } - - if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID && IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_OEM_TYPE2_FIELD_NIMBUS_ID, - err->nimbus_id, NULL); - } -@@ -729,7 +729,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab, - err->module_id); - - p += snprintf(p, end - p, "module=%s ", str); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE2_FIELD_MODULE_ID, - 0, str); - } -@@ -741,7 +741,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab, - err->sub_module_id); - - p += snprintf(p, end - p, "submodule=%s ", str); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE2_FIELD_SUB_MODULE_ID, - 0, str); - } -@@ -750,7 +750,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab, - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "error_severity=%s ", - err_severity(err->err_severity)); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE2_FIELD_ERR_SEV, - 0, err_severity(err->err_severity)); - } -@@ -761,7 +761,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab, - trace_seq_printf(s, "%s\n", buf); - } - --static void 
decode_oem_type2_err_regs(struct ras_ns_dec_tab *dec_tab, -+static void decode_oem_type2_err_regs(struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - const struct hisi_oem_type2_err_sec *err) - { -@@ -822,13 +822,13 @@ static void decode_oem_type2_err_regs(struct ras_ns_dec_tab *dec_tab, - *p = '\0'; - } - -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE2_FIELD_REGS_DUMP, 0, buf); -- step_vendor_data_tab(dec_tab, "hip08_oem_type2_event_tab"); -+ step_vendor_data_tab(ev_decoder, "hip08_oem_type2_event_tab"); - } - - static int decode_hip08_oem_type2_error(struct ras_events *ras, -- struct ras_ns_dec_tab *dec_tab, -+ struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - struct ras_non_standard_event *event) - { -@@ -842,8 +842,8 @@ static int decode_hip08_oem_type2_error(struct ras_events *ras, - } - - #ifdef HAVE_SQLITE3 -- if (!dec_tab->stmt_dec_record) { -- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, -+ if (!ev_decoder->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, - &hip08_oem_type2_event_tab) != SQLITE_OK) { - trace_seq_printf(s, - "create sql hip08_oem_type2_event_tab fail\n"); -@@ -851,18 +851,18 @@ static int decode_hip08_oem_type2_error(struct ras_events *ras, - } - } - #endif -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE2_FIELD_TIMESTAMP, - 0, event->timestamp); - - trace_seq_printf(s, "\nHISI HIP08: OEM Type-2 Error\n"); -- decode_oem_type2_err_hdr(dec_tab, s, err); -- decode_oem_type2_err_regs(dec_tab, s, err); -+ decode_oem_type2_err_hdr(ev_decoder, s, err); -+ decode_oem_type2_err_regs(ev_decoder, s, err); - - return 0; - } - --static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, -+static void decode_pcie_local_err_hdr(struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - const struct 
hisi_pcie_local_err_sec *err) - { -@@ -871,14 +871,14 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, - char *end = buf + HISI_BUF_LEN; - - p += snprintf(p, end - p, "[ table_version=%d ", err->version); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_PCIE_LOCAL_FIELD_VERSION, - err->version, NULL); - - if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOC_ID && - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_PCIE_LOCAL_FIELD_SOC_ID, - err->soc_id, NULL); - } -@@ -886,7 +886,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, - if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOCKET_ID && - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_PCIE_LOCAL_FIELD_SOCKET_ID, - err->socket_id, NULL); - } -@@ -894,7 +894,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, - if (err->val_bits & HISI_PCIE_LOCAL_VALID_NIMBUS_ID && - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_PCIE_LOCAL_FIELD_NIMBUS_ID, - err->nimbus_id, NULL); - } -@@ -903,7 +903,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "submodule=%s ", - pcie_local_sub_module_name(err->sub_module_id)); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_PCIE_LOCAL_FIELD_SUB_MODULE_ID, - 0, pcie_local_sub_module_name(err->sub_module_id)); - } -@@ -911,7 +911,7 @@ static void 
decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, - if (err->val_bits & HISI_PCIE_LOCAL_VALID_CORE_ID && - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "core_ID=core%d ", err->core_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_PCIE_LOCAL_FIELD_CORE_ID, - err->core_id, NULL); - } -@@ -919,7 +919,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, - if (err->val_bits & HISI_PCIE_LOCAL_VALID_PORT_ID && - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "port_ID=port%d ", err->port_id); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_PCIE_LOCAL_FIELD_PORT_ID, - err->port_id, NULL); - } -@@ -928,7 +928,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "error_severity=%s ", - err_severity(err->err_severity)); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_PCIE_LOCAL_FIELD_ERR_SEV, - 0, err_severity(err->err_severity)); - } -@@ -936,7 +936,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, - if (err->val_bits & HISI_PCIE_LOCAL_VALID_ERR_TYPE && - IN_RANGE(p, buf, end)) { - p += snprintf(p, end - p, "error_type=0x%x ", err->err_type); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, - HIP08_PCIE_LOCAL_FIELD_ERR_TYPE, - err->err_type, NULL); - } -@@ -947,7 +947,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab, - trace_seq_printf(s, "%s\n", buf); - } - --static void decode_pcie_local_err_regs(struct ras_ns_dec_tab *dec_tab, -+static void decode_pcie_local_err_regs(struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - const struct hisi_pcie_local_err_sec *err) - { -@@ -972,13 +972,13 @@ static void 
decode_pcie_local_err_regs(struct ras_ns_dec_tab *dec_tab, - *p = '\0'; - } - -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_PCIE_LOCAL_FIELD_REGS_DUMP, 0, buf); -- step_vendor_data_tab(dec_tab, "hip08_pcie_local_event_tab"); -+ step_vendor_data_tab(ev_decoder, "hip08_pcie_local_event_tab"); - } - - static int decode_hip08_pcie_local_error(struct ras_events *ras, -- struct ras_ns_dec_tab *dec_tab, -+ struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - struct ras_non_standard_event *event) - { -@@ -992,8 +992,8 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, - } - - #ifdef HAVE_SQLITE3 -- if (!dec_tab->stmt_dec_record) { -- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, -+ if (!ev_decoder->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, - &hip08_pcie_local_event_tab) != SQLITE_OK) { - trace_seq_printf(s, - "create sql hip08_pcie_local_event_tab fail\n"); -@@ -1001,18 +1001,18 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, - } - } - #endif -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_PCIE_LOCAL_FIELD_TIMESTAMP, - 0, event->timestamp); - - trace_seq_printf(s, "\nHISI HIP08: PCIe local error\n"); -- decode_pcie_local_err_hdr(dec_tab, s, err); -- decode_pcie_local_err_regs(dec_tab, s, err); -+ decode_pcie_local_err_hdr(ev_decoder, s, err); -+ decode_pcie_local_err_regs(ev_decoder, s, err); - - return 0; - } - --struct ras_ns_dec_tab hip08_ns_oem_tab[] = { -+static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { - { - .sec_type = "1f8161e155d641e6bd107afd1dc5f7c5", - .decode = decode_hip08_oem_type1_error, -@@ -1025,10 +1025,12 @@ struct ras_ns_dec_tab hip08_ns_oem_tab[] = { - .sec_type = "b2889fc9e7d74f9da867af42e98be772", - .decode = decode_hip08_pcie_local_error, - }, -- { /* sentinel */ } - }; - - static void 
__attribute__((constructor)) hip08_init(void) - { -- register_ns_dec_tab(hip08_ns_oem_tab); -+ int i; -+ -+ for (i = 0; i < ARRAY_SIZE(hip08_ns_ev_decoder); i++) -+ register_ns_ev_decoder(&hip08_ns_ev_decoder[i]); - } -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index c9e1fa9..a6f5e78 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -73,38 +73,38 @@ struct hisi_event { - }; - - #ifdef HAVE_SQLITE3 --void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -+void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder, - enum hisi_oem_data_type data_type, - int id, int64_t data, const char *text) - { - switch (data_type) { - case HISI_OEM_DATA_TYPE_INT: -- sqlite3_bind_int(dec_tab->stmt_dec_record, id, data); -+ sqlite3_bind_int(ev_decoder->stmt_dec_record, id, data); - break; - case HISI_OEM_DATA_TYPE_INT64: -- sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data); -+ sqlite3_bind_int64(ev_decoder->stmt_dec_record, id, data); - break; - case HISI_OEM_DATA_TYPE_TEXT: -- sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL); -+ sqlite3_bind_text(ev_decoder->stmt_dec_record, id, text, -1, NULL); - break; - } - } - --int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name) -+int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) - { - int rc; - -- rc = sqlite3_step(dec_tab->stmt_dec_record); -+ rc = sqlite3_step(ev_decoder->stmt_dec_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) - log(TERM, LOG_ERR, - "Failed to do %s step on sqlite: error = %d\n", name, rc); - -- rc = sqlite3_reset(dec_tab->stmt_dec_record); -+ rc = sqlite3_reset(ev_decoder->stmt_dec_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) - log(TERM, LOG_ERR, - "Failed to reset %s on sqlite: error = %d\n", name, rc); - -- rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record); -+ rc = sqlite3_clear_bindings(ev_decoder->stmt_dec_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) - log(TERM, 
LOG_ERR, - "Failed to clear bindings %s on sqlite: error = %d\n", -@@ -113,12 +113,12 @@ int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name) - return rc; - } - #else --void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -+void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder, - enum hisi_oem_data_type data_type, - int id, int64_t data, const char *text) - { } - --int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name) -+int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) - { - return 0; - } -@@ -197,7 +197,7 @@ static void decode_module(struct hisi_event *event, uint8_t module_id) - HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]); - } - --static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab, -+static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, - const struct hisi_common_error_section *err, - struct hisi_event *event) - { -@@ -244,7 +244,7 @@ static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab, - } - - static int decode_hisi_common_section(struct ras_events *ras, -- struct ras_ns_dec_tab *dec_tab, -+ struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, - struct ras_non_standard_event *event) - { -@@ -253,8 +253,8 @@ static int decode_hisi_common_section(struct ras_events *ras, - struct hisi_event hevent; - - #ifdef HAVE_SQLITE3 -- if (ras->record_events && !dec_tab->stmt_dec_record) { -- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, - &hisi_common_section_tab) != SQLITE_OK) { - trace_seq_printf(s, "create sql hisi_common_section_tab fail\n"); - return -1; -@@ -264,7 +264,7 @@ static int decode_hisi_common_section(struct ras_events *ras, - - memset(&hevent, 0, sizeof(struct hisi_event)); - trace_seq_printf(s, "\nHisilicon Common Error 
Section:\n"); -- decode_hisi_common_section_hdr(dec_tab, err, &hevent); -+ decode_hisi_common_section_hdr(ev_decoder, err, &hevent); - trace_seq_printf(s, "%s\n", hevent.error_msg); - - if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) { -@@ -280,28 +280,30 @@ static int decode_hisi_common_section(struct ras_events *ras, - } - - if (ras->record_events) { -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HISI_COMMON_FIELD_TIMESTAMP, - 0, event->timestamp); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg); -- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg); -- step_vendor_data_tab(dec_tab, "hisi_common_section_tab"); -+ step_vendor_data_tab(ev_decoder, "hisi_common_section_tab"); - } - - return 0; - } - --struct ras_ns_dec_tab hisi_section_ns_tab[] = { -+static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { - { - .sec_type = "c8b328a899174af69a132e08ab2e7586", - .decode = decode_hisi_common_section, - }, -- { /* sentinel */ } - }; - - static void __attribute__((constructor)) hisi_ns_init(void) - { -- register_ns_dec_tab(hisi_section_ns_tab); -+ int i; -+ -+ for (i = 0; i < ARRAY_SIZE(hisi_section_ns_ev_decoder); i++) -+ register_ns_ev_decoder(&hisi_section_ns_ev_decoder[i]); - } -diff --git a/non-standard-hisilicon.h b/non-standard-hisilicon.h -index 1ce210a..75b911e 100644 ---- a/non-standard-hisilicon.h -+++ b/non-standard-hisilicon.h -@@ -41,9 +41,9 @@ static inline char *err_severity(uint8_t err_sev) - return "unknown"; - } - --void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -+void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder, - enum hisi_oem_data_type data_type, - int id, int64_t data, const char *text); 
--int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name); -+int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name); - - #endif -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index d92fd42..1862335 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -22,46 +22,7 @@ - #include "ras-logger.h" - #include "ras-report.h" - --static p_ns_dec_tab * ns_dec_tab; --static size_t dec_tab_count; -- --int register_ns_dec_tab(const p_ns_dec_tab tab) --{ -- ns_dec_tab = (p_ns_dec_tab *)realloc(ns_dec_tab, -- (dec_tab_count + 1) * sizeof(tab)); -- if (ns_dec_tab == NULL) { -- printf("%s p_ns_dec_tab malloc failed", __func__); -- return -1; -- } -- ns_dec_tab[dec_tab_count] = tab; -- dec_tab_count++; -- return 0; --} -- --void unregister_ns_dec_tab(void) --{ -- if (ns_dec_tab) { --#ifdef HAVE_SQLITE3 -- p_ns_dec_tab dec_tab; -- int i, count; -- -- for (count = 0; count < dec_tab_count; count++) { -- dec_tab = ns_dec_tab[count]; -- for (i = 0; dec_tab[i].decode; i++) { -- if (dec_tab[i].stmt_dec_record) { -- ras_mc_finalize_vendor_table( -- dec_tab[i].stmt_dec_record); -- dec_tab[i].stmt_dec_record = NULL; -- } -- } -- } --#endif -- -- free(ns_dec_tab); -- ns_dec_tab = NULL; -- dec_tab_count = 0; -- } --} -+static struct ras_ns_ev_decoder *ras_ns_ev_dec_list; - - void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) { - trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]); -@@ -105,18 +66,75 @@ static int uuid_le_cmp(const char *sec_type, const char *uuid2) - return strncmp(uuid1, uuid2, 32); - } - -+int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) -+{ -+ struct ras_ns_ev_decoder *list; -+ -+ if (!ns_ev_decoder) -+ return -1; -+ -+ ns_ev_decoder->next = NULL; -+ ns_ev_decoder->stmt_dec_record = NULL; -+ if (!ras_ns_ev_dec_list) { -+ ras_ns_ev_dec_list = ns_ev_decoder; -+ } else { -+ list = ras_ns_ev_dec_list; -+ 
while (list->next) -+ list = list->next; -+ list->next = ns_ev_decoder; -+ } -+ -+ return 0; -+} -+ -+static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p_ns_ev_dec) -+{ -+ struct ras_ns_ev_decoder *ns_ev_decoder; -+ int match = 0; -+ -+ ns_ev_decoder = ras_ns_ev_dec_list; -+ while (ns_ev_decoder) { -+ if (uuid_le_cmp(sec_type, ns_ev_decoder->sec_type) == 0) { -+ *p_ns_ev_dec = ns_ev_decoder; -+ match = 1; -+ break; -+ } -+ ns_ev_decoder = ns_ev_decoder->next; -+ } -+ -+ if (!match) -+ return -1; -+ -+ return 0; -+} -+ -+static void unregister_ns_ev_decoder(void) -+{ -+#ifdef HAVE_SQLITE3 -+ struct ras_ns_ev_decoder *ns_ev_decoder = ras_ns_ev_dec_list; -+ -+ while (ns_ev_decoder) { -+ if (ns_ev_decoder->stmt_dec_record) { -+ ras_mc_finalize_vendor_table(ns_ev_decoder->stmt_dec_record); -+ ns_ev_decoder->stmt_dec_record = NULL; -+ } -+ ns_ev_decoder = ns_ev_decoder->next; -+ } -+#endif -+ ras_ns_ev_dec_list = NULL; -+} -+ - int ras_non_standard_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context) - { -- int len, i, line_count, count; -+ int len, i, line_count; - unsigned long long val; - struct ras_events *ras = context; - time_t now; - struct tm *tm; - struct ras_non_standard_event ev; -- p_ns_dec_tab dec_tab; -- bool dec_done = false; -+ struct ras_ns_ev_decoder *ns_ev_decoder; - - /* - * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
-@@ -177,19 +195,9 @@ int ras_non_standard_event_handler(struct trace_seq *s, - if(!ev.error) - return -1; - -- for (count = 0; count < dec_tab_count && !dec_done; count++) { -- dec_tab = ns_dec_tab[count]; -- for (i = 0; dec_tab[i].decode; i++) { -- if (uuid_le_cmp(ev.sec_type, -- dec_tab[i].sec_type) == 0) { -- dec_tab[i].decode(ras, &dec_tab[i], s, &ev); -- dec_done = true; -- break; -- } -- } -- } -- -- if (!dec_done) { -+ if (!find_ns_ev_decoder(ev.sec_type, &ns_ev_decoder)) { -+ ns_ev_decoder->decode(ras, ns_ev_decoder, s, &ev); -+ } else { - len = ev.length; - i = 0; - line_count = 0; -@@ -222,5 +230,5 @@ int ras_non_standard_event_handler(struct trace_seq *s, - __attribute__((destructor)) - static void ns_exit(void) - { -- unregister_ns_dec_tab(); -+ unregister_ns_ev_decoder(); - } -diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h -index 2b9bf40..57d4cb5 100644 ---- a/ras-non-standard-handler.h -+++ b/ras-non-standard-handler.h -@@ -20,15 +20,16 @@ - #define BIT(nr) (1UL << (nr)) - #define BIT_ULL(nr) (1ULL << (nr)) - --typedef struct ras_ns_dec_tab { -+struct ras_ns_ev_decoder { -+ struct ras_ns_ev_decoder *next; - const char *sec_type; -- int (*decode)(struct ras_events *ras, struct ras_ns_dec_tab *dec_tab, -+ int (*decode)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, struct ras_non_standard_event *event); - #ifdef HAVE_SQLITE3 - #include - sqlite3_stmt *stmt_dec_record; - #endif --} *p_ns_dec_tab; -+}; - - int ras_non_standard_event_handler(struct trace_seq *s, - struct pevent_record *record, -@@ -37,11 +38,9 @@ int ras_non_standard_event_handler(struct trace_seq *s, - void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index); - - #ifdef HAVE_NON_STANDARD --int register_ns_dec_tab(const p_ns_dec_tab tab); --void unregister_ns_dec_tab(void); -+int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder); - #else --static inline int register_ns_dec_tab(const p_ns_dec_tab tab) { 
return 0; }; --static inline void unregister_ns_dec_tab(void) { return; }; -+static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; }; - #endif - - #endif --- -2.33.0 - diff --git a/backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch b/backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch deleted file mode 100644 index b6aba574f8dcf77474e199026cda960d1f8b9e13..0000000000000000000000000000000000000000 --- a/backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch +++ /dev/null @@ -1,63 +0,0 @@ -From b98880e2cf5fd15e4261676760b719963b956a0e Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Mon, 27 Jul 2020 15:38:37 +0800 -Subject: [PATCH 1/3] rasdaemon: delete the duplicate code about the definition - of hip08 DB fields - -Delete the duplicate code about the definition of DB fields for hip08 OEM -event format1 and format2. Because the two OEM event format is the same. - -Signed-off-By: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisi_hip08.c | 23 +++++------------------ - 1 file changed, 5 insertions(+), 18 deletions(-) - -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 8bf10c1..7fc6939 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -504,7 +504,7 @@ static char *pcie_local_sub_module_name(uint8_t id) - } - - #ifdef HAVE_SQLITE3 --static const struct db_fields hip08_oem_type1_event_fields[] = { -+static const struct db_fields hip08_oem_event_fields[] = { - { .name = "id", .type = "INTEGER PRIMARY KEY" }, - { .name = "timestamp", .type = "TEXT" }, - { .name = "version", .type = "INTEGER" }, -@@ -519,27 +519,14 @@ static const struct db_fields hip08_oem_type1_event_fields[] = { - - static const struct db_table_descriptor hip08_oem_type1_event_tab = { - .name = "hip08_oem_type1_event_v2", -- .fields = hip08_oem_type1_event_fields, -- .num_fields = ARRAY_SIZE(hip08_oem_type1_event_fields), --}; -- 
--static const struct db_fields hip08_oem_type2_event_fields[] = { -- { .name = "id", .type = "INTEGER PRIMARY KEY" }, -- { .name = "timestamp", .type = "TEXT" }, -- { .name = "version", .type = "INTEGER" }, -- { .name = "soc_id", .type = "INTEGER" }, -- { .name = "socket_id", .type = "INTEGER" }, -- { .name = "nimbus_id", .type = "INTEGER" }, -- { .name = "module_id", .type = "TEXT" }, -- { .name = "sub_module_id", .type = "TEXT" }, -- { .name = "err_severity", .type = "TEXT" }, -- { .name = "regs_dump", .type = "TEXT" }, -+ .fields = hip08_oem_event_fields, -+ .num_fields = ARRAY_SIZE(hip08_oem_event_fields), - }; - - static const struct db_table_descriptor hip08_oem_type2_event_tab = { - .name = "hip08_oem_type2_event_v2", -- .fields = hip08_oem_type2_event_fields, -- .num_fields = ARRAY_SIZE(hip08_oem_type2_event_fields), -+ .fields = hip08_oem_event_fields, -+ .num_fields = ARRAY_SIZE(hip08_oem_event_fields), - }; - - static const struct db_fields hip08_pcie_local_event_fields[] = { --- -2.7.4 - diff --git a/backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch b/backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch deleted file mode 100644 index 3a22ead14ee8f2177da3d4c5918ac18e338cb0bf..0000000000000000000000000000000000000000 --- a/backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch +++ /dev/null @@ -1,190 +0,0 @@ -From 6ee76565274f31052868e970bce8768c314f6bb7 Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Mon, 27 Jul 2020 15:38:38 +0800 -Subject: [PATCH 2/3] rasdaemon: delete the code of non-standard error decoder - for hip07 - -Delete the code of non-standard error decoder for hip07 that was never -used. Because the corresponding code in Linux kernel wasn't accepted. 
- -Signed-off-by: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - Makefile.am | 2 +- - non-standard-hisi_hip07.c | 151 ---------------------------------------------- - 2 files changed, 1 insertion(+), 152 deletions(-) - delete mode 100644 non-standard-hisi_hip07.c - -diff --git a/Makefile.am b/Makefile.am -index 51ef4de..23b4d60 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -52,7 +52,7 @@ if WITH_ABRT_REPORT - rasdaemon_SOURCES += ras-report.c - endif - if WITH_HISI_NS_DECODE -- rasdaemon_SOURCES += non-standard-hisi_hip07.c non-standard-hisi_hip08.c -+ rasdaemon_SOURCES += non-standard-hisi_hip08.c - endif - if WITH_MEMORY_CE_PFA - rasdaemon_SOURCES += rbtree.c ras-page-isolation.c -diff --git a/non-standard-hisi_hip07.c b/non-standard-hisi_hip07.c -deleted file mode 100644 -index 09ddcb2..0000000 ---- a/non-standard-hisi_hip07.c -+++ /dev/null -@@ -1,151 +0,0 @@ --/* -- * Copyright (c) 2017 Hisilicon Limited. -- * -- * This program is free software; you can redistribute it and/or modify -- * it under the terms of the GNU General Public License as published by -- * the Free Software Foundation; either version 2 of the License, or -- * (at your option) any later version. 
-- * -- */ -- --#include --#include --#include --#include "ras-record.h" --#include "ras-logger.h" --#include "ras-report.h" --#include "ras-non-standard-handler.h" -- --/* common definitions */ -- --/* HISI SAS definitions */ --#define HISI_SAS_VALID_PA BIT(0) --#define HISI_SAS_VALID_MB_ERR BIT(1) --#define HISI_SAS_VALID_ERR_TYPE BIT(2) --#define HISI_SAS_VALID_AXI_ERR_INFO BIT(3) -- --struct hisi_sas_err_sec { -- uint64_t val_bits; -- uint64_t physical_addr; -- uint32_t mb; -- uint32_t type; -- uint32_t axi_err_info; --}; -- --/* Common Functions */ --static char *err_bit_type(int etype) --{ -- switch (etype) { -- case 0x0: return "single-bit ecc"; -- case 0x1: return "multi-bit ecc"; -- } -- return "unknown error"; --} -- --/* SAS Functions */ --static char *sas_err_type(int etype) --{ -- switch (etype) { -- case 0x0001: return "hgc_dqe ecc"; -- case 0x0002: return "hgc_iost ecc"; -- case 0x0004: return "hgc_itct ecc"; -- case 0x0008: return "hgc_iostl ecc"; -- case 0x0010: return "hgc_itctl ecc"; -- case 0x0020: return "hgc_cqe ecc"; -- case 0x0040: return "rxm_mem0 ecc"; -- case 0x0080: return "rxm_mem1 ecc"; -- case 0x0100: return "rxm_mem2 ecc"; -- case 0x0200: return "rxm_mem3 ecc"; -- case 0x0400: return "wp_depth"; -- case 0x0800: return "iptt_slot_no_match"; -- case 0x1000: return "rp_depth"; -- case 0x2000: return "axi err"; -- case 0x4000: return "fifo err"; -- case 0x8000: return "lm_add_fetch_list"; -- case 0x10000: return "hgc_abt_fetch_lm"; -- } -- return "unknown error"; --} -- --static char *sas_axi_err_type(int etype) --{ -- switch (etype) { -- case 0x0001: return "IOST_AXI_W_ERR"; -- case 0x0002: return "IOST_AXI_R_ERR"; -- case 0x0004: return "ITCT_AXI_W_ERR"; -- case 0x0008: return "ITCT_AXI_R_ERR"; -- case 0x0010: return "SATA_AXI_W_ERR"; -- case 0x0020: return "SATA_AXI_R_ERR"; -- case 0x0040: return "DQE_AXI_R_ERR"; -- case 0x0080: return "CQE_AXI_W_ERR"; -- case 0x0100: return "CQE_WINFO_FIFO"; -- case 0x0200: return "CQE_MSG_FIFIO"; -- 
case 0x0400: return "GETDQE_FIFO"; -- case 0x0800: return "CMDP_FIFO"; -- case 0x1000: return "AWTCTRL_FIFO"; -- } -- return "unknown error"; --} -- --static int decode_hip07_sas_error(struct ras_events *ras, -- struct ras_ns_dec_tab *dec_tab, -- struct trace_seq *s, -- struct ras_non_standard_event *event) --{ -- char buf[1024]; -- char *p = buf; -- const struct hisi_sas_err_sec *err = -- (struct hisi_sas_err_sec *)event->error; -- -- if (err->val_bits == 0) { -- trace_seq_printf(s, "%s: no valid error data\n", -- __func__); -- return -1; -- } -- p += sprintf(p, "["); -- if (err->val_bits & HISI_SAS_VALID_PA) -- p += sprintf(p, "phy addr = 0x%p: ", -- (void *)err->physical_addr); -- -- if (err->val_bits & HISI_SAS_VALID_MB_ERR) -- p += sprintf(p, "%s: ", err_bit_type(err->mb)); -- -- if (err->val_bits & HISI_SAS_VALID_ERR_TYPE) -- p += sprintf(p, "error type = %s: ", -- sas_err_type(err->type)); -- -- if (err->val_bits & HISI_SAS_VALID_AXI_ERR_INFO) -- p += sprintf(p, "axi error type = %s", -- sas_axi_err_type(err->axi_err_info)); -- -- p += sprintf(p, "]"); -- -- trace_seq_printf(s, "\nHISI HIP07: SAS error: %s\n", buf); -- return 0; --} -- --static int decode_hip07_hns_error(struct ras_events *ras, -- struct ras_ns_dec_tab *dec_tab, -- struct trace_seq *s, -- struct ras_non_standard_event *event) --{ -- return 0; --} -- --struct ras_ns_dec_tab hisi_ns_dec_tab[] = { -- { -- .sec_type = "daffd8146eba4d8c8a91bc9bbf4aa301", -- .decode = decode_hip07_sas_error, -- }, -- { -- .sec_type = "fbc2d923ea7a453dab132949f5af9e53", -- .decode = decode_hip07_hns_error, -- }, -- { /* sentinel */ } --}; -- --__attribute__((constructor)) --static void hip07_init(void) --{ -- register_ns_dec_tab(hisi_ns_dec_tab); --} --- -2.7.4 - diff --git a/backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch b/backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch deleted file mode 100644 index 
7eaa3f38ddc5d7ba1a3ce25b12cf0b03bdaaef5e..0000000000000000000000000000000000000000 --- a/backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch +++ /dev/null @@ -1,527 +0,0 @@ -From 8c30a852493a6204ded59872bb3a0f0e43537713 Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Mon, 27 Jul 2020 15:38:39 +0800 -Subject: [PATCH 3/3] rasdaemon: add support for hisilicon common section - decoder - -Add a new non-standard error section, Hisilicon common section. -It is defined for the next generation SoC Kunpeng930. It also supports -Kunpeng920 and some modules of Kunpeng920 could be changed to use -this section. - -We put the code to an new source file, as it supports multiple Hardware -platform. Some code of hip08 could be shared. Move them to this new file. - -Signed-off-by: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - Makefile.am | 2 +- - non-standard-hisi_hip08.c | 79 +----------- - non-standard-hisilicon.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++ - non-standard-hisilicon.h | 49 ++++++++ - 4 files changed, 358 insertions(+), 79 deletions(-) - create mode 100644 non-standard-hisilicon.c - create mode 100644 non-standard-hisilicon.h - -diff --git a/Makefile.am b/Makefile.am -index 23b4d60..18d1a92 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -52,7 +52,7 @@ if WITH_ABRT_REPORT - rasdaemon_SOURCES += ras-report.c - endif - if WITH_HISI_NS_DECODE -- rasdaemon_SOURCES += non-standard-hisi_hip08.c -+ rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c - endif - if WITH_MEMORY_CE_PFA - rasdaemon_SOURCES += rbtree.c ras-page-isolation.c -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 7fc6939..2197f81 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -15,6 +15,7 @@ - #include "ras-logger.h" - #include "ras-report.h" - #include "ras-non-standard-handler.h" -+#include "non-standard-hisilicon.h" - - /* HISI OEM error definitions */ - /* HISI OEM format1 error 
definitions */ -@@ -83,11 +84,6 @@ - #define HISI_PCIE_LOCAL_ERR_MISC_MAX 33 - #define HISI_BUF_LEN 1024 - --#define HISI_ERR_SEVERITY_NFE 0 --#define HISI_ERR_SEVERITY_FE 1 --#define HISI_ERR_SEVERITY_CE 2 --#define HISI_ERR_SEVERITY_NONE 3 -- - struct hisi_oem_type1_err_sec { - uint32_t val_bits; - uint8_t version; -@@ -145,12 +141,6 @@ struct hisi_pcie_local_err_sec { - uint32_t err_misc[HISI_PCIE_LOCAL_ERR_MISC_MAX]; - }; - --enum hisi_oem_data_type { -- HISI_OEM_DATA_TYPE_INT, -- HISI_OEM_DATA_TYPE_INT64, -- HISI_OEM_DATA_TYPE_TEXT, --}; -- - enum { - HIP08_OEM_TYPE1_FIELD_ID, - HIP08_OEM_TYPE1_FIELD_TIMESTAMP, -@@ -199,20 +189,6 @@ struct hisi_module_info { - int sub_num; - }; - --/* helper functions */ --static char *err_severity(uint8_t err_sev) --{ -- switch (err_sev) { -- case HISI_ERR_SEVERITY_NFE: return "recoverable"; -- case HISI_ERR_SEVERITY_FE: return "fatal"; -- case HISI_ERR_SEVERITY_CE: return "corrected"; -- case HISI_ERR_SEVERITY_NONE: return "none"; -- default: -- break; -- } -- return "unknown"; --} -- - static const char *pll_submodule_name[] = { - "TB_PLL0", - "TB_PLL1", -@@ -549,59 +525,6 @@ static const struct db_table_descriptor hip08_pcie_local_event_tab = { - .fields = hip08_pcie_local_event_fields, - .num_fields = ARRAY_SIZE(hip08_pcie_local_event_fields), - }; -- --static void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -- enum hisi_oem_data_type data_type, -- int id, int64_t data, const char *text) --{ -- switch (data_type) { -- case HISI_OEM_DATA_TYPE_INT: -- sqlite3_bind_int(dec_tab->stmt_dec_record, id, data); -- break; -- case HISI_OEM_DATA_TYPE_INT64: -- sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data); -- break; -- case HISI_OEM_DATA_TYPE_TEXT: -- sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL); -- break; -- default: -- break; -- } --} -- --static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, -- const char *name) --{ -- int rc; -- -- rc = sqlite3_step(dec_tab->stmt_dec_record); -- if (rc 
!= SQLITE_OK && rc != SQLITE_DONE) -- log(TERM, LOG_ERR, -- "Failed to do %s step on sqlite: error = %d\n", name, rc); -- -- rc = sqlite3_reset(dec_tab->stmt_dec_record); -- if (rc != SQLITE_OK && rc != SQLITE_DONE) -- log(TERM, LOG_ERR, -- "Failed to reset %s on sqlite: error = %d\n", name, rc); -- -- rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record); -- if (rc != SQLITE_OK && rc != SQLITE_DONE) -- log(TERM, LOG_ERR, -- "Failed to clear bindings %s on sqlite: error = %d\n", -- name, rc); -- -- return rc; --} --#else --static void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -- enum hisi_oem_data_type data_type, -- int id, int64_t data, const char *text) --{ } -- --static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, char *name) --{ -- return 0; --} - #endif - - #define IN_RANGE(p, start, end) ((p) >= (start) && (p) < (end)) -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -new file mode 100644 -index 0000000..c9e1fa9 ---- /dev/null -+++ b/non-standard-hisilicon.c -@@ -0,0 +1,307 @@ -+/* -+ * Copyright (c) 2020 Hisilicon Limited. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. 
-+ * -+ */ -+ -+#include -+#include -+#include -+#include "ras-record.h" -+#include "ras-logger.h" -+#include "ras-report.h" -+#include "non-standard-hisilicon.h" -+ -+#define HISI_BUF_LEN 2048 -+ -+struct hisi_common_error_section { -+ uint32_t val_bits; -+ uint8_t version; -+ uint8_t soc_id; -+ uint8_t socket_id; -+ uint8_t totem_id; -+ uint8_t nimbus_id; -+ uint8_t subsystem_id; -+ uint8_t module_id; -+ uint8_t submodule_id; -+ uint8_t core_id; -+ uint8_t port_id; -+ uint16_t err_type; -+ struct { -+ uint8_t function; -+ uint8_t device; -+ uint16_t segment; -+ uint8_t bus; -+ uint8_t reserved[3]; -+ } pcie_info; -+ uint8_t err_severity; -+ uint8_t reserved[3]; -+ uint32_t reg_array_size; -+ uint32_t reg_array[]; -+}; -+ -+enum { -+ HISI_COMMON_VALID_SOC_ID, -+ HISI_COMMON_VALID_SOCKET_ID, -+ HISI_COMMON_VALID_TOTEM_ID, -+ HISI_COMMON_VALID_NIMBUS_ID, -+ HISI_COMMON_VALID_SUBSYSTEM_ID, -+ HISI_COMMON_VALID_MODULE_ID, -+ HISI_COMMON_VALID_SUBMODULE_ID, -+ HISI_COMMON_VALID_CORE_ID, -+ HISI_COMMON_VALID_PORT_ID, -+ HISI_COMMON_VALID_ERR_TYPE, -+ HISI_COMMON_VALID_PCIE_INFO, -+ HISI_COMMON_VALID_ERR_SEVERITY, -+ HISI_COMMON_VALID_REG_ARRAY_SIZE, -+}; -+ -+enum { -+ HISI_COMMON_FIELD_ID, -+ HISI_COMMON_FIELD_TIMESTAMP, -+ HISI_COMMON_FIELD_ERR_INFO, -+ HISI_COMMON_FIELD_REGS_DUMP, -+}; -+ -+struct hisi_event { -+ char error_msg[HISI_BUF_LEN]; -+ char reg_msg[HISI_BUF_LEN]; -+}; -+ -+#ifdef HAVE_SQLITE3 -+void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -+ enum hisi_oem_data_type data_type, -+ int id, int64_t data, const char *text) -+{ -+ switch (data_type) { -+ case HISI_OEM_DATA_TYPE_INT: -+ sqlite3_bind_int(dec_tab->stmt_dec_record, id, data); -+ break; -+ case HISI_OEM_DATA_TYPE_INT64: -+ sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data); -+ break; -+ case HISI_OEM_DATA_TYPE_TEXT: -+ sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL); -+ break; -+ } -+} -+ -+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name) -+{ -+ 
int rc; -+ -+ rc = sqlite3_step(dec_tab->stmt_dec_record); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to do %s step on sqlite: error = %d\n", name, rc); -+ -+ rc = sqlite3_reset(dec_tab->stmt_dec_record); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to reset %s on sqlite: error = %d\n", name, rc); -+ -+ rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to clear bindings %s on sqlite: error = %d\n", -+ name, rc); -+ -+ return rc; -+} -+#else -+void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -+ enum hisi_oem_data_type data_type, -+ int id, int64_t data, const char *text) -+{ } -+ -+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name) -+{ -+ return 0; -+} -+#endif -+ -+#ifdef HAVE_SQLITE3 -+static const struct db_fields hisi_common_section_fields[] = { -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "err_info", .type = "TEXT" }, -+ { .name = "regs_dump", .type = "TEXT" }, -+}; -+ -+static const struct db_table_descriptor hisi_common_section_tab = { -+ .name = "hisi_common_section", -+ .fields = hisi_common_section_fields, -+ .num_fields = ARRAY_SIZE(hisi_common_section_fields), -+}; -+#endif -+ -+static const char* soc_desc[] = { -+ "Kunpeng916", -+ "Kunpeng920", -+ "Kunpeng930", -+}; -+ -+static const char* module_name[] = { -+ "MN", -+ "PLL", -+ "SLLC", -+ "AA", -+ "SIOE", -+ "POE", -+ "CPA", -+ "DISP", -+ "GIC", -+ "ITS", -+ "AVSBUS", -+ "CS", -+ "PPU", -+ "SMMU", -+ "PA", -+ "HLLC", -+ "DDRC", -+ "L3TAG", -+ "L3DATA", -+ "PCS", -+ "MATA", -+ "PCIe Local", -+ "SAS", -+ "SATA", -+ "NIC", -+ "RoCE", -+ "USB", -+ "ZIP", -+ "HPRE", -+ "SEC", -+ "RDE", -+ "MEE", -+ "HHA", -+}; -+ -+static const char* get_soc_desc(uint8_t soc_id) -+{ -+ if (soc_id >= sizeof(soc_desc)/sizeof(char *)) -+ return "unknown"; -+ -+ return soc_desc[soc_id]; 
-+} -+ -+static void decode_module(struct hisi_event *event, uint8_t module_id) -+{ -+ if (module_id >= sizeof(module_name)/sizeof(char *)) -+ HISI_SNPRINTF(event->error_msg, "module=unknown(id=%d) ", module_id); -+ else -+ HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]); -+} -+ -+static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab, -+ const struct hisi_common_error_section *err, -+ struct hisi_event *event) -+{ -+ HISI_SNPRINTF(event->error_msg, "[ table_version=%d", err->version); -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) -+ HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id)); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) -+ HISI_SNPRINTF(event->error_msg, "socket_id=%d", err->socket_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) -+ HISI_SNPRINTF(event->error_msg, "totem_id=%d", err->totem_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) -+ HISI_SNPRINTF(event->error_msg, "nimbus_id=%d", err->nimbus_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) -+ HISI_SNPRINTF(event->error_msg, "subsystem_id=%d", err->subsystem_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID)) -+ decode_module(event, err->module_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) -+ HISI_SNPRINTF(event->error_msg, "submodule_id=%d", err->submodule_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) -+ HISI_SNPRINTF(event->error_msg, "core_id=%d", err->core_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) -+ HISI_SNPRINTF(event->error_msg, "port_id=%d", err->port_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) -+ HISI_SNPRINTF(event->error_msg, "err_type=%d", err->err_type); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) -+ HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x", -+ err->pcie_info.segment, err->pcie_info.bus, -+ err->pcie_info.device, 
err->pcie_info.function); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) -+ HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity)); -+ -+ HISI_SNPRINTF(event->error_msg, "]"); -+} -+ -+static int decode_hisi_common_section(struct ras_events *ras, -+ struct ras_ns_dec_tab *dec_tab, -+ struct trace_seq *s, -+ struct ras_non_standard_event *event) -+{ -+ const struct hisi_common_error_section *err = -+ (struct hisi_common_error_section *)event->error; -+ struct hisi_event hevent; -+ -+#ifdef HAVE_SQLITE3 -+ if (ras->record_events && !dec_tab->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, -+ &hisi_common_section_tab) != SQLITE_OK) { -+ trace_seq_printf(s, "create sql hisi_common_section_tab fail\n"); -+ return -1; -+ } -+ } -+#endif -+ -+ memset(&hevent, 0, sizeof(struct hisi_event)); -+ trace_seq_printf(s, "\nHisilicon Common Error Section:\n"); -+ decode_hisi_common_section_hdr(dec_tab, err, &hevent); -+ trace_seq_printf(s, "%s\n", hevent.error_msg); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) { -+ int i; -+ -+ trace_seq_printf(s, "Register Dump:\n"); -+ for (i = 0; i < err->reg_array_size / sizeof(uint32_t); i++) { -+ trace_seq_printf(s, "reg%02d=0x%08x\n", i, -+ err->reg_array[i]); -+ HISI_SNPRINTF(hevent.reg_msg, "reg%02d=0x%08x", -+ i, err->reg_array[i]); -+ } -+ } -+ -+ if (ras->record_events) { -+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_TIMESTAMP, -+ 0, event->timestamp); -+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg); -+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg); -+ step_vendor_data_tab(dec_tab, "hisi_common_section_tab"); -+ } -+ -+ return 0; -+} -+ -+struct ras_ns_dec_tab hisi_section_ns_tab[] = { -+ { -+ .sec_type = "c8b328a899174af69a132e08ab2e7586", -+ .decode = 
decode_hisi_common_section, -+ }, -+ { /* sentinel */ } -+}; -+ -+static void __attribute__((constructor)) hisi_ns_init(void) -+{ -+ register_ns_dec_tab(hisi_section_ns_tab); -+} -diff --git a/non-standard-hisilicon.h b/non-standard-hisilicon.h -new file mode 100644 -index 0000000..1ce210a ---- /dev/null -+++ b/non-standard-hisilicon.h -@@ -0,0 +1,49 @@ -+/* -+ * Copyright (c) 2020 Hisilicon Limited. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ */ -+ -+#ifndef __NON_STANDARD_HISILICON_H -+#define __NON_STANDARD_HISILICON_H -+ -+#include "ras-non-standard-handler.h" -+#include "ras-mc-handler.h" -+ -+#define HISI_SNPRINTF mce_snprintf -+ -+#define HISI_ERR_SEVERITY_NFE 0 -+#define HISI_ERR_SEVERITY_FE 1 -+#define HISI_ERR_SEVERITY_CE 2 -+#define HISI_ERR_SEVERITY_NONE 3 -+ -+enum hisi_oem_data_type { -+ HISI_OEM_DATA_TYPE_INT, -+ HISI_OEM_DATA_TYPE_INT64, -+ HISI_OEM_DATA_TYPE_TEXT, -+}; -+ -+/* helper functions */ -+static inline char *err_severity(uint8_t err_sev) -+{ -+ switch (err_sev) { -+ case HISI_ERR_SEVERITY_NFE: return "recoverable"; -+ case HISI_ERR_SEVERITY_FE: return "fatal"; -+ case HISI_ERR_SEVERITY_CE: return "corrected"; -+ case HISI_ERR_SEVERITY_NONE: return "none"; -+ default: -+ break; -+ } -+ return "unknown"; -+} -+ -+void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -+ enum hisi_oem_data_type data_type, -+ int id, int64_t data, const char *text); -+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name); -+ -+#endif --- -2.7.4 - diff --git a/backport-Fix-ras-mc-ctl-script.patch b/backport-Fix-ras-mc-ctl-script.patch deleted file mode 100644 index b0595dd2397f8050dfac280e2c7a1ca2724a04e9..0000000000000000000000000000000000000000 --- a/backport-Fix-ras-mc-ctl-script.patch +++ /dev/null @@ -1,454 
+0,0 @@ -From 546cf713f667437fb6e283cc3dc090679eb47d08 Mon Sep 17 00:00:00 2001 -From: Subhendu Saha -Date: Tue, 12 Jan 2021 03:29:55 -0500 -Subject: [PATCH] Fix ras-mc-ctl script. - -When rasdaemon is compiled without enabling aer, mce, devlink, -etc., those tables are not created in the database file. Then -ras-mc-ctl script breaks trying to query data from non-existent -tables. - -Signed-off-by: Subhendu Saha subhends@akamai.com -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 384 ++++++++++++++++++++++++--------------------- - 1 file changed, 208 insertions(+), 176 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 665a042..be9d983 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -41,6 +41,18 @@ my $sysconfdir = "@sysconfdir@"; - my $dmidecode = find_prog ("dmidecode"); - my $modprobe = find_prog ("modprobe") or exit (1); - -+my $has_aer = 0; -+my $has_devlink = 0; -+my $has_disk_errors = 0; -+my $has_extlog = 0; -+my $has_mce = 0; -+ -+@WITH_AER_TRUE@$has_aer = 1; -+@WITH_DEVLINK_TRUE@$has_devlink = 1; -+@WITH_DISKERROR_TRUE@$has_disk_errors = 1; -+@WITH_EXTLOG_TRUE@$has_extlog = 1; -+@WITH_MCE_TRUE@$has_mce = 1; -+ - my %conf = (); - my %bus = (); - my %dimm_size = (); -@@ -1143,86 +1155,96 @@ sub summary - $query_handle->finish; - - # PCIe AER aer_event errors -- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($err_type, $msg, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count $err_type errors: $msg\n"; -- } -- if ($out ne "") { -- print "PCIe AER events summary:\n$out\n"; -- } else { -- print "No PCIe AER errors.\n\n"; -+ if ($has_aer == 1) { -+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ 
$query_handle->bind_columns(\($err_type, $msg, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count $err_type errors: $msg\n"; -+ } -+ if ($out ne "") { -+ print "PCIe AER events summary:\n$out\n"; -+ } else { -+ print "No PCIe AER errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # extlog errors -- $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($etype, $severity, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $etype_string = get_extlog_type($etype); -- $severity_string = get_extlog_severity($severity); -- $out .= "\t$count $etype_string $severity_string errors\n"; -- } -- if ($out ne "") { -- print "Extlog records summary:\n$out"; -- } else { -- print "No Extlog errors.\n\n"; -+ if ($has_extlog == 1) { -+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($etype, $severity, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $etype_string = get_extlog_type($etype); -+ $severity_string = get_extlog_severity($severity); -+ $out .= "\t$count $etype_string $severity_string errors\n"; -+ } -+ if ($out ne "") { -+ print "Extlog records summary:\n$out"; -+ } else { -+ print "No Extlog errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # devlink errors -- $query = "select dev_name, count(*) from devlink_event group by dev_name"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($dev_name, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$dev_name has $count errors\n"; -- } -- if ($out ne "") { -- print "Devlink records summary:\n$out"; -- } else { -- print "No devlink errors.\n"; -+ if ($has_devlink == 1) { 
-+ $query = "select dev_name, count(*) from devlink_event group by dev_name"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($dev_name, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$dev_name has $count errors\n"; -+ } -+ if ($out ne "") { -+ print "Devlink records summary:\n$out"; -+ } else { -+ print "No devlink errors.\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # Disk errors -- $query = "select dev, count(*) from disk_errors group by dev"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($dev, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$dev has $count errors\n"; -- } -- if ($out ne "") { -- print "Disk errors summary:\n$out"; -- } else { -- print "No disk errors.\n"; -+ if ($has_disk_errors == 1) { -+ $query = "select dev, count(*) from disk_errors group by dev"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($dev, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$dev has $count errors\n"; -+ } -+ if ($out ne "") { -+ print "Disk errors summary:\n$out"; -+ } else { -+ print "No disk errors.\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # MCE mce_record errors -- $query = "select error_msg, count(*) from mce_record group by error_msg"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($msg, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count $msg errors\n"; -- } -- if ($out ne "") { -- print "MCE records summary:\n$out"; -- } else { -- print "No MCE errors.\n"; -+ if ($has_mce == 1) { -+ $query = "select error_msg, count(*) from mce_record group by error_msg"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($msg, $count)); -+ $out = 
""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count $msg errors\n"; -+ } -+ if ($out ne "") { -+ print "MCE records summary:\n$out"; -+ } else { -+ print "No MCE errors.\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - undef($dbh); - } -@@ -1259,128 +1281,138 @@ sub errors - $query_handle->finish; - - # PCIe AER aer_event errors -- $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $devname, $type, $msg)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $time $devname $type error: $msg\n"; -- } -- if ($out ne "") { -- print "PCIe AER events:\n$out\n"; -- } else { -- print "No PCIe AER errors.\n\n"; -+ if ($has_aer == 1) { -+ $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $time, $devname, $type, $msg)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $time $devname $type error: $msg\n"; -+ } -+ if ($out ne "") { -+ print "PCIe AER events:\n$out\n"; -+ } else { -+ print "No PCIe AER errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # Extlog errors -- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data)); -- $out = ""; -- while($query_handle->fetch()) { -- $etype_string = get_extlog_type($etype); -- $severity_string = get_extlog_severity($severity); -- $out .= "$id $timestamp error: "; -- $out .= "type=$etype_string, "; -- $out .= "severity=$severity_string, "; -- $out .= sprintf "address=0x%08x, ", $addr; -- $out .= sprintf "fru_id=%s, ", 
get_uuid_le($fru_id); -- $out .= "fru_text='$fru_text', "; -- $out .= get_cper_data_text($cper_data) if ($cper_data); -- $out .= "\n"; -- } -- if ($out ne "") { -- print "Extlog events:\n$out\n"; -- } else { -- print "No Extlog errors.\n\n"; -+ if ($has_extlog == 1) { -+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $etype_string = get_extlog_type($etype); -+ $severity_string = get_extlog_severity($severity); -+ $out .= "$id $timestamp error: "; -+ $out .= "type=$etype_string, "; -+ $out .= "severity=$severity_string, "; -+ $out .= sprintf "address=0x%08x, ", $addr; -+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id); -+ $out .= "fru_text='$fru_text', "; -+ $out .= get_cper_data_text($cper_data) if ($cper_data); -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "Extlog events:\n$out\n"; -+ } else { -+ print "No Extlog errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # devlink errors -- $query = "select id, timestamp, bus_name, dev_name, driver_name, reporter_name, msg from devlink_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $bus_name, $dev_name, $driver_name, $reporter_name, $msg)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $timestamp error: "; -- $out .= "bus_name=$bus_name, "; -- $out .= "dev_name=$dev_name, "; -- $out .= "driver_name=$driver_name, "; -- $out .= "reporter_name=$reporter_name, "; -- $out .= "message='$msg', "; -- $out .= "\n"; -- } -- if ($out ne "") { -- print "Devlink events:\n$out\n"; -- } else { -- print "No devlink errors.\n\n"; -+ if ($has_devlink == 1) { -+ $query = "select id, timestamp, 
bus_name, dev_name, driver_name, reporter_name, msg from devlink_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $bus_name, $dev_name, $driver_name, $reporter_name, $msg)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "bus_name=$bus_name, "; -+ $out .= "dev_name=$dev_name, "; -+ $out .= "driver_name=$driver_name, "; -+ $out .= "reporter_name=$reporter_name, "; -+ $out .= "message='$msg', "; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "Devlink events:\n$out\n"; -+ } else { -+ print "No devlink errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # Disk errors -- $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $dev, $sector, $nr_sector, $error, $rwbs, $cmd)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $timestamp error: "; -- $out .= "dev=$dev, "; -- $out .= "sector=$sector, "; -- $out .= "nr_sector=$nr_sector, "; -- $out .= "error='$error', "; -- $out .= "rwbs='$rwbs', "; -- $out .= "cmd='$cmd', "; -- $out .= "\n"; -- } -- if ($out ne "") { -- print "Disk errors\n$out\n"; -- } else { -- print "No disk errors.\n\n"; -+ if ($has_disk_errors == 1) { -+ $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $dev, $sector, $nr_sector, $error, $rwbs, $cmd)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "dev=$dev, "; -+ $out .= "sector=$sector, "; -+ $out .= "nr_sector=$nr_sector, "; -+ $out .= "error='$error', "; -+ $out .= "rwbs='$rwbs', "; -+ $out .= "cmd='$cmd', "; -+ $out .= "\n"; 
-+ } -+ if ($out ne "") { -+ print "Disk errors\n$out\n"; -+ } else { -+ print "No disk errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # MCE mce_record errors -- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $time error: $msg"; -- $out .= ", CPU $cpuvendor" if ($cpuvendor); -- $out .= ", bank $bank_name" if ($bank_name); -- $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); -- $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); -- $out .= ", $mc_location" if ($mc_location); -- $out .= ", $user_action" if ($user_action); -- $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); -- $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus); -- $out .= sprintf ", status=0x%08x", $status if ($status); -- $out .= sprintf ", addr=0x%08x", $addr if ($addr); -- $out .= sprintf ", misc=0x%08x", $misc if ($misc); -- $out .= sprintf ", ip=0x%08x", $ip if ($ip); -- $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc); -- $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime); -- $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu); -- $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid); -- $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid); -- $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid); -- $out .= sprintf ", cs=0x%08x", $cs if ($cs); -- $out .= sprintf ", bank=0x%08x", $bank if ($bank); -- -- $out .= "\n"; -- } -- if ($out ne "") { -- print 
"MCE events:\n$out\n"; -- } else { -- print "No MCE errors.\n\n"; -+ if ($has_mce == 1) { -+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $time error: $msg"; -+ $out .= ", CPU $cpuvendor" if ($cpuvendor); -+ $out .= ", bank $bank_name" if ($bank_name); -+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); -+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); -+ $out .= ", $mc_location" if ($mc_location); -+ $out .= ", $user_action" if ($user_action); -+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); -+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus); -+ $out .= sprintf ", status=0x%08x", $status if ($status); -+ $out .= sprintf ", addr=0x%08x", $addr if ($addr); -+ $out .= sprintf ", misc=0x%08x", $misc if ($misc); -+ $out .= sprintf ", ip=0x%08x", $ip if ($ip); -+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc); -+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime); -+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu); -+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid); -+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid); -+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid); -+ $out .= sprintf ", cs=0x%08x", $cs if ($cs); -+ $out .= sprintf ", bank=0x%08x", $bank if ($bank); -+ -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "MCE events:\n$out\n"; -+ } else { -+ print "No MCE errors.\n\n"; -+ } -+ $query_handle->finish; - } 
-- $query_handle->finish; - - undef($dbh); - } --- -2.27.0 - diff --git a/backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch b/backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch new file mode 100644 index 0000000000000000000000000000000000000000..b4ba376f865f15e0bab4a52f2444f8a1367954ac --- /dev/null +++ b/backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch @@ -0,0 +1,37 @@ +From 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4 Mon Sep 17 00:00:00 2001 +From: Matt Whitlock +Date: Wed, 9 Jun 2021 10:25:18 -0400 +Subject: [PATCH] configure.ac: fix SYSCONFDEFDIR default value + +configure.ac was using AC_ARG_WITH incorrectly, yielding a generated configure script like: + + # Check whether --with-sysconfdefdir was given. + if test "${with_sysconfdefdir+set}" = set; then : + withval=$with_sysconfdefdir; SYSCONFDEFDIR=$withval + else + "/etc/sysconfig" + fi + +This commit fixes the default case so that the SYSCONFDEFDIR variable is assigned the value "/etc/sysconfig" rather than trying to execute "/etc/sysconfig" as a command. 
+ +Signed-off-by: Mauro Carvalho Chehab +--- + configure.ac | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/configure.ac b/configure.ac +index f7d1947..33b81fe 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -172,7 +172,7 @@ AC_SUBST([RASSTATEDIR]) + AC_ARG_WITH(sysconfdefdir, + AC_HELP_STRING([--with-sysconfdefdir=DIR], [rasdaemon environment file dir]), + [SYSCONFDEFDIR=$withval], +- ["/etc/sysconfig"]) ++ [SYSCONFDEFDIR=/etc/sysconfig]) + AC_SUBST([SYSCONFDEFDIR]) + + AC_DEFINE([RAS_DB_FNAME], ["ras-mc_event.db"], [ras events database]) +-- +2.27.0 + diff --git a/backport-ras-mc-ctl-PCIe-AER-display-PCIe-dev-name.patch b/backport-ras-mc-ctl-PCIe-AER-display-PCIe-dev-name.patch deleted file mode 100644 index 2a89729d784b05da1c88af22bc50c92c6185ad71..0000000000000000000000000000000000000000 --- a/backport-ras-mc-ctl-PCIe-AER-display-PCIe-dev-name.patch +++ /dev/null @@ -1,52 +0,0 @@ -From 059a901e97f4091e31c50ce55027daf707638f8d Mon Sep 17 00:00:00 2001 -From: dann frazier -Date: Tue, 21 Apr 2020 15:56:04 -0600 -Subject: [PATCH] ras-mc-ctl: PCIe AER: display PCIe dev name - -Storage of PCIe dev name was added in commit 8e96ca2c1c59 ("rasdaemon: -store PCIe dev name and TLP header for the aer event"). 
This makes -ras-mc-ctl extract and emit it like so: - -PCIe AER events: -1 2020-04-16 22:09:48 +0000 0000:0b:00.0 Corrected error: Receiver Error -2 2020-04-16 22:23:24 +0000 0000:0b:00.0 Corrected error: Receiver Error -3 2020-04-17 23:00:37 +0000 0000:d9:01.0 Corrected error: Advisory Non-Fatal, BIT15 -4 2020-04-17 23:21:52 +0000 0000:d9:01.0 Corrected error: Advisory Non-Fatal -5 2020-04-18 02:04:24 +0000 0000:5e:00.0 Corrected error: Receiver Error - -Signed-off-by: Dann Frazier -Tested-by: Shiju Jose ---- - util/ras-mc-ctl.in | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 8d6d866..665a042 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1230,7 +1230,7 @@ sub summary - sub errors - { - require DBI; -- my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); -+ my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); - my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location); - my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); - my ($bus_name, $dev_name, $driver_name, $reporter_name); -@@ -1259,13 +1259,13 @@ sub errors - $query_handle->finish; - - # PCIe AER aer_event errors -- $query = "select id, timestamp, err_type, err_msg from aer_event order by id"; -+ $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $type, $msg)); -+ $query_handle->bind_columns(\($id, $time, $devname, $type, $msg)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id $time $type error: 
$msg\n"; -+ $out .= "$id $time $devname $type error: $msg\n"; - } - if ($out ne "") { - print "PCIe AER events:\n$out\n"; diff --git a/backport-rasdaemon-Fix-error-print.patch b/backport-rasdaemon-Fix-error-print.patch deleted file mode 100644 index 6e315ba2a9e4d154ac6842c1e76f200eef9bf9b3..0000000000000000000000000000000000000000 --- a/backport-rasdaemon-Fix-error-print.patch +++ /dev/null @@ -1,29 +0,0 @@ -From 00115dda854f4a50681ccc6c017daa991234411b Mon Sep 17 00:00:00 2001 -From: Liguang Zhang -Date: Mon, 10 Aug 2020 11:07:43 +0800 -Subject: [PATCH] rasdaemon: Fix error print - -Fix error print handle_ras_events. - -Signed-off-by: Liguang Zhang -Signed-off-by: Mauro Carvalho Chehab ---- - ras-events.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/ras-events.c b/ras-events.c -index a99fd29..c797b20 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -874,7 +874,7 @@ int handle_ras_events(int record_events) - num_events++; - } else - log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", -- "ras", "aer_event"); -+ "ras", "extlog_mem_event"); - #endif - - #ifdef HAVE_DEVLINK --- -2.18.4 - diff --git a/backport-rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch b/backport-rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch new file mode 100644 index 0000000000000000000000000000000000000000..d9331e5ef3b0227ec50768c30a1eec8db7b93593 --- /dev/null +++ b/backport-rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch @@ -0,0 +1,56 @@ +From 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b Mon Sep 17 00:00:00 2001 +From: Muralidhara M K +Date: Tue, 27 Jul 2021 06:36:45 -0500 +Subject: [PATCH] rasdaemon: ras-mc-ctl: Fix script to parse dimm sizes + +Removes trailing spaces at the end of a line from +file location and fixes --layout option to parse dimm nodes +to get the size of each dimm from ras-mc-ctl. 
+ +Issue is reported https://github.com/mchehab/rasdaemon/issues/43 +Where '> ras-mc-ctl --layout' reports all 0s + +With this change the layout option prints the correct dimm sizes +> sudo ras-mc-ctl --layout + +-----------------------------------------------+ + | mc0 | + | csrow0 | csrow1 | csrow2 | csrow3 | +----------+-----------------------------------------------+ +... +channel7: | 16384 MB | 0 MB | 0 MB | 0 MB | +channel6: | 16384 MB | 0 MB | 0 MB | 0 MB | +... +----------+-----------------------------------------------+ + +Signed-off-by: Muralidhara M K +Signed-off-by: Naveen Krishna Chatradhi +Cc: Yazen Ghannam +Signed-off-by: Mauro Carvalho Chehab +Link: https://lkml.kernel.org/r/20210810183855.129076-1-nchatrad@amd.com/ +--- + util/ras-mc-ctl.in | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 1e3aeb7..b22dd60 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -246,6 +246,7 @@ sub parse_dimm_nodes + if (($file =~ /max_location$/)) { + open IN, $file; + my $location = ; ++ $location =~ s/\s+$//; + close IN; + my @temp = split(/ /, $location); + +@@ -288,6 +289,7 @@ sub parse_dimm_nodes + + open IN, $file; + my $location = ; ++ $location =~ s/\s+$//; + close IN; + + my @pos; +-- +2.27.0 + diff --git a/backport-rasdaemon-ras-memory-failure-handler-handle-localtim.patch b/backport-rasdaemon-ras-memory-failure-handler-handle-localtim.patch new file mode 100644 index 0000000000000000000000000000000000000000..ed749fedac8585c6fa45a91f55c534ffdaa08534 --- /dev/null +++ b/backport-rasdaemon-ras-memory-failure-handler-handle-localtim.patch @@ -0,0 +1,34 @@ +From ce33041e0abfa20054ff5d6874ffbd1ab592558d Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Thu, 19 Jan 2023 08:45:57 -0500 +Subject: [PATCH] rasdaemon: ras-memory-failure-handler: handle localtime() + failure correctly + +We could just have an empty string but keeping the format could prevent +issues if someone is actually parsing 
this. +Found with covscan. + +v2: fixed the timestamp as pointed by Robert Elliott + +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + ras-memory-failure-handler.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 9941e68..1951456 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -148,6 +148,8 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + if (tm) + strftime(ev.timestamp, sizeof(ev.timestamp), + "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); + trace_seq_printf(s, "%s ", ev.timestamp); + + if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0) +-- +2.27.0 + diff --git a/backport-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch b/backport-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch new file mode 100644 index 0000000000000000000000000000000000000000..4c7953cda6346928af1674bd59c795a798f91800 --- /dev/null +++ b/backport-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch @@ -0,0 +1,93 @@ +From 899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Thu, 19 Jan 2023 08:45:57 -0500 +Subject: [PATCH] rasdaemon: ras-report: fix possible but unlikely file + descriptor leak + +Found with covscan. 
+ +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + ras-report.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/ras-report.c b/ras-report.c +index ea3a9b6..62d5eb7 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -434,7 +434,7 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){ + + mc_fail: + +- if(sockfd > 0){ ++ if(sockfd >= 0){ + close(sockfd); + } + +@@ -484,7 +484,7 @@ int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){ + + aer_fail: + +- if(sockfd > 0){ ++ if(sockfd >= 0){ + close(sockfd); + } + +@@ -533,7 +533,7 @@ int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standar + + non_standard_fail: + +- if(sockfd > 0){ ++ if(sockfd >= 0){ + close(sockfd); + } + +@@ -578,7 +578,7 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev){ + + arm_fail: + +- if(sockfd > 0){ ++ if(sockfd >= 0){ + close(sockfd); + } + +@@ -624,7 +624,7 @@ int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){ + + mce_fail: + +- if(sockfd > 0){ ++ if(sockfd >= 0){ + close(sockfd); + } + +@@ -674,7 +674,7 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev){ + + devlink_fail: + +- if(sockfd > 0){ ++ if(sockfd >= 0){ + close(sockfd); + } + +@@ -723,7 +723,7 @@ int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *e + done = 1; + + diskerror_fail: +- if(sockfd > 0){ ++ if(sockfd >= 0){ + close(sockfd); + } + +@@ -768,7 +768,7 @@ int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) + done = 1; + + mf_fail: +- if (sockfd > 0) ++ if (sockfd >= 0) + close(sockfd); + + if (done) +-- +2.27.0 + diff --git a/bugfix-fix-disk-error-log-storm.patch b/bugfix-fix-disk-error-log-storm.patch index 8241cfcb7e5199ab323c9c478b4e671e3283da13..5df02999047c4418daf3afa89c34dcba46679996 100644 --- a/bugfix-fix-disk-error-log-storm.patch +++ 
b/bugfix-fix-disk-error-log-storm.patch @@ -15,7 +15,7 @@ index e73a08a..04a0489 100644 @@ -4,7 +4,7 @@ After=syslog.target [Service] - EnvironmentFile=/etc/sysconfig/rasdaemon + EnvironmentFile=@SYSCONFDEFDIR@/rasdaemon -ExecStart=@sbindir@/rasdaemon -f -r +ExecStart=@sbindir@/rasdaemon -f ExecStartPost=@sbindir@/rasdaemon --enable diff --git a/bugfix-fix-where-local-variables-are-not-initialized.patch b/bugfix-fix-where-local-variables-are-not-initialized.patch deleted file mode 100644 index 43afc9e6fcb65dde99866d630bb33bbea9a9fb97..0000000000000000000000000000000000000000 --- a/bugfix-fix-where-local-variables-are-not-initialized.patch +++ /dev/null @@ -1,34 +0,0 @@ -From fd8c8d1f66a9058a27c2d1fbfb11225499abebb1 Mon Sep 17 00:00:00 2001 -From: Lostwayzxc -Date: Wed, 15 Dec 2021 12:54:41 +0800 -Subject: [PATCH] fix where local variables are not initialized - ---- - ras-cpu-isolation.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index bca7e0b..acef1ad 100644 ---- a/ras-cpu-isolation.c -+++ b/ras-cpu-isolation.c -@@ -112,6 +112,8 @@ static int init_cpu_info(unsigned cpus) - } - - for (unsigned int i = 0; i < cpus; ++i) { -+ cpu_infos[i].ce_nums = 0; -+ cpu_infos[i].uce_nums = 0; - cpu_infos[i].state = get_cpu_status(i); - cpu_infos[i].ce_queue = init_queue(); - if (cpu_infos[i].ce_queue == NULL) { -@@ -384,6 +386,8 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) - log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n", - cpu, cpu_state[cpu_infos[cpu].state]); - clear_queue(cpu_infos[cpu].ce_queue); -+ cpu_infos[cpu].ce_nums = 0; -+ cpu_infos[cpu].uce_nums = 0; - } - else { - log(TERM, LOG_INFO, "Offline cpu%d fail, the state is %s\n", --- -2.27.0 - diff --git a/bugfix-modify-the-way-counting-cpu-logical-index.patch b/bugfix-modify-the-way-counting-cpu-logical-index.patch deleted file mode 100644 index 
bd6cd441100075474b3a30d3275025f1cbf99511..0000000000000000000000000000000000000000 --- a/bugfix-modify-the-way-counting-cpu-logical-index.patch +++ /dev/null @@ -1,234 +0,0 @@ -From b82767ec717976223134d4e279f874352e7910c9 Mon Sep 17 00:00:00 2001 -From: Lostwayzxc -Date: Wed, 24 Nov 2021 09:43:52 +0800 -Subject: [PATCH] modify the way counting cpu logical index - -It's hard to count cpu logical index according to the mpidr in the userspace, -so the index will be counted in the kernel before reported to userspace now. - -Related patches: -0006-add-cpu-online-fault-isolation.patch -0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch - ---- - ras-arm-handler.c | 8 ++- - ras-cpu-isolation.c | 127 ++------------------------------------------ - ras-cpu-isolation.h | 6 +-- - 3 files changed, 11 insertions(+), 130 deletions(-) - -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 8a229b4..47f9a57 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -124,6 +124,12 @@ int ras_arm_event_handler(struct trace_seq *s, - trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); - - #ifdef HAVE_CPU_FAULT_ISOLATION -+ int cpu; -+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0) -+ return -1; -+ cpu = val; -+ trace_seq_printf(s, "\n cpu: %d", cpu); -+ - /* record cpu error */ - if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) - return -1; -@@ -156,7 +162,7 @@ int ras_arm_event_handler(struct trace_seq *s, - nums = count_errors(event, ev.error_info, len); - if (nums > 0) { - struct error_info err_info = {nums, now, val}; -- ras_record_cpu_error(&err_info, ev.mpidr); -+ ras_record_cpu_error(&err_info, cpu); - } - } - #endif -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index b1643c4..bca7e0b 100644 ---- a/ras-cpu-isolation.c -+++ b/ras-cpu-isolation.c -@@ -24,13 +24,9 @@ - #include "ras-cpu-isolation.h" - - static struct cpu_info *cpu_infos = NULL; --static unsigned int ncores, cores_per_socket, cores_per_die; --static 
unsigned int cores_per_cluster = 4; --static unsigned int sockets, dies = 1; -+static unsigned int ncores; - static unsigned int enabled = 1; - static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; --static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; --static const char *node_path = "/sys/devices/system/node/possible"; - - static const struct param normal_units[] = { - { "", 1 }, -@@ -86,69 +82,6 @@ static int open_sys_file(unsigned cpu, int __oflag, const char *format) - return fd; - } - --static int get_sockets(void) --{ -- int fd, j; -- char buf[MAX_BUF_LEN] = ""; -- cores_per_socket = ncores; -- struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores); -- -- if (!cpu_sets) { -- log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__); -- return -1; -- } -- -- for (int i = 0; i < ncores; ++i) { -- fd = open_sys_file(i, O_RDONLY, core_siblings_list_path); -- if (fd == -1) { -- continue; -- } -- memset(buf, '\0', strlen(buf)); -- if (read(fd, buf, sizeof(buf)) <= 0) { -- close(fd); -- continue; -- } -- for (j = 0; j < sockets; ++j) { -- if (strcmp(cpu_sets[j].buf, buf) == 0) { -- break; -- } -- } -- if (j == sockets) { -- strcpy(cpu_sets[sockets].buf, buf); -- sockets++; -- } -- close(fd); -- } -- -- free(cpu_sets); -- cores_per_socket = sockets > 0 ? ncores / sockets : ncores; -- -- return 0; --} -- --static int get_dies(void) --{ -- int fd, begin, end; -- char buf[20] = ""; -- cores_per_die = ncores; -- fd = open(node_path, O_RDONLY); -- -- if (fd == -1) { -- return -1; -- } -- -- if (read(fd, buf, sizeof(buf))) { -- if (sscanf(buf, "%d-%d", &begin, &end) == 2) { -- dies = end > begin ? 
end - begin + 1 : 1; -- } -- } -- -- close(fd); -- cores_per_die = ncores / dies; -- -- return 0; --} -- - static int get_cpu_status(unsigned cpu) - { - int fd, num; -@@ -190,11 +123,6 @@ static int init_cpu_info(unsigned cpus) - cpu_limit.limit = cpus - 1; - cpu_limit.value = 0; - -- if (get_sockets() < 0 || get_dies() < 0) { -- log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n"); -- return -1; -- } -- - return 0; - } - -@@ -418,64 +346,15 @@ static void record_error_info(unsigned cpu, struct error_info *err_info) - } - } - --static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size) -+void ras_record_cpu_error(struct error_info *err_info, int cpu) - { -- value >>= offset; -- unsigned long res = 0; -- int i = 0; -- -- while (i < size) { -- res |= (value & (0x1 << (i++))); -- } -- -- return res; --} -- --static unsigned get_cpu_index(int64_t mpidr) --{ -- unsigned core_id, cluster_id, socket_id, die_id, cpu; -- /* -- * Adapt to certain BIOS -- * In the MPIDR: -- * bit 8:15: core id -- * bit 16:18: cluster id -- * bit 19:20: die_id -- * bit 21:22: socket_id -- */ -- core_id = get_bit_value(mpidr, 8, 8); -- cluster_id = get_bit_value(mpidr, 16, 3); -- socket_id = get_bit_value(mpidr, 21, 2); -- die_id = get_bit_value(mpidr, 19, 2); -- -- /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3, -- * it means TotemB. When cores per die equal to cores per socket, it means -- * that there is only one die in the socket, in case that the only die is -- * TotemB in CPU 1620s, we set die id to 0 directly. -- */ -- if (cores_per_die == cores_per_socket) { -- die_id = 0; -- } -- else { -- die_id = (die_id == 1 ? 
0:1); -- } -- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die + -- cluster_id * cores_per_cluster; -- -- return cpu; --} -- --void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr) --{ -- unsigned cpu; - int ret; - - if (enabled == 0) { - return; - } - -- cpu = get_cpu_index(mpidr); -- -- if (cpu >= ncores) { -+ if (cpu >= ncores || cpu < 0) { - log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); - return; - } -diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h -index a7d3fdb..95dedc1 100644 ---- a/ras-cpu-isolation.h -+++ b/ras-cpu-isolation.h -@@ -65,12 +65,8 @@ struct error_info { - enum error_type err_type; - }; - --struct cpu_set { -- char buf[MAX_BUF_LEN]; --}; -- - void ras_error_count_init(unsigned cpus); --void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); -+void ras_record_cpu_error(struct error_info *err_info, int cpu); - void cpu_infos_free(void); - - #endif -\ No newline at end of file --- -2.27.0 - diff --git a/bugfix-ras-events-memory-leak.patch b/bugfix-ras-events-memory-leak.patch deleted file mode 100644 index 977459a356d82fbfd29f4486038a5f9e685b2c43..0000000000000000000000000000000000000000 --- a/bugfix-ras-events-memory-leak.patch +++ /dev/null @@ -1,18 +0,0 @@ -From d59e4d224b3271cf7a7fe53cd7c5d539b58eac32 Mon Sep 17 00:00:00 2001 -From: lvying -Date: Sat, 26 Jan 2019 15:54:17 +0800 -Subject: [PATCH] rasdaemon:fix ras events memory leak - -reason:fix ras events memory leak - -diff -uprN a/ras-events.c b/ras-events.c ---- a/ras-events.c 2018-06-22 14:20:42.880878700 +0800 -+++ b/ras-events.c 2018-06-22 14:38:24.420726900 +0800 -@@ -314,6 +314,7 @@ static void parse_ras_data(struct pthrea - trace_seq_init(&s); - pevent_print_event(pdata->ras->pevent, &s, &record); - trace_seq_do_printf(&s); -+ trace_seq_destroy(&s); - printf("\n"); - fflush(stdout); - } diff --git a/fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch 
b/fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch new file mode 100644 index 0000000000000000000000000000000000000000..37e88dfac635ea9a8bbe692b872cc66df074ae12 --- /dev/null +++ b/fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch @@ -0,0 +1,41 @@ +From d439975850f947ced01423dc4bb4d6406022b4e1 Mon Sep 17 00:00:00 2001 +From: hubin +Date: Thu, 18 May 2023 16:14:41 +0800 +Subject: [PATCH] ras-events: quit loop in read_ras_event when kbuf data is + broken + +when kbuf data is broken, kbuffer_next_event() may move kbuf->index back to +the current kbuf->index position, causing dead loop. + +In this situation, rasdaemon will repeatedly parse an invalid event, and +print warning like "ug! negative record size -8!", pushing cpu utilization +rate to 100%. + +when kbuf data is broken, discard current page and continue reading next page +kbuf. + +Signed-off-by: hubin +--- + ras-events.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/ras-events.c b/ras-events.c +index 1479732..11ecb4d 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -498,6 +498,11 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + kbuffer_load_subbuffer(kbuf, page); + + while ((data = kbuffer_read_event(kbuf, &time_stamp))) { ++ if (kbuffer_curr_size(kbuf) < 0) { ++ log(TERM, LOG_ERR, "invalid kbuf data, discard\n"); ++ break; ++ } ++ + parse_ras_data(&pdata[i], + kbuf, data, time_stamp); + +-- +2.33.0 + + diff --git a/fix-ras-mc-ctl.service-startup-failed-when-selinux-is-no.patch b/fix-ras-mc-ctl.service-startup-failed-when-selinux-is-no.patch new file mode 100644 index 0000000000000000000000000000000000000000..45abe97882231d6c11df1baab9c902eaa3ab18b4 --- /dev/null +++ b/fix-ras-mc-ctl.service-startup-failed-when-selinux-is-no.patch @@ -0,0 +1,25 @@ +From fd9341f5f7f3896c4de2a9a90d7dc366fd2ffedc Mon Sep 17 00:00:00 2001 +From: shixuantong +Date: Thu, 1 Dec 2022 12:39:11 +0000 +Subject: [PATCH] fix ras-mc-ctl.service startup failed when selinux is on + 
+--- + util/ras-mc-ctl.in | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 9198a23..888b4e8 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -39,7 +39,7 @@ my $dbname = "@RASSTATEDIR@/@RAS_DB_FNAME@"; + my $prefix = "@prefix@"; + my $sysconfdir = "@sysconfdir@"; + my $dmidecode = find_prog ("dmidecode"); +-my $modprobe = find_prog ("modprobe") or exit (1); ++my $modprobe = find_prog ("modprobe"); + + my $has_aer = 0; + my $has_arm = 0; +-- +2.33.0 + diff --git a/rasdaemon-0.6.6.tar.gz b/rasdaemon-0.6.6.tar.gz deleted file mode 100644 index ea4552e542487c2c4d2e870b222aca8097d8df7c..0000000000000000000000000000000000000000 Binary files a/rasdaemon-0.6.6.tar.gz and /dev/null differ diff --git a/rasdaemon-0.6.7.tar.gz b/rasdaemon-0.6.7.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..553577b805824808efe0b1649b8a328960ad7dbd Binary files /dev/null and b/rasdaemon-0.6.7.tar.gz differ diff --git a/rasdaemon-diskerror-fix-incomplete-diskerror-log.patch b/rasdaemon-diskerror-fix-incomplete-diskerror-log.patch new file mode 100644 index 0000000000000000000000000000000000000000..16954f421f2f24ac708b34692986defc3023c47d --- /dev/null +++ b/rasdaemon-diskerror-fix-incomplete-diskerror-log.patch @@ -0,0 +1,62 @@ +From be5ea839fd52453f01ceb131813fb2e6919684ab Mon Sep 17 00:00:00 2001 +From: Lv Ying +Date: Thu, 15 Dec 2022 21:01:59 +0800 +Subject: [PATCH] rasdaemon/diskerror: fix incomplete diskerror log + +Currently, rasdaemon output incomplete diskerror log(only contains timestamp): +-0 [000] 0.017915: block_rq_complete: 2022-12-16 04:17:32 +0800 + +Fix incomplete diskerror log just like block_rq_complete tracepoint output format: +-0 [042] d.h. 
177962.715669: block_rq_complete: 21,0 N () 18446744073709551615 + 0 [-121] +--- + ras-diskerror-handler.c | 22 ++++++++++++++-------- + 1 file changed, 14 insertions(+), 8 deletions(-) + +diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c +index b16319f..0a6e315 100644 +--- a/ras-diskerror-handler.c ++++ b/ras-diskerror-handler.c +@@ -97,26 +97,32 @@ int ras_diskerror_event_handler(struct trace_seq *s, + dev = (dev_t)val; + if (asprintf(&ev.dev, "%u:%u", major(dev), minor(dev)) < 0) + return -1; ++ trace_seq_printf(s, "%s ", ev.dev); ++ ++ ev.rwbs = pevent_get_field_raw(s, event, "rwbs", record, &len, 1); ++ if (!ev.rwbs) ++ return -1; ++ trace_seq_printf(s, "%s ", ev.rwbs); ++ ++ ev.cmd = pevent_get_field_raw(s, event, "cmd", record, &len, 1); ++ if (!ev.cmd) ++ return -1; ++ trace_seq_printf(s, "(%s) ", ev.cmd); + + if (pevent_get_field_val(s, event, "sector", record, &val, 1) < 0) + return -1; + ev.sector = val; ++ trace_seq_printf(s, "%llu ", ev.sector); + + if (pevent_get_field_val(s, event, "nr_sector", record, &val, 1) < 0) + return -1; + ev.nr_sector = (unsigned int)val; ++ trace_seq_printf(s, "+ %u ", ev.nr_sector); + + if (pevent_get_field_val(s, event, "error", record, &val, 1) < 0) + return -1; + ev.error = get_blk_error((int)val); +- +- ev.rwbs = pevent_get_field_raw(s, event, "rwbs", record, &len, 1); +- if (!ev.rwbs) +- return -1; +- +- ev.cmd = pevent_get_field_raw(s, event, "cmd", record, &len, 1); +- if (!ev.cmd) +- return -1; ++ trace_seq_printf(s, "[%s]", ev.error); + + /* Insert data into the SGBD */ + #ifdef HAVE_SQLITE3 +-- +2.38.1 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 02fd7dec9c996d08d2d40aec6f1f1d87fdfa1a32..95266a7b5e323e27c5598f4256731498325a94b8 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,10 +1,10 @@ Name: rasdaemon -Version: 0.6.6 -Release: 10 +Version: 0.6.7 +Release: 13 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel 
tracing events URL: https://github.com/mchehab/rasdaemon.git -Source0: %{name}-%{version}.tar.gz +Source0: https://github.com/mchehab/rasdaemon/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz ExcludeArch: s390 s390x BuildRequires: gcc, gettext-devel, perl-generators, sqlite-devel, systemd, git, libtool @@ -19,28 +19,42 @@ Requires(post): systemd Requires(preun): systemd Requires(postun): systemd -Patch1: bugfix-ras-events-memory-leak.patch -Patch2: bugfix-rasdaemon-wait-for-file-access.patch -Patch3: bugfix-fix-fd-check.patch -Patch4: bugfix-fix-disk-error-log-storm.patch -Patch5: backport-rasdaemon-Fix-error-print.patch -Patch6: backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch -Patch7: backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch -Patch8: backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch -Patch9: backport-0001-rasdaemon-Modify-non-standard-error-decoding-interfa.patch -Patch10: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch -Patch11: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch -Patch12: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch -Patch13: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch -Patch14: 0006-add-cpu-online-fault-isolation.patch -Patch15: 0007-add-trace-print-and-add-sqlite-store.patch -Patch16: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch -Patch17: bugfix-modify-the-way-counting-cpu-logical-index.patch -Patch18: bugfix-fix-where-local-variables-are-not-initialized.patch -Patch19: backport-ras-mc-ctl-PCIe-AER-display-PCIe-dev-name.patch -Patch20: backport-Fix-ras-mc-ctl-script.patch +Patch1: bugfix-rasdaemon-wait-for-file-access.patch +Patch2: bugfix-fix-fd-check.patch +Patch3: bugfix-fix-disk-error-log-storm.patch +Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch +Patch5: 0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch +Patch6: 
0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch +Patch7: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch +Patch8: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch +Patch9: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch +Patch10: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch +Patch11: 0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch +Patch12: 0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch +Patch13: 0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch +Patch14: 0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch +Patch15: 0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch +Patch16: 0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch +Patch17: 0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch +Patch18: 0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch +Patch19: 0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch +Patch20: rasdaemon-diskerror-fix-incomplete-diskerror-log.patch Patch21: backport-traceevent-Add-proper-KBUFFER_TYPE_TIME_STAMP-handling.patch +Patch6000: backport-rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch +Patch6001: backport-rasdaemon-ras-memory-failure-handler-handle-localtim.patch +Patch6002: backport-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch + +Patch9000: fix-ras-mc-ctl.service-startup-failed-when-selinux-is-no.patch +Patch9001: 0001-rasdaemon-Fix-for-regression-in-ras_mc_create_table-.patch +Patch9002: 0002-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch +Patch9003: 0001-rasdaemon-fix-return-value-type-issue-of-read-write-.patch +Patch9004: 0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch +Patch9005: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch +Patch9006: 0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch +Patch9007: 
fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch + + + %description The rasdaemon program is a daemon which monitors the platform Reliablity, Availability and Serviceability (RAS) reports from the @@ -58,7 +72,7 @@ autoheader libtoolize --automake --copy --debug --force automake --add-missing %ifarch %{arm} aarch64 -%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation %else %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror %endif @@ -78,7 +92,6 @@ rm INSTALL %{buildroot}/usr/include/*.h %{_sbindir}/ras-mc-ctl %{_mandir}/*/* %{_unitdir}/*.service -%{_sharedstatedir}/rasdaemon %{_sysconfdir}/ras/dimm_labels.d %config(noreplace) %{_sysconfdir}/sysconfig/%{name} @@ -86,69 +99,142 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || : %changelog -* Sat Jun 17 2023 yanglongkang - 0.6.6-10 +* Tue Jun 20 2023 zhangnan - 0.6.7-13 +- Type:bugfix +- ID:NA +- SUG:NA +- DESC:ras-events:quit loop in read_ras_event when kbuf data is broken + +* Sat Jun 17 2023 yanglongkang - 0.6.7-12 - Type:bugfix - ID:NA - SUG:NA - DESC:backport libtraceevent patch to adapt to kernel ftrace ring buffer change -* Tue Mar 21 2023 shixuantong - 0.6.6-9 +* Fri Jun 2 2023 Shiju Jose - 0.6.7-11 - Type:bugfix - ID:NA - SUG:NA -- DESC:Fix ras-mc-ctl script +- DESC: + 1. Fix return value type issue of read/write function from unistd.h. + 2. Fix issue of signed and unsigned integer comparison. + 3. Remove redundant header file and do some cleanup. + 4.
Add support for create/open the vendor error tables at rasdaemon startup. + 5. Make changes in the HiSilicon error handling code for the same. + 6. Add four modules supported by HiSilicon common section. + +* Tue Apr 4 2023 huangfangrun - 0.6.7-10 +- Type:bugfix +- ID:NA +- SUG:NA +- DESC: + 1.Fix for regression in ras_mc_create_table() if some cpus are offline at the system start. + 2.Fix poll() on per_cpu trace_pipe_raw blocks indefinitely. -* Wed Dec 15 2021 luoshengwei - 0.6.6-8 +* Wed Mar 29 2023 Lv Ying - 0.6.7-9 - Type:bugfix - ID:NA - SUG:NA -- DESC: Add initialization to some local variables when they are cleaned -- or defined. +- DESC:fix ras-mc-ctl.service startup failed when selinux is on -* Wed Dec 1 2021 luoshengwei - 0.6.6-7 +* Thu Mar 23 2023 renhongxun - 0.6.7-8 - Type:bugfix - ID:NA - SUG:NA -- DESC: Since the cpu logical index has been counted in kernel, remove -- related code in ras. +- DESC:backport patches from upstream -* Wed Oct 27 2021 luoshengwei - 0.6.6-6 -- Type:feature +* Thu Feb 16 2023 Lv Ying - 0.6.7-7 +- Type:bugfix - ID:NA - SUG:NA -- DESC: Sync three patches, add cpu online fault isolation. +- DESC:rasdaemon/diskerror: fix incomplete diskerror log -* Wed Oct 20 2021 tanxiaofei - 0.6.6-5 -- Type:Bugfix +* Thu Oct 27 2022 Lei Feng - 0.6.7-6 +- Type:bugfix - ID:NA - SUG:NA -- DESC: Backport one patch, and some little fixes and add some modules - support for kunpeng series: - 1. Modify non-standard error decoding interface using linked list - 2. Fix the issue of sprintf data type mismatch in uuid_le() - 3. Fix the issue of command option -r for hip08 - 4. Fix some print format issues for hisi common error section - 5. Add some modules supported by hisi common error section - -* Sat July 29 2021 tanxiaofei - 0.6.6-4 +- DESC: + Add the following patch to fix startup core dumped issue. 
+ 0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch + +* Mon May 23 2022 Shiju Jose - 0.6.7-5 - Type:feature - ID:NA - SUG:NA -- DESC:Add support for hisilicon common section that some IIO devices may -- used in new firmware of Kunpeng920, and Kunpeng930 will also use it too. - -* Sat May 15 2021 xujing<17826839720@163.com> - 0.6.6-3 +- DESC: + Update with the latest patches for the + 1. CPU online fault isolation for arm event. + 2. Modify recording Hisilicon common error data in the rasdaemon + 3. In the ras-mc-ctl, + 3.1. Improve Hisilicon common error statistics. + 3.2. Add support to display the HiSilicon vendor-errors for a specified module. + 3.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options. + 3.4. Reformat error info of the HiSilicon Kunpeng920. + 3.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. + 3.6. Updated the HiSilicon platform name as KunPeng9xx. + 4. Fixed a memory out-of-bounds issue in the rasdaemon. + +* Mon Mar 07 2022 Shiju Jose - 0.6.7-4 +- Type:feature +- ID:NA +- SUG:NA +- DESC: + 1. Modify recording Hisilicon common error data in the rasdaemon and + 2. In the ras-mc-ctl, + 2.1. Improve Hisilicon common error statistics. + 2.2. Add support to display the HiSilicon vendor-errors for a specified module. + 2.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options. + 2.4. Reformat error info of the HiSilicon Kunpeng920. + 2.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. + +* Wed Mar 2 2022 tanxiaofei - 0.6.7-3 - Type:bugfix - ID:NA - SUG:NA -- DESC:fix error print in handle_ras_events +- DESC: + 1. Backport 4 patches from openEuler master branch. 
+ 1) Fix the issue of sprintf data type mismatch in uuid_le() + 2) Fix the issue of command option -r for hip08 + 3) Fix some print format issues for hisi common error section + 4) Add some modules supported by hisi common error section + 2.Enable compilation of the feature memory fault prediction based on + corrected error. + 3.Fix changelog date error of this spec file. + +* Wed Feb 23 2022 luoshengwei - 0.6.7-2 +- Type:feature +- ID:NA +- SUG:NA +- DESC: Add cpu online fault isolation for arm event. + +* Wed Dec 8 2021 xujing - 0.6.7-1 +- Update software to v0.6.7 + +* Thu Jul 29 2021 tanxiaofei - 0.6.6-6 +- Type:feature +- ID:NA +- SUG:NA +- DESC:Add support for hisilicon common section that some IIO devices may +- used in new firmware of Kunpeng920, and Kunpeng930 will also use it too. -* Sat May 15 2021 xujing<17826839720@163.com> - 0.6.6-2 +* Sat May 15 2021 xujing<17826839720@163.com> - 0.6.6-5 - Type:bugfix - ID:NA - SUG:NA - DESC:fix disk error log storm +* Wed Apr 28 2021 Lv Ying - 0.6.6-4 +- backport bugfix patches from community: + 1. Fix error print handle_ras_events. + +* Wed Mar 31 2021 Lv Ying - 0.6.6-3 +- backport bugfix patches from community: + 1. ras-page-isolation: do_page_offline always considers page offline was successful + 2. ras-page-isolation: page which is PAGE_OFFLINE_FAILED can be offlined again + +* Fri Sep 25 2020 openEuler Buildteam - 0.6.6-2 +- Update software source URL + * Fri Jul 24 2020 openEuler Buildteam - 0.6.6-1 - Update software to v0.6.6 diff --git a/rasdaemon.yaml b/rasdaemon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cbadd1929fb70cbc049a6216722360cfa593e068 --- /dev/null +++ b/rasdaemon.yaml @@ -0,0 +1,4 @@ +version_control: github +src_repo: mchehab/rasdaemon +tag_prefix: ^v +seperator: . \ No newline at end of file