From d46cbe4559ce6d2497f3073b1672e7d4e74bca88 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Wed, 23 Feb 2022 17:32:41 +0800 Subject: [PATCH] add cpu online fault isolation for arm event (cherry picked from commit ac231c1c3131299c48d46780887ffb469d677de5) --- ...fault-isolation-for-corrected-errors.patch | 906 ++++++++++++++++++ ...ult-isolation-for-recoverable-errors.patch | 138 +++ rasdaemon.spec | 10 +- 3 files changed, 1053 insertions(+), 1 deletion(-) create mode 100644 0001-Support-cpu-fault-isolation-for-corrected-errors.patch create mode 100644 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch diff --git a/0001-Support-cpu-fault-isolation-for-corrected-errors.patch b/0001-Support-cpu-fault-isolation-for-corrected-errors.patch new file mode 100644 index 0000000..d5460de --- /dev/null +++ b/0001-Support-cpu-fault-isolation-for-corrected-errors.patch @@ -0,0 +1,906 @@ +From a8e02e7d3d910eb7d049fd4126d53b8d3121d798 Mon Sep 17 00:00:00 2001 +From: Shengwei Luo +Date: Wed, 23 Feb 2022 17:21:58 +0800 +Subject: [PATCH 1/2] Support cpu fault isolation for corrected errors + +When the corrected errors exceed the set limit in cycle, try to +offline the related cpu core. + +Signed-off-by: Shengwei Luo +--- + Makefile.am | 6 +- + configure.ac | 11 ++ + misc/rasdaemon.env | 17 ++ + queue.c | 121 ++++++++++++++ + queue.h | 39 +++++ + ras-arm-handler.c | 84 ++++++++++ + ras-arm-handler.h | 18 +++ + ras-cpu-isolation.c | 378 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.h | 68 ++++++++ + ras-events.c | 9 +- + 10 files changed, 749 insertions(+), 2 deletions(-) + create mode 100644 queue.c + create mode 100644 queue.h + create mode 100644 ras-cpu-isolation.c + create mode 100644 ras-cpu-isolation.h + +diff --git a/Makefile.am b/Makefile.am +index fabca78..242ceb7 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -63,13 +63,17 @@ endif + if WITH_AMP_NS_DECODE + rasdaemon_SOURCES += non-standard-ampere.c + endif ++if WITH_CPU_FAULT_ISOLATION ++ rasdaemon_SOURCES += ras-cpu-isolation.c queue.c ++endif + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a + + include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ + ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ +- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h ++ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ ++ ras-cpu-isolation.h queue.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 33b81fe..d098fcf 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], + AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"]) + ++AC_ARG_ENABLE([cpu_fault_isolation], ++ AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation])) ++ ++AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_CPU_FAULT_ISOLATION,1,"have cpu online fault isolation") ++ AC_SUBST([WITH_CPU_FAULT_ISOLATION]) ++]) ++AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -201,4 +211,5 @@ compile time options summary + Memory Failure : $USE_MEMORY_FAILURE + Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE ++ CPU fault isolation : $USE_CPU_FAULT_ISOLATION + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 12fd766..7cb18e8 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50" + # soft-then-hard First try to soft offline, then try hard offlining. + # Note: default offline choice is "soft". + PAGE_CE_ACTION="soft" ++ ++# CPU Online Fault Isolation ++# Whether to enable cpu online fault isolation (yes|no). ++CPU_ISOLATION_ENABLE="no" ++# Specify the threshold of CE numbers. ++# ++# Format: ++# [0-9]+[unit] ++# ++# Supported units: ++# CPU_CE_THRESHOLD: no unit ++# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second), default is in second ++CPU_CE_THRESHOLD="18" ++CPU_ISOLATION_CYCLE="24h" ++ ++# Prevent excessive isolation from causing an avalanche effect ++CPU_ISOLATION_LIMIT="10" +\ No newline at end of file +diff --git a/queue.c b/queue.c +new file mode 100644 +index 0000000..ed66798 +--- /dev/null ++++ b/queue.c +@@ -0,0 +1,121 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++#include ++#include ++#include "queue.h" ++#include "ras-logger.h" ++ ++int is_empty(struct link_queue *queue) ++{ ++ if (queue) ++ return queue->size == 0; ++ ++ return 1; ++} ++ ++struct link_queue *init_queue(void) ++{ ++ struct link_queue *queue = NULL; ++ ++ queue = (struct link_queue *)malloc(sizeof(struct link_queue)); ++ ++ if (queue == NULL) { ++ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); ++ return NULL; ++ } ++ ++ queue->size = 0; ++ queue->head = NULL; ++ queue->tail = NULL; ++ ++ return queue; ++} ++ ++void clear_queue(struct link_queue *queue) ++{ ++ if (queue == NULL) ++ return; ++ ++ struct queue_node *node = queue->head; ++ struct queue_node *tmp = NULL; ++ ++ while (node != NULL) { ++ tmp = node; ++ node = node->next; ++ free(tmp); ++ } ++ ++ queue->head = NULL; ++ queue->tail = NULL; ++ queue->size = 0; ++} ++ ++void free_queue(struct link_queue *queue) ++{ ++ clear_queue(queue); ++ ++ if (queue) ++ free(queue); ++} ++ ++/* It should be guranteed that the param is not NULL */ ++void push(struct link_queue *queue, struct queue_node *node) ++{ ++ /* there is no element in the queue */ ++ if (queue->head == NULL) ++ queue->head = node; ++ else ++ queue->tail->next = node; ++ ++ queue->tail = node; ++ (queue->size)++; ++} ++ ++int pop(struct link_queue *queue) ++{ ++ struct queue_node *tmp = NULL; ++ ++ if (queue == NULL || is_empty(queue)) ++ return -1; ++ ++ tmp = queue->head; ++ queue->head = queue->head->next; ++ free(tmp); ++ (queue->size)--; ++ ++ return 0; ++} ++ ++struct queue_node *front(struct link_queue *queue) ++{ ++ if (queue == NULL) ++ return NULL; ++ ++ return queue->head; ++} ++ ++struct queue_node *node_create(time_t time, unsigned int value) ++{ ++ struct queue_node *node = NULL; ++ ++ node = (struct queue_node *)malloc(sizeof(struct queue_node)); ++ ++ if (node != NULL) { ++ node->time = time; ++ node->value = value; ++ node->next = NULL; ++ } ++ ++ return node; ++} +diff --git a/queue.h b/queue.h +new file mode 100644 +index 0000000..5459f40 +--- /dev/null ++++ b/queue.h +@@ -0,0 +1,39 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#ifndef __RAS_QUEUE_H ++#define __RAS_QUEUE_H ++ ++struct queue_node { ++ time_t time; ++ unsigned int value; ++ struct queue_node *next; ++}; ++ ++struct link_queue { ++ struct queue_node *head; ++ struct queue_node *tail; ++ int size; ++}; ++ ++int is_empty(struct link_queue *queue); ++struct link_queue *init_queue(void); ++void clear_queue(struct link_queue *queue); ++void free_queue(struct link_queue *queue); ++void push(struct link_queue *queue, struct queue_node *node); ++int pop(struct link_queue *queue); ++struct queue_node *front(struct link_queue *queue); ++struct queue_node *node_create(time_t time, unsigned int value); ++ ++#endif +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index 1149dc6..c9ef2fd 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -22,6 +22,10 @@ + #include "ras-report.h" + #include "ras-non-standard-handler.h" + #include "non-standard-ampere.h" ++#include "ras-cpu-isolation.h" ++ ++#define ARM_ERR_VALID_ERROR_COUNT BIT(0) ++#define ARM_ERR_VALID_FLAGS BIT(1) + + void display_raw_data(struct trace_seq *s, + const uint8_t *buf, +@@ -42,6 +46,44 @@ void display_raw_data(struct trace_seq *s, + } + } + ++#ifdef HAVE_CPU_FAULT_ISOLATION ++static int count_errors(struct ras_arm_event *ev) ++{ ++ struct ras_arm_err_info *err_info; ++ int num_pei; ++ int err_info_size = sizeof(struct ras_arm_err_info); ++ int num = 0; ++ int i; ++ int error_count; ++ ++ if (ev->pei_len % err_info_size != 0) { ++ log(TERM, LOG_ERR, ++ "The event data does not match to the ARM Processor Error Information Structure\n"); ++ return num; ++ } ++ num_pei = ev->pei_len / err_info_size; ++ err_info = (struct ras_arm_err_info *)(ev->pei_error); ++ ++ for (i = 0; i < num_pei; ++i) { ++ error_count = 1; ++ if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) { ++ /* ++ * The value of this field is defined as follows: ++ * 0: Single Error ++ * 1: Multiple Errors ++ * 2-65535: Error Count ++ */ ++ error_count = err_info->multiple_error + 1; ++ } ++ ++ num += error_count; ++ err_info += 1; ++ } ++ log(TERM, LOG_INFO, "%d error in cpu core catched\n", num); ++ return num; ++} ++#endif ++ + int ras_arm_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context) +@@ -139,6 +181,48 @@ int ras_arm_event_handler(struct trace_seq *s, + display_raw_data(s, ev.vsei_error, ev.oem_len); + #endif + ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ int cpu; ++ int nums; ++ char *severity; ++ struct error_info err_info; ++ ++ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0) ++ return -1; ++ cpu = val; ++ trace_seq_printf(s, "\n cpu: %d", cpu); ++ ++ /* record cpu error */ ++ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) ++ return -1; ++ /* refer to UEFI_2_9 specification chapter N2.2 Table N-5 */ ++ switch (val) { ++ case GHES_SEV_NO: ++ severity = "Informational"; ++ break; ++ case GHES_SEV_CORRECTED: ++ severity = "Corrected"; ++ break; ++ case GHES_SEV_RECOVERABLE: ++ severity = "Recoverable"; ++ break; ++ default: ++ case GHES_SEV_PANIC: ++ severity = "Fatal"; ++ } ++ trace_seq_printf(s, "\n severity: %s", severity); ++ ++ if (val == GHES_SEV_CORRECTED) { ++ nums = count_errors(&ev); ++ if (nums > 0) { ++ err_info.nums = nums; ++ err_info.time = now; ++ err_info.err_type = val; ++ ras_record_cpu_error(&err_info, cpu); ++ } ++ } ++#endif ++ + /* Insert data into the SGBD */ + #ifdef HAVE_SQLITE3 + ras_store_arm_record(ras, &ev); +diff --git a/ras-arm-handler.h b/ras-arm-handler.h +index 563a2d3..52813e7 100644 +--- a/ras-arm-handler.h ++++ b/ras-arm-handler.h +@@ -17,6 +17,24 @@ + #include "ras-events.h" + #include "libtrace/event-parse.h" + ++/* ++ * ARM Processor Error Information Structure, According to ++ * UEFI_2_9 specification chapter N2.4.4. ++ */ ++#pragma pack(1) ++struct ras_arm_err_info { ++ uint8_t version; ++ uint8_t length; ++ uint16_t validation_bits; ++ uint8_t type; ++ uint16_t multiple_error; ++ uint8_t flags; ++ uint64_t error_info; ++ uint64_t virt_fault_addr; ++ uint64_t physical_fault_addr; ++}; ++#pragma pack() ++ + int ras_arm_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context); +diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c +new file mode 100644 +index 0000000..8c0cdf9 +--- /dev/null ++++ b/ras-cpu-isolation.c +@@ -0,0 +1,378 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "ras-logger.h" ++#include "ras-cpu-isolation.h" ++ ++static struct cpu_info *cpu_infos; ++static unsigned int ncores; ++static unsigned int enabled = 1; ++static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; ++ ++static const struct param normal_units[] = { ++ {"", 1}, ++ {} ++}; ++ ++static const struct param cycle_units[] = { ++ {"d", 24 * 60 * 60}, ++ {"h", 60 * 60}, ++ {"m", 60}, ++ {"s", 1}, ++ {} ++}; ++ ++static struct isolation_param threshold = { ++ .name = "CPU_CE_THRESHOLD", ++ .units = normal_units, ++ .value = 18, ++ .limit = 10000 ++}; ++ ++static struct isolation_param cpu_limit = { ++ .name = "CPU_ISOLATION_LIMIT", ++ .units = normal_units ++}; ++ ++static struct isolation_param cycle = { ++ .name = "CPU_ISOLATION_CYCLE", ++ .units = cycle_units, ++ .value = 24 * 60 * 60, ++ .limit = 30 * 24 * 60 * 60 ++}; ++ ++static const char * const cpu_state[] = { ++ [CPU_OFFLINE] = "offline", ++ [CPU_ONLINE] = "online", ++ [CPU_OFFLINE_FAILED] = "offline-failed", ++ [CPU_UNKNOWN] = "unknown" ++}; ++ ++static int open_sys_file(unsigned int cpu, int __oflag, const char *format) ++{ ++ int fd; ++ char buf[MAX_PATH_LEN] = ""; ++ ++ snprintf(buf, sizeof(buf), format, cpu); ++ fd = open(buf, __oflag); ++ ++ if (fd == -1) { ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf); ++ return -1; ++ } ++ ++ return fd; ++} ++ ++static int get_cpu_status(unsigned int cpu) ++{ ++ int fd, num; ++ char buf[2] = ""; ++ ++ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); ++ if (fd == -1) ++ return CPU_UNKNOWN; ++ ++ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1) ++ num = CPU_UNKNOWN; ++ ++ close(fd); ++ ++ return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num; ++} ++ ++static int init_cpu_info(unsigned int cpus) ++{ ++ ncores = cpus; ++ cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus); ++ ++ if (!cpu_infos) { ++ log(TERM, LOG_ERR, ++ "Failed to allocate memory for cpu infos in %s.\n", __func__); ++ return -1; ++ } ++ ++ for (unsigned int i = 0; i < cpus; ++i) { ++ cpu_infos[i].ce_nums = 0; ++ cpu_infos[i].state = get_cpu_status(i); ++ cpu_infos[i].ce_queue = init_queue(); ++ ++ if (cpu_infos[i].ce_queue == NULL) { ++ log(TERM, LOG_ERR, ++ "Failed to allocate memory for cpu ce queue in %s.\n", __func__); ++ return -1; ++ } ++ } ++ /* set limit of offlined cpu limit according to number of cpu */ ++ cpu_limit.limit = cpus - 1; ++ cpu_limit.value = 0; ++ ++ return 0; ++} ++ ++static void check_config(struct isolation_param *config) ++{ ++ if (config->value > config->limit) { ++ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", ++ config->value, config->limit); ++ config->value = config->limit; ++ } ++} ++ ++static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value) ++{ ++ char *unit = NULL; ++ int env_size, has_unit = 0; ++ ++ if (!env || strlen(env) == 0) ++ return -1; ++ ++ env_size = strlen(env); ++ unit = env + env_size - 1; ++ ++ if (isalpha(*unit)) { ++ has_unit = 1; ++ env_size--; ++ if (env_size <= 0) ++ return -1; ++ } ++ ++ for (int i = 0; i < env_size; ++i) { ++ if (isdigit(env[i])) { ++ if (*value > ULONG_MAX / 10 || ++ (*value == ULONG_MAX / 10 && env[i] - '0' > 5)) { ++ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; ++ } ++ *value = 10 * (*value) + (env[i] - '0'); ++ } else ++ return -1; ++ } ++ ++ if (has_unit) { ++ for (const struct param *units = config->units; units->name; units++) { ++ /* value character and unit character are both valid */ ++ if (!strcasecmp(unit, units->name)) { ++ if (*value > (ULONG_MAX / units->value)) { ++ log(TERM, LOG_ERR, ++ "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; ++ } ++ *value = (*value) * units->value; ++ return 0; ++ } ++ } ++ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void init_config(struct isolation_param *config) ++{ ++ char *env = getenv(config->name); ++ unsigned long value = 0; ++ ++ if (parse_ul_config(config, env, &value) < 0) { ++ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %ld.\n", ++ config->name, env, config->value); ++ return; ++ } ++ ++ config->value = value; ++ check_config(config); ++} ++ ++static int check_config_status(void) ++{ ++ char *env = getenv("CPU_ISOLATION_ENABLE"); ++ ++ if (env == NULL || strcasecmp(env, "yes")) ++ return -1; ++ ++ return 0; ++} ++ ++void ras_cpu_isolation_init(unsigned int cpus) ++{ ++ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { ++ enabled = 0; ++ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n"); ++ return; ++ } ++ ++ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n"); ++ init_config(&threshold); ++ init_config(&cpu_limit); ++ init_config(&cycle); ++} ++ ++void cpu_infos_free(void) ++{ ++ if (cpu_infos) { ++ for (int i = 0; i < ncores; ++i) ++ free_queue(cpu_infos[i].ce_queue); ++ ++ free(cpu_infos); ++ } ++} ++ ++static int do_cpu_offline(unsigned int cpu) ++{ ++ int fd, rc; ++ char buf[2] = ""; ++ ++ cpu_infos[cpu].state = CPU_OFFLINE_FAILED; ++ fd = open_sys_file(cpu, O_RDWR, cpu_path_format); ++ if (fd == -1) ++ return HANDLE_FAILED; ++ ++ strcpy(buf, "0"); ++ rc = write(fd, buf, strlen(buf)); ++ ++ if (rc < 0) { ++ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); ++ close(fd); ++ return HANDLE_FAILED; ++ } ++ ++ close(fd); ++ /* check wthether the cpu is isolated successfully */ ++ cpu_infos[cpu].state = get_cpu_status(cpu); ++ ++ if (cpu_infos[cpu].state == CPU_OFFLINE) ++ return HANDLE_SUCCEED; ++ ++ return HANDLE_FAILED; ++} ++ ++static int do_ce_handler(unsigned int cpu) ++{ ++ struct link_queue *queue = cpu_infos[cpu].ce_queue; ++ unsigned int tmp; ++ /* ++ * Since we just count all error numbers in setted cycle, we store the time ++ * and error numbers from current event to the queue, then everytime we ++ * calculate the period from beginning time to ending time, if the period ++ * exceeds setted cycle, we pop the beginning time and error until the period ++ * from new beginning time to ending time is less than cycle. ++ */ ++ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { ++ tmp = queue->head->value; ++ if (pop(queue) == 0) ++ cpu_infos[cpu].ce_nums -= tmp; ++ } ++ log(TERM, LOG_INFO, ++ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n", ++ cpu, cpu_infos[cpu].ce_nums); ++ ++ if (cpu_infos[cpu].ce_nums >= threshold.value) { ++ log(TERM, LOG_INFO, ++ "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", ++ threshold.value, cpu); ++ return do_cpu_offline(cpu); ++ } ++ return HANDLE_NOTHING; ++} ++ ++static int error_handler(unsigned int cpu, struct error_info *err_info) ++{ ++ int ret = HANDLE_NOTHING; ++ ++ switch (err_info->err_type) { ++ case CE: ++ ret = do_ce_handler(cpu); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static void record_error_info(unsigned int cpu, struct error_info *err_info) ++{ ++ switch (err_info->err_type) { ++ case CE: ++ { ++ struct queue_node *node = node_create(err_info->time, err_info->nums); ++ ++ if (node == NULL) { ++ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); ++ return; ++ } ++ push(cpu_infos[cpu].ce_queue, node); ++ cpu_infos[cpu].ce_nums += err_info->nums; ++ break; ++ } ++ default: ++ break; ++ } ++} ++ ++void ras_record_cpu_error(struct error_info *err_info, int cpu) ++{ ++ int ret; ++ ++ if (enabled == 0) ++ return; ++ ++ if (cpu >= ncores || cpu < 0) { ++ log(TERM, LOG_ERR, ++ "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); ++ return; ++ } ++ ++ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); ++ cpu_infos[cpu].state = get_cpu_status(cpu); ++ ++ if (cpu_infos[cpu].state != CPU_ONLINE) { ++ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu); ++ return; ++ } ++ ++ record_error_info(cpu, err_info); ++ /* ++ * Since user may change cpu state, we get current offlined ++ * cpu numbers every recording time. ++ */ ++ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { ++ log(TERM, LOG_WARNING, ++ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", ++ cpu_limit.value); ++ return; ++ } ++ ++ ret = error_handler(cpu, err_info); ++ ++ if (ret == HANDLE_NOTHING) ++ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); ++ else if (ret == HANDLE_SUCCEED) { ++ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); ++ clear_queue(cpu_infos[cpu].ce_queue); ++ cpu_infos[cpu].ce_nums = 0; ++ } else ++ log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); ++} +diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h +new file mode 100644 +index 0000000..1159853 +--- /dev/null ++++ b/ras-cpu-isolation.h +@@ -0,0 +1,68 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#ifndef __RAS_CPU_ISOLATION_H ++#define __RAS_CPU_ISOLATION_H ++ ++#include "queue.h" ++ ++#define MAX_PATH_LEN 100 ++#define MAX_BUF_LEN 1024 ++ ++struct param { ++ char *name; ++ unsigned long value; ++}; ++ ++struct isolation_param { ++ char *name; ++ const struct param *units; ++ unsigned long value; ++ unsigned long limit; ++}; ++ ++enum cpu_state { ++ CPU_OFFLINE, ++ CPU_ONLINE, ++ CPU_OFFLINE_FAILED, ++ CPU_UNKNOWN, ++}; ++ ++enum error_handle_result { ++ HANDLE_FAILED = -1, ++ HANDLE_SUCCEED, ++ HANDLE_NOTHING, ++}; ++ ++enum error_type { ++ CE = 1 ++}; ++ ++struct cpu_info { ++ unsigned long ce_nums; ++ struct link_queue *ce_queue; ++ enum cpu_state state; ++}; ++ ++struct error_info { ++ unsigned long nums; ++ time_t time; ++ enum error_type err_type; ++}; ++ ++void ras_cpu_isolation_init(unsigned int cpus); ++void ras_record_cpu_error(struct error_info *err_info, int cpu); ++void cpu_infos_free(void); ++ ++#endif +diff --git a/ras-events.c b/ras-events.c +index ba769d1..491c17a 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -41,6 +41,7 @@ + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-page-isolation.h" ++#include "ras-cpu-isolation.h" + + /* + * Polling time, if read() doesn't block. Currently, trace_pipe_raw never +@@ -879,6 +880,10 @@ int handle_ras_events(int record_events) + + cpus = get_num_cpus(ras); + ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ ras_cpu_isolation_init(cpus); ++#endif ++ + #ifdef HAVE_MCE + rc = register_mce_handler(ras, cpus); + if (rc) +@@ -1005,6 +1010,8 @@ err: + } + free(ras); + } +- ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ cpu_infos_free(); ++#endif + return rc; + } +-- +2.27.0 + diff --git a/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch b/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch new file mode 100644 index 0000000..aa1b251 --- /dev/null +++ b/0002-Support-cpu-fault-isolation-for-recoverable-errors.patch @@ -0,0 +1,138 @@ +From e0101e59c6887a98d3a5a1b622c75f5307e8ec19 Mon Sep 17 00:00:00 2001 +From: Shengwei Luo +Date: Wed, 23 Feb 2022 17:23:27 +0800 +Subject: [PATCH 2/2] Support cpu fault isolation for recoverable errors + +When the recoverable errors in cpu core occurred, try to offline +the related cpu core. + +Signed-off-by: Shengwei Luo +--- + ras-arm-handler.c | 21 ++++++++++++++++++--- + ras-cpu-isolation.c | 17 +++++++++++++++++ + ras-cpu-isolation.h | 4 +++- + 3 files changed, 38 insertions(+), 4 deletions(-) + +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index c9ef2fd..dae5ad6 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -47,7 +47,20 @@ void display_raw_data(struct trace_seq *s, + } + + #ifdef HAVE_CPU_FAULT_ISOLATION +-static int count_errors(struct ras_arm_event *ev) ++static int is_core_failure(struct ras_arm_err_info *err_info) ++{ ++ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) { ++ /* ++ * core failure: ++ * Bit 0\1\3: (at lease 1) ++ * Bit 2: 0 ++ */ ++ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << 2)); ++ } ++ return 0; ++} ++ ++static int count_errors(struct ras_arm_event *ev, int sev) + { + struct ras_arm_err_info *err_info; + int num_pei; +@@ -75,6 +88,8 @@ static int count_errors(struct ras_arm_event *ev) + */ + error_count = err_info->multiple_error + 1; + } ++ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info)) ++ error_count = 0; + + num += error_count; + err_info += 1; +@@ -212,8 +227,8 @@ int ras_arm_event_handler(struct trace_seq *s, + } + trace_seq_printf(s, "\n severity: %s", severity); + +- if (val == GHES_SEV_CORRECTED) { +- nums = count_errors(&ev); ++ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { ++ nums = count_errors(&ev, val); + if (nums > 0) { + err_info.nums = nums; + err_info.time = now; +diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c +index 8c0cdf9..e650022 100644 +--- a/ras-cpu-isolation.c ++++ b/ras-cpu-isolation.c +@@ -113,6 +113,7 @@ static int init_cpu_info(unsigned int cpus) + + for (unsigned int i = 0; i < cpus; ++i) { + cpu_infos[i].ce_nums = 0; ++ cpu_infos[i].uce_nums = 0; + cpu_infos[i].state = get_cpu_status(i); + cpu_infos[i].ce_queue = init_queue(); + +@@ -295,6 +296,15 @@ static int do_ce_handler(unsigned int cpu) + return HANDLE_NOTHING; + } + ++static int do_uce_handler(unsigned int cpu) ++{ ++ if (cpu_infos[cpu].uce_nums > 0) { ++ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%d\n", cpu); ++ return do_cpu_offline(cpu); ++ } ++ return HANDLE_NOTHING; ++} ++ + static int error_handler(unsigned int cpu, struct error_info *err_info) + { + int ret = HANDLE_NOTHING; +@@ -303,6 +313,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) + case CE: + ret = do_ce_handler(cpu); + break; ++ case UCE: ++ ret = do_uce_handler(cpu); ++ break; + default: + break; + } +@@ -325,6 +338,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) + cpu_infos[cpu].ce_nums += err_info->nums; + break; + } ++ case UCE: ++ cpu_infos[cpu].uce_nums++; ++ break; + default: + break; + } +@@ -372,6 +388,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) + cpu, cpu_state[cpu_infos[cpu].state]); + clear_queue(cpu_infos[cpu].ce_queue); + cpu_infos[cpu].ce_nums = 0; ++ cpu_infos[cpu].uce_nums = 0; + } else + log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", + cpu, cpu_state[cpu_infos[cpu].state]); +diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h +index 1159853..024a68b 100644 +--- a/ras-cpu-isolation.h ++++ b/ras-cpu-isolation.h +@@ -46,10 +46,12 @@ enum error_handle_result { + }; + + enum error_type { +- CE = 1 ++ CE = 1, ++ UCE + }; + + struct cpu_info { ++ unsigned long uce_nums; + unsigned long ce_nums; + struct link_queue *ce_queue; + enum cpu_state state; +-- +2.27.0 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 89c567d..cba988a 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.7 -Release: 1 +Release: 2 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -23,6 +23,8 @@ Patch1: bugfix-rasdaemon-wait-for-file-access.patch Patch2: bugfix-fix-fd-check.patch Patch3: bugfix-fix-disk-error-log-storm.patch Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch +Patch5: 0001-Support-cpu-fault-isolation-for-corrected-errors.patch +Patch6: 0002-Support-cpu-fault-isolation-for-recoverable-errors.patch %description The rasdaemon program is a daemon which monitors the platform @@ -68,6 +70,12 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || : %changelog +* Wed Feb 23 2022 luoshengwei - 0.6.7-2 +- Type:feature +- ID:NA +- SUG:NA +- DESC: Add cpu online fault isolation for arm event. + * Wed Dec 8 2021 xujing - 0.6.7-1 - Update software to v0.6.7 -- Gitee