From dab263b7539af553bd33ee2d1c4482e730731547 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Fri, 21 May 2021 16:48:32 +0800 Subject: [PATCH 01/15] add patch of cpu online fault isolation --- 0006-add-cpu-online-fault-isolation.patch | 983 ++++++++++++++++++ ...add-trace-print-and-add-sqlite-store.patch | 77 ++ rasdaemon.spec | 9 +- 3 files changed, 1067 insertions(+), 2 deletions(-) create mode 100644 0006-add-cpu-online-fault-isolation.patch create mode 100644 0007-add-trace-print-and-add-sqlite-store.patch diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch new file mode 100644 index 0000000..44bae48 --- /dev/null +++ b/0006-add-cpu-online-fault-isolation.patch @@ -0,0 +1,983 @@ +From fcdc2d2f99def2745c261a085f7aacc8ce106246 Mon Sep 17 00:00:00 2001 +From: Lostwayzxc +Date: Fri, 21 May 2021 16:28:15 +0800 +Subject: [PATCH 1/2] add cpu online fault isolation + +Add cpu online fault isolation, when CE/UCE occurs, we choose to offline +the error cpu according to threshold algorithm. + +Signed-off-by: Luo Shengwei +--- + .travis.yml | 2 +- + Makefile.am | 6 +- + configure.ac | 11 ++ + misc/rasdaemon.env | 15 ++ + queue.c | 121 ++++++++++++ + queue.h | 42 ++++ + ras-arm-handler.c | 73 +++++++ + ras-cpu-isolation.c | 469 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.h | 70 +++++++ + ras-events.c | 8 + + ras-record.h | 5 + + 11 files changed, 820 insertions(+), 2 deletions(-) + create mode 100644 queue.c + create mode 100644 queue.h + create mode 100644 ras-cpu-isolation.c + create mode 100644 ras-cpu-isolation.h + +diff --git a/.travis.yml b/.travis.yml +index 79cf4ca..5ab3957 100644 +--- a/.travis.yml ++++ b/.travis.yml +@@ -20,7 +20,7 @@ before_install: + - sudo apt-get install -y sqlite3 + install: + - autoreconf -vfi +-- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa ++- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa --enable-cpu-fault-isolation + + script: + - make && sudo make install +diff --git a/Makefile.am b/Makefile.am +index f4822b9..6431dd3 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -57,12 +57,16 @@ endif + if WITH_MEMORY_CE_PFA + rasdaemon_SOURCES += rbtree.c ras-page-isolation.c + endif ++if WITH_CPU_FAULT_ISOLATION ++ rasdaemon_SOURCES += ras-cpu-isolation.c queue.c ++endif + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a + + include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ + ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ +- ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h ++ ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ ++ ras-cpu-isolation.h queue.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 2d6c59c..a682bb9 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -141,6 +141,16 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" == "xyes"], + AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"]) + ++AC_ARG_ENABLE([cpu_fault_isolation], ++ AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation])) ++ ++AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_CPU_FAULT_ISOLATION,1,"have cpu online fault isolation") ++ AC_SUBST([WITH_CPU_FAULT_ISOLATION]) ++]) ++AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -173,4 +183,5 @@ compile time options summary + DEVLINK : $USE_DEVLINK + Disk I/O errors : $USE_DISKERROR + Memory CE PFA : $USE_MEMORY_CE_PFA ++ CPU fault isolation : $USE_CPU_FAULT_ISOLATION + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 12fd766..7498992 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -27,3 +27,18 @@ PAGE_CE_THRESHOLD="50" + # soft-then-hard First try to soft offline, then try hard offlining. + # Note: default offline choice is "soft". + PAGE_CE_ACTION="soft" ++ ++# CPU Online Fault Isolation ++# Specify the threshold of corrected errors. ++# ++# Format: ++# [0-9]+[unit] ++# ++# Supported units: ++# CPU_CE_THRESHOLD: no unit ++# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second), default is in second ++CPU_CE_THRESHOLD="18" ++CPU_ISOLATION_CYCLE="24h" ++ ++# Prevent excessive isolation from causing an avalanche effect ++CPU_ISOLATION_LIMIT="10" +diff --git a/queue.c b/queue.c +new file mode 100644 +index 0000000..1ef8688 +--- /dev/null ++++ b/queue.c +@@ -0,0 +1,121 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++*/ ++#include ++#include ++#include "queue.h" ++#include "ras-logger.h" ++ ++ ++int is_empty(struct link_queue *queue) ++{ ++ if (queue) { ++ return queue->size == 0; ++ } ++ ++ return 1; ++} ++ ++struct link_queue* init_queue(void) ++{ ++ struct link_queue* queue; ++ queue = (struct link_queue*) malloc(sizeof(struct link_queue)); ++ ++ if (queue == NULL) { ++ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); ++ return NULL; ++ } ++ ++ queue->size = 0; ++ queue->head = NULL; ++ queue->tail = NULL; ++ ++ return queue; ++} ++ ++void clear_queue(struct link_queue *queue) ++{ ++ if (queue == NULL) { ++ return; ++ } ++ ++ struct queue_node *node = queue->head; ++ struct queue_node *tmp = NULL; ++ ++ while (node != NULL) { ++ tmp = node; ++ node = node->next; ++ free(tmp); ++ } ++ ++ free(queue); ++} ++ ++int push(struct link_queue *queue, struct queue_node *node) ++{ ++ if (!queue || !node) { ++ return -1; ++ } ++ /* there is no element in the queue */ ++ if (queue->head == NULL) { ++ queue->head = node; ++ } ++ else { ++ node->next = queue->tail->next; ++ queue->tail->next = node; ++ } ++ ++ queue->tail = node; ++ (queue->size)++; ++ ++ return 0; ++} ++ ++int pop(struct link_queue *queue) ++{ ++ struct queue_node *tmp = NULL; ++ ++ if (is_empty(queue)) { ++ return -1; ++ } ++ ++ tmp = queue->head; ++ queue->head = queue->head->next; ++ free(tmp); ++ (queue->size)--; ++ ++ return 0; ++} ++ ++struct queue_node* front(struct link_queue *queue) ++{ ++ if (is_empty(queue)) { ++ return NULL; ++ } ++ ++ return queue->head; ++} ++ ++struct queue_node* node_create(time_t time, unsigned value) ++{ ++ struct queue_node *node = NULL; ++ node = (struct queue_node*) malloc(sizeof(struct queue_node)); ++ ++ if (node != NULL) { ++ node->time = time; ++ node->value = value; ++ node->next = NULL; ++ } ++ ++ return node; ++} +diff --git a/queue.h b/queue.h +new file mode 100644 +index 0000000..b60aa81 +--- /dev/null ++++ b/queue.h +@@ -0,0 +1,42 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++*/ ++ ++#ifndef __RAS_QUEUE_H ++#define __RAS_QUEUE_H ++ ++ ++struct queue_node ++{ ++ time_t time; ++ unsigned value; ++ struct queue_node *next; ++}; ++ ++struct link_queue ++{ ++ struct queue_node *head; ++ struct queue_node *tail; ++ int size; ++}; ++ ++int is_empty(struct link_queue *queue); ++struct link_queue* init_queue(void); ++void clear_queue(struct link_queue *queue); ++int push(struct link_queue *queue, struct queue_node *node); ++int pop(struct link_queue *queue); ++struct queue_node* front(struct link_queue *queue); ++struct queue_node* node_create(time_t time, unsigned value); ++ ++ ++#endif +\ No newline at end of file +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index 2f170e2..9f9302e 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -20,6 +20,44 @@ + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-report.h" ++#include "ras-cpu-isolation.h" ++ ++ ++static int is_core_failure(unsigned long value) ++{ ++ /* ++ * core failure: ++ * Bit 0\1\3: (at lease 1) ++ * Bit 2: 0 ++ */ ++ return (value & 0xf) && !(value & (0x1 << 2)); ++} ++ ++static int count_errors(struct event_format *event, const uint8_t* data, int len) ++{ ++ /* ++ * According to UEFI_2_9_2021_03_18 specification chapter N2.4.4, ++ * the length of struct processor error information is 32, the byte ++ * length of the Flags field is 1, and the byte offset is 7 in the struct. ++ */ ++ int pei_err_size = 32; ++ int field_size = 1; ++ int cur_offset = 7; ++ unsigned long value; ++ int num = 0; ++ if (len % pei_err_size != 0) { ++ log(TERM, LOG_ERR, "the event data does not match to the ARM Processor Error Information Structure\n"); ++ return num; ++ } ++ while (cur_offset < len) { ++ value = pevent_read_number(event->pevent, data+cur_offset, field_size); ++ if (is_core_failure(value)) { ++ num++; ++ } ++ cur_offset += pei_err_size; ++ } ++ return num; ++} + + int ras_arm_event_handler(struct trace_seq *s, + struct pevent_record *record, +@@ -78,6 +116,41 @@ int ras_arm_event_handler(struct trace_seq *s, + ev.psci_state = val; + trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); + ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ /* record cpu error */ ++ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) ++ return -1; ++ /* error severity defined from include/acpi/ghes.h */ ++ switch (val) { ++ case GHES_SEV_NO: ++ ev.severity = "Informational"; ++ break; ++ case GHES_SEV_CORRECTED: ++ ev.severity = "Corrected"; ++ break; ++ case GHES_SEV_RECOVERABLE: ++ ev.severity = "Recoverable"; ++ break; ++ default: ++ case GHES_SEV_PANIC: ++ ev.severity = "Fatal"; ++ } ++ ++ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { ++ int len, nums; ++ ev.error_info = pevent_get_field_raw(s, event, "buf", record, &len, 1); ++ if (!ev.error_info) ++ return -1; ++ ev.length = len; ++ /* relate to enum error_type */ ++ nums = count_errors(event, ev.error_info, len); ++ if (nums > 0) { ++ struct error_info err_info = {nums, now, val}; ++ ras_record_cpu_error(&err_info, ev.mpidr); ++ } ++ } ++#endif ++ + /* Insert data into the SGBD */ + #ifdef HAVE_SQLITE3 + ras_store_arm_record(ras, &ev); +diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c +new file mode 100644 +index 0000000..857aa45 +--- /dev/null ++++ b/ras-cpu-isolation.c +@@ -0,0 +1,469 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "ras-logger.h" ++#include "ras-cpu-isolation.h" ++ ++static struct cpu_info* cpu_infos = NULL; ++static unsigned int cpu_nums, cores_per_socket, cores_per_die; ++static const char* cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; ++static const char* core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; ++static const char* node_path = "/sys/devices/system/node/node%d/cpulist"; ++ ++static const struct param normal_units[] = { ++ { "", 1 }, ++ {} ++}; ++ ++static const struct param cycle_units[] = { ++ { "d", 24 * 60 * 60 }, ++ { "h", 60 * 60 }, ++ { "m", 60 }, ++ { "s", 1 }, ++ {} ++}; ++ ++static struct isolation_param threshold = { ++ .name = "CPU_CE_THRESHOLD", ++ .units = normal_units, ++ .value = 18, ++ .limit = 100 ++}; ++ ++static struct isolation_param cpu_limit = { ++ .name = "CPU_ISOLATION_LIMIT", ++ .units = normal_units, ++ .value = 10, ++ .limit = 30 ++}; ++ ++static struct isolation_param cycle = { ++ .name = "CPU_ISOLATION_CYCLE", ++ .units = cycle_units, ++ .value = 24 * 60 * 60, ++ .limit = 72 * 60 * 60 ++}; ++ ++static const char *cpu_state[] = { ++ [CPU_OFFLINE] = "offline", ++ [CPU_ONLINE] = "online", ++ [CPU_OFFLINE_FAILED] = "offline-failed", ++ [CPU_UNKNOWN] = "unknown" ++}; ++ ++static int open_sys_file(unsigned cpu, int __oflag, const char *format) ++{ ++ int fd; ++ char buf[MAX_PATH_LEN]; ++ snprintf(buf, sizeof(buf), format, cpu); ++ fd = open(buf, __oflag); ++ ++ if (fd == -1) { ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf); ++ return -1; ++ } ++ ++ return fd; ++} ++ ++static void set_socket(void) ++{ ++ int ncores, fd; ++ int begin, end; ++ char buf[20]; ++ ncores = sysconf(_SC_NPROCESSORS_CONF); ++ cores_per_socket = ncores; ++ ++ for (int i = 0; i < ncores; ++i) { ++ fd = open_sys_file(i, O_RDONLY, core_siblings_list_path); ++ if (fd == -1) { ++ continue; ++ } ++ if (read(fd, buf, sizeof(buf)) <= 0) { ++ close(fd); ++ continue; ++ } ++ if (sscanf(buf, "%d-%d", &begin, &end) == 2) { ++ cores_per_socket = end > begin ? end - begin + 1 : ncores; ++ close(fd); ++ return; ++ } ++ } ++} ++ ++static void set_die(void) ++{ ++ int fd, begin, end; ++ char buf[20]; ++ cores_per_die = 0; ++ fd = open_sys_file(0, O_RDONLY, node_path); ++ ++ if (fd == -1) { ++ return; ++ } ++ ++ if (read(fd, buf, sizeof(buf))) { ++ if (sscanf(buf, "%d-%d", &begin, &end) == 2) { ++ cores_per_die = end > begin ? end - begin + 1 : 0; ++ } ++ } ++ ++ close(fd); ++} ++ ++static void init_cpu_info(unsigned int cpus) ++{ ++ cpu_nums = cpus; ++ cpu_infos = calloc(sizeof(*cpu_infos), cpus); ++ ++ if (!cpu_infos) { ++ log(TERM, LOG_ERR, "Failed to allocate memory for cpu infos in %s.\n", __func__); ++ return; ++ } ++ ++ for (unsigned int i = 0; i < cpus; ++i) { ++ cpu_infos[i].state = CPU_ONLINE; ++ cpu_infos[i].ce_queue = init_queue(); ++ } ++ /* set limit of offlined cpu limit according to number of cpu */ ++ cpu_limit.limit = cpus / 3; ++ set_socket(); ++ set_die(); ++} ++ ++static void check_config(struct isolation_param *config) ++{ ++ if (config->value >= config->limit) { ++ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", ++ config->value, config->limit); ++ config->value = config->limit; ++ } ++ ++ return; ++} ++ ++static int parse_ul_config(struct isolation_param *config, char* env, unsigned long* value) ++{ ++ int env_size, has_unit = 0; ++ ++ if (!env || strlen(env) == 0) { ++ return -1; ++ } ++ ++ env_size = strlen(env); ++ char* unit = NULL; ++ unit = env + env_size - 1; ++ ++ if (isalpha(*unit)) { ++ has_unit = 1; ++ env_size--; ++ } ++ ++ for (int i = 0; i < env_size; ++i) { ++ if (isdigit(env[i])) { ++ if (*value > ULONG_MAX / 10 || (*value == ULONG_MAX / 10 && env[i] - '0' > 5)) { ++ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; ++ } ++ *value = 10 * (*value) + (env[i] - '0'); ++ } ++ else { ++ return -1; ++ } ++ } ++ ++ if (has_unit) { ++ for (const struct param *units = config->units; units->name; units++) { ++ /* value character and unit character are both valid */ ++ if (!strcasecmp(unit, units->name)) { ++ if (*value > ULONG_MAX / units->value) { ++ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); ++ return -1; ++ } ++ *value = (*value) * units->value; ++ return 0; ++ } ++ } ++ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void init_config(struct isolation_param *config) ++{ ++ char* env = getenv(config->name); ++ unsigned long value = 0; ++ ++ if (parse_ul_config(config, env, &value) < 0) { ++ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %ld.\n", ++ config->name, env, config->value); ++ return; ++ } ++ ++ config->value = value; ++ check_config(config); ++} ++ ++void ras_error_count_init(unsigned int cpus) ++{ ++ init_cpu_info(cpus); ++ init_config(&threshold); ++ init_config(&cpu_limit); ++ init_config(&cycle); ++} ++ ++void cpu_infos_free(void) ++{ ++ if (cpu_infos) { ++ for (int i = 0; i < cpu_nums; ++i) { ++ clear_queue(cpu_infos[i].ce_queue); ++ } ++ free(cpu_infos); ++ } ++ ++ return; ++} ++ ++static int get_cpu_status(unsigned cpu) ++{ ++ int fd; ++ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); ++ ++ if (fd == -1) { ++ return CPU_UNKNOWN; ++ } ++ ++ int num = 0; ++ ++ if (read(fd, &num, 1) <= 0) { ++ num = CPU_UNKNOWN; ++ } ++ else { ++ num = num - '0'; ++ } ++ ++ close(fd); ++ ++ return num; ++} ++ ++static int do_cpu_offline(unsigned cpu) ++{ ++ int fd, rc; ++ char buf[2]; ++ cpu_infos[cpu].state = CPU_OFFLINE_FAILED; ++ fd = open_sys_file(cpu, O_RDWR, cpu_path_format); ++ ++ if (fd == -1) { ++ return HANDLE_FAILED; ++ } ++ ++ strcpy(buf, "0"); ++ rc = write(fd, buf, strlen(buf)); ++ close(fd); ++ ++ if (rc < 0) { ++ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); ++ return HANDLE_FAILED; ++ } ++ /* check wthether the cpu is isolated successfully */ ++ else { ++ int num = 0; ++ num = get_cpu_status(cpu); ++ if (num == CPU_OFFLINE) { ++ return HANDLE_SUCCEED; ++ } ++ else { ++ return HANDLE_FAILED; ++ } ++ ++ } ++} ++ ++static void do_ce_handler(unsigned cpu, int *ret) ++{ ++ struct link_queue *queue = cpu_infos[cpu].ce_queue; ++ ++ if (queue == NULL) { ++ return; ++ } ++ ++ unsigned tmp; ++ /* ++ * Since we just count all error numbers in setted cycle, we store the time ++ * and error numbers from current event to the queue, then everytime we ++ * calculate the period from beginning time to ending time, if the period ++ * exceeds setted cycle, we pop the beginning time and error until the period ++ * from new beginning time to ending time is less than cycle. ++ */ ++ while (queue && queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { ++ tmp = queue->head->value; ++ if (pop(queue) == 0) { ++ cpu_infos[cpu].ce_nums -= tmp; ++ } ++ } ++ ++ if (cpu_infos[cpu].ce_nums >= threshold.value) { ++ log(TERM, LOG_INFO, "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", ++ threshold.value, cpu); ++ *ret = do_cpu_offline(cpu); ++ } ++} ++ ++static void do_uce_handler(unsigned cpu, int *ret) ++{ ++ if (cpu_infos[cpu].uce_nums > 0) { ++ log(TERM, LOG_INFO, "Uncorrected Errors occured, try to offline cpu%d\n", cpu); ++ *ret = do_cpu_offline(cpu); ++ } ++} ++ ++static void error_handler(unsigned cpu, struct error_info *err_info, int *ret) ++{ ++ switch (err_info->err_type) ++ { ++ case CE: ++ do_ce_handler(cpu, ret); ++ break; ++ case UCE: ++ do_uce_handler(cpu, ret); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void record_error_info(unsigned cpu, struct error_info *err_info) ++{ ++ switch (err_info->err_type) ++ { ++ case CE: ++ { ++ struct queue_node *node = NULL; ++ node = node_create(err_info->time, err_info->nums); ++ /* if the queue is still NULL, try malloc again */ ++ if (cpu_infos[cpu].ce_queue == NULL) { ++ cpu_infos[cpu].ce_queue = init_queue(); ++ } ++ if (push(cpu_infos[cpu].ce_queue, node) < 0) { ++ /* when the queue is NULL and node is not NULL, free it */ ++ if (node != NULL) { ++ free(node); ++ } ++ log(TERM, LOG_ERR, "Fail to push node to queue\n"); ++ return; ++ } ++ cpu_infos[cpu].ce_nums += err_info->nums; ++ break; ++ } ++ case UCE: ++ cpu_infos[cpu].uce_nums++; ++ break; ++ default: ++ break; ++ } ++} ++ ++static unsigned long get_bit_value(int64_t value, int offset, int size) ++{ ++ if (size <= 0 || offset < 0) { ++ return 0; ++ } ++ ++ value >>= offset; ++ unsigned long res = 0; ++ int i = 0; ++ ++ while (i < size) { ++ res |= (value & (0x1 << (i++))); ++ } ++ ++ return res; ++} ++ ++static unsigned get_cpu_index(int64_t mpidr) ++{ ++ unsigned core_id, socket_id, die_id, cpu; ++ core_id = get_bit_value(mpidr, 8, 8); ++ socket_id = get_bit_value(mpidr, 21, 2); ++ die_id = get_bit_value(mpidr, 19, 2); ++ cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die; ++ ++ return cpu; ++} ++ ++void ras_record_cpu_error(struct error_info* err_info, int64_t mpidr) ++{ ++ int cur_cpu_status; ++ unsigned cpu; ++ int ret = HANDLE_NOTHING; ++ ++ if (!cpu_infos) { ++ return; ++ } ++ ++ cpu = get_cpu_index(mpidr); ++ ++ if (cpu >= cpu_nums) { ++ return; ++ } ++ ++ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); ++ ++ if (cpu_infos[cpu].state == CPU_OFFLINE) { ++ /* user may online the offlined cpu */ ++ cur_cpu_status = get_cpu_status(cpu); ++ if (cur_cpu_status != CPU_ONLINE) { ++ log(TERM, LOG_INFO, "cpu%d is already offlined, ignore\n", cpu); ++ return; ++ } ++ cpu_infos[cpu].state = CPU_ONLINE; ++ } ++ ++ record_error_info(cpu, err_info); ++ /* Since user may change cpu state, we get current offlined cpu numbers every recording time. */ ++ if (cpu_nums - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { ++ log(TERM, LOG_WARNING, "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", ++ cpu_limit.value); ++ return; ++ } ++ ++ error_handler(cpu, err_info, &ret); ++ /* do nothing */ ++ if (ret == HANDLE_NOTHING) { ++ return; ++ } ++ ++ if (ret == HANDLE_SUCCEED) { ++ cpu_infos[cpu].state = CPU_OFFLINE; ++ struct link_queue *tmp = cpu_infos[cpu].ce_queue; ++ clear_queue(tmp); ++ cpu_infos[cpu].ce_queue = init_queue(); ++ } ++ ++ log(TERM, LOG_INFO, "Result of offlining cpu %d: %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); ++ ++ return; ++} +diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h +new file mode 100644 +index 0000000..f340585 +--- /dev/null ++++ b/ras-cpu-isolation.h +@@ -0,0 +1,70 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++*/ ++ ++#ifndef __RAS_CPU_ISOLATION_H ++#define __RAS_CPU_ISOLATION_H ++ ++#include "queue.h" ++ ++#define MAX_PATH_LEN 100 ++ ++struct param { ++ char *name; ++ unsigned long value; ++}; ++ ++struct isolation_param { ++ char *name; ++ const struct param *units; ++ unsigned long value; ++ unsigned long limit; ++}; ++ ++enum cpu_state { ++ CPU_OFFLINE, ++ CPU_ONLINE, ++ CPU_OFFLINE_FAILED, ++ CPU_UNKNOWN, ++}; ++ ++enum error_handle_result { ++ HANDLE_FAILED = -1, ++ HANDLE_SUCCEED, ++ HANDLE_NOTHING, ++}; ++ ++enum error_type { ++ CE = 1, ++ UCE ++}; ++ ++struct cpu_info { ++ unsigned long uce_nums; ++ unsigned long ce_nums; ++ struct link_queue *ce_queue; ++ enum cpu_state state; ++}; ++ ++struct error_info { ++ unsigned long nums; ++ time_t time; ++ enum error_type err_type; ++}; ++ ++ ++void ras_error_count_init(unsigned cpus); ++void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); ++void cpu_infos_free(void); ++ ++#endif +\ No newline at end of file +diff --git a/ras-events.c b/ras-events.c +index 471d25d..31c4170 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -40,6 +40,7 @@ + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-page-isolation.h" ++#include "ras-cpu-isolation.h" + + /* + * Polling time, if read() doesn't block. Currently, trace_pipe_raw never +@@ -874,6 +875,10 @@ int handle_ras_events(int record_events) + + cpus = get_num_cpus(ras); + ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ ras_error_count_init(cpus); ++#endif ++ + #ifdef HAVE_MCE + rc = register_mce_handler(ras, cpus); + if (rc) +@@ -990,6 +995,9 @@ err: + } + free(ras); + } ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ cpu_infos_free(); ++#endif + + return rc; + } +diff --git a/ras-record.h b/ras-record.h +index cc217a9..b453f83 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -77,6 +77,11 @@ struct ras_arm_event { + int64_t midr; + int32_t running_state; + int32_t psci_state; ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ const char *severity; ++ const uint8_t *error_info; ++ uint32_t length; ++#endif + }; + + struct devlink_event { +-- +2.27.0 + diff --git a/0007-add-trace-print-and-add-sqlite-store.patch b/0007-add-trace-print-and-add-sqlite-store.patch new file mode 100644 index 0000000..3b306fe --- /dev/null +++ b/0007-add-trace-print-and-add-sqlite-store.patch @@ -0,0 +1,77 @@ +From 6192d18c563a0610142488d734a5993fe8dff077 Mon Sep 17 00:00:00 2001 +From: Lostwayzxc +Date: Fri, 21 May 2021 16:29:32 +0800 +Subject: [PATCH 2/2] add trace print of new information and add it to sqilte + +Since we add new information of the event, we add trace print and store it to +Sqlite. + +Signed-off-by: Luo Shengwei +--- + ras-arm-handler.c | 9 +++++++++ + ras-record.c | 8 ++++++++ + 2 files changed, 17 insertions(+) + +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index 9f9302e..58b33f1 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -22,6 +22,12 @@ + #include "ras-report.h" + #include "ras-cpu-isolation.h" + ++static void trace_print_hex(struct trace_seq *s, const uint8_t *buf, int buf_len) ++{ ++ for (int i = 0; i < buf_len; ++i) { ++ trace_seq_printf(s, "%2.2x", buf[i]); ++ } ++} + + static int is_core_failure(unsigned long value) + { +@@ -135,6 +141,7 @@ int ras_arm_event_handler(struct trace_seq *s, + case GHES_SEV_PANIC: + ev.severity = "Fatal"; + } ++ trace_seq_printf(s, "\n severity: %s", ev.severity); + + if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { + int len, nums; +@@ -142,6 +149,8 @@ int ras_arm_event_handler(struct trace_seq *s, + if (!ev.error_info) + return -1; + ev.length = len; ++ trace_seq_printf(s, "\n processor_err_info: "); ++ trace_print_hex(s, ev.error_info, len); + /* relate to enum error_type */ + nums = count_errors(event, ev.error_info, len); + if (nums > 0) { +diff --git a/ras-record.c b/ras-record.c +index 549c494..33d4741 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -210,6 +210,10 @@ static const struct db_fields arm_event_fields[] = { + { .name="mpidr", .type="INTEGER" }, + { .name="running_state", .type="INTEGER" }, + { .name="psci_state", .type="INTEGER" }, ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ { .name="severity", .type="TEXT" }, ++ { .name="error_info", .type="BLOB" }, ++#endif + }; + + static const struct db_table_descriptor arm_event_tab = { +@@ -233,6 +237,10 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) + sqlite3_bind_int64 (priv->stmt_arm_record, 4, ev->mpidr); + sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state); + sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state); ++#ifdef HAVE_CPU_FAULT_ISOLATION ++ sqlite3_bind_text (priv->stmt_arm_record, 7, ev->severity, -1, NULL); ++ sqlite3_bind_blob (priv->stmt_arm_record, 8, ev->error_info, ev->length, NULL); ++#endif + + rc = sqlite3_step(priv->stmt_arm_record); + if (rc != SQLITE_OK && rc != SQLITE_DONE) +-- +2.27.0 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 3101223..39bc449 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.6 -Release: 3 +Release: 4 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -24,6 +24,8 @@ Patch2: bugfix-rasdaemon-wait-for-file-access.patch Patch3: bugfix-fix-fd-check.patch Patch4: bugfix-fix-disk-error-log-storm.patch Patch5: backport-rasdaemon-Fix-error-print.patch +Patch6: 0006-add-cpu-online-fault-isolation.patch +Patch7: 0007-add-trace-print-and-add-sqlite-store.patch %description The rasdaemon program is a daemon which monitors the platform @@ -42,7 +44,7 @@ autoheader libtoolize --automake --copy --debug --force automake --add-missing %ifarch %{arm} aarch64 -%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-cpu-fault-isolation %else %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror %endif @@ -70,6 +72,9 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || : %changelog +* Fri May 21 2021 luoshengwei - 0.6.6-4 +- add cpu online fault isolation + * Sat May 15 2021 xujing<17826839720@163.com> - 0.6.6-3 - Type:bugfix - ID:NA -- Gitee From 1a92ff3a199ced5508db03a2f1c70fd1a0439c43 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Sat, 22 May 2021 17:13:07 +0800 Subject: [PATCH 02/15] add cpu fault isolation --- 0006-add-cpu-online-fault-isolation.patch | 75 +++++++++++-------- ...add-trace-print-and-add-sqlite-store.patch | 4 +- 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 44bae48..82db16f 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -1,6 +1,6 @@ -From fcdc2d2f99def2745c261a085f7aacc8ce106246 Mon Sep 17 00:00:00 2001 +From 8b0ac32994b284676e572f43ed1a9734fedc475d Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Fri, 21 May 2021 16:28:15 +0800 +Date: Sat, 22 May 2021 17:06:50 +0800 Subject: [PATCH 1/2] add cpu online fault isolation Add cpu online fault isolation, when CE/UCE occurs, we choose to offline @@ -10,16 +10,16 @@ Signed-off-by: Luo Shengwei --- .travis.yml | 2 +- Makefile.am | 6 +- - configure.ac | 11 ++ + configure.ac | 11 + misc/rasdaemon.env | 15 ++ - queue.c | 121 ++++++++++++ + queue.c | 121 +++++++++++ queue.h | 42 ++++ ras-arm-handler.c | 73 +++++++ - ras-cpu-isolation.c | 469 ++++++++++++++++++++++++++++++++++++++++++++ - ras-cpu-isolation.h | 70 +++++++ + ras-cpu-isolation.c | 483 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.h | 71 +++++++ ras-events.c | 8 + ras-record.h | 5 + - 11 files changed, 820 insertions(+), 2 deletions(-) + 11 files changed, 835 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c @@ -379,10 +379,10 @@ index 2f170e2..9f9302e 100644 ras_store_arm_record(ras, &ev); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..857aa45 +index 0000000..a3edd6d --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,469 @@ +@@ -0,0 +1,483 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -409,10 +409,11 @@ index 0000000..857aa45 +#include "ras-cpu-isolation.h" + +static struct cpu_info* cpu_infos = NULL; -+static unsigned int cpu_nums, cores_per_socket, cores_per_die; ++static unsigned int ncores, cpu_nums, cores_per_socket, cores_per_die; ++static unsigned int sockets = 1, dies = 1; +static const char* cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; +static const char* core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; -+static const char* node_path = "/sys/devices/system/node/node%d/cpulist"; ++static const char* node_path = "/sys/devices/system/node/possible"; + +static const struct param normal_units[] = { + { "", 1 }, @@ -431,7 +432,7 @@ index 0000000..857aa45 + .name = "CPU_CE_THRESHOLD", + .units = normal_units, + .value = 18, -+ .limit = 100 ++ .limit = 50 +}; + +static struct isolation_param cpu_limit = { @@ -470,12 +471,11 @@ index 0000000..857aa45 + return fd; +} + -+static void set_socket(void) ++static void get_sockets(void) +{ -+ int ncores, fd; -+ int begin, end; -+ char buf[20]; -+ ncores = sysconf(_SC_NPROCESSORS_CONF); ++ int fd; ++ char buf[MAX_BUF_LEN] = ""; ++ char pre_buf[MAX_BUF_LEN] = ""; + cores_per_socket = ncores; + + for (int i = 0; i < ncores; ++i) { @@ -487,20 +487,26 @@ index 0000000..857aa45 + close(fd); + continue; + } -+ if (sscanf(buf, "%d-%d", &begin, &end) == 2) { -+ cores_per_socket = end > begin ? end - begin + 1 : ncores; -+ close(fd); -+ return; ++ if (strlen(pre_buf) > 0) { ++ if (strcmp(pre_buf, buf) != 0) { ++ sockets++; ++ strcpy(pre_buf, buf); ++ } + } ++ else { ++ strcpy(pre_buf, buf); ++ } ++ close(fd); + } ++ ++ cores_per_socket = ncores / sockets; +} + -+static void set_die(void) ++static void get_dies(void) +{ + int fd, begin, end; -+ char buf[20]; -+ cores_per_die = 0; -+ fd = open_sys_file(0, O_RDONLY, node_path); ++ char buf[20] = ""; ++ fd = open(node_path, O_RDONLY); + + if (fd == -1) { + return; @@ -508,15 +514,17 @@ index 0000000..857aa45 + + if (read(fd, buf, sizeof(buf))) { + if (sscanf(buf, "%d-%d", &begin, &end) == 2) { -+ cores_per_die = end > begin ? end - begin + 1 : 0; ++ dies = end > begin ? end - begin + 1 : 1; + } + } + + close(fd); ++ cores_per_die = ncores / dies; +} + +static void init_cpu_info(unsigned int cpus) +{ ++ ncores = sysconf(_SC_NPROCESSORS_CONF); + cpu_nums = cpus; + cpu_infos = calloc(sizeof(*cpu_infos), cpus); + @@ -531,8 +539,8 @@ index 0000000..857aa45 + } + /* set limit of offlined cpu limit according to number of cpu */ + cpu_limit.limit = cpus / 3; -+ set_socket(); -+ set_die(); ++ get_sockets(); ++ get_dies(); +} + +static void check_config(struct isolation_param *config) @@ -790,6 +798,12 @@ index 0000000..857aa45 +static unsigned get_cpu_index(int64_t mpidr) +{ + unsigned core_id, socket_id, die_id, cpu; ++ /* ++ * In the MPIDR: ++ * bit 8:15: core id ++ * bit 19:20: die_id ++ * bit 21:22: socket_id ++ */ + core_id = get_bit_value(mpidr, 8, 8); + socket_id = get_bit_value(mpidr, 21, 2); + die_id = get_bit_value(mpidr, 19, 2); @@ -854,10 +868,10 @@ index 0000000..857aa45 +} diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h new file mode 100644 -index 0000000..f340585 +index 0000000..5ad06cf --- /dev/null +++ b/ras-cpu-isolation.h -@@ -0,0 +1,70 @@ +@@ -0,0 +1,71 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -878,6 +892,7 @@ index 0000000..f340585 +#include "queue.h" + +#define MAX_PATH_LEN 100 ++#define MAX_BUF_LEN 1024 + +struct param { + char *name; diff --git a/0007-add-trace-print-and-add-sqlite-store.patch b/0007-add-trace-print-and-add-sqlite-store.patch index 3b306fe..8db631f 100644 --- a/0007-add-trace-print-and-add-sqlite-store.patch +++ b/0007-add-trace-print-and-add-sqlite-store.patch @@ -1,6 +1,6 @@ -From 6192d18c563a0610142488d734a5993fe8dff077 Mon Sep 17 00:00:00 2001 +From 921f9d9983c02e35d4fab148bceb55451f764965 Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Fri, 21 May 2021 16:29:32 +0800 +Date: Sat, 22 May 2021 17:07:22 +0800 Subject: [PATCH 2/2] add trace print of new information and add it to sqilte Since we add new information of the event, we add trace print and store it to -- Gitee From b837009f6f8dda9f3c44516f11476005816a85d3 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Mon, 24 May 2021 09:36:18 +0800 Subject: [PATCH 03/15] modify pointer format --- 0006-add-cpu-online-fault-isolation.patch | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 82db16f..0c64795 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -307,7 +307,7 @@ index 2f170e2..9f9302e 100644 + return (value & 0xf) && !(value & (0x1 << 2)); +} + -+static int count_errors(struct event_format *event, const uint8_t* data, int len) ++static int count_errors(struct event_format *event, const uint8_t *data, int len) +{ + /* + * According to UEFI_2_9_2021_03_18 specification chapter N2.4.4, @@ -408,12 +408,12 @@ index 0000000..a3edd6d +#include "ras-logger.h" +#include "ras-cpu-isolation.h" + -+static struct cpu_info* cpu_infos = NULL; ++static struct cpu_info *cpu_infos = NULL; +static unsigned int ncores, cpu_nums, cores_per_socket, cores_per_die; +static unsigned int sockets = 1, dies = 1; -+static const char* cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; -+static const char* core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; -+static const char* node_path = "/sys/devices/system/node/possible"; ++static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; ++static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; ++static const char *node_path = "/sys/devices/system/node/possible"; + +static const struct param normal_units[] = { + { "", 1 }, @@ -554,7 +554,7 @@ index 0000000..a3edd6d + return; +} + -+static int parse_ul_config(struct isolation_param *config, char* env, unsigned long* value) ++static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value) +{ + int env_size, has_unit = 0; + @@ -605,7 +605,7 @@ index 0000000..a3edd6d + +static void init_config(struct isolation_param *config) +{ -+ char* env = getenv(config->name); ++ char *env = getenv(config->name); + unsigned long value = 0; + + if (parse_ul_config(config, env, &value) < 0) { @@ -812,7 +812,7 @@ index 0000000..a3edd6d + return cpu; +} + -+void ras_record_cpu_error(struct error_info* err_info, int64_t mpidr) ++void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr) +{ + int cur_cpu_status; + unsigned cpu; -- Gitee From 92f43b17051a11b61418b284c5d7b946c83f5c19 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Mon, 24 May 2021 22:49:40 +0800 Subject: [PATCH 04/15] fix bug and modify format according to review --- 0006-add-cpu-online-fault-isolation.patch | 153 ++++++++++-------- ...add-trace-print-and-add-sqlite-store.patch | 23 +-- 2 files changed, 94 insertions(+), 82 deletions(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 0c64795..f5cbe14 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -1,6 +1,6 @@ -From 8b0ac32994b284676e572f43ed1a9734fedc475d Mon Sep 17 00:00:00 2001 +From 4597fe55384ff2ffceca071f203710a0a5e95f14 Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Sat, 22 May 2021 17:06:50 +0800 +Date: Mon, 24 May 2021 22:42:42 +0800 Subject: [PATCH 1/2] add cpu online fault isolation Add cpu online fault isolation, when CE/UCE occurs, we choose to offline @@ -12,14 +12,14 @@ Signed-off-by: Luo Shengwei Makefile.am | 6 +- configure.ac | 11 + misc/rasdaemon.env | 15 ++ - queue.c | 121 +++++++++++ + queue.c | 120 +++++++++++ queue.h | 42 ++++ - ras-arm-handler.c | 73 +++++++ - ras-cpu-isolation.c | 483 ++++++++++++++++++++++++++++++++++++++++++++ - ras-cpu-isolation.h | 71 +++++++ + ras-arm-handler.c | 74 +++++++ + ras-cpu-isolation.c | 491 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.h | 74 +++++++ ras-events.c | 8 + ras-record.h | 5 + - 11 files changed, 835 insertions(+), 2 deletions(-) + 11 files changed, 846 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c @@ -112,10 +112,10 @@ index 12fd766..7498992 100644 +CPU_ISOLATION_LIMIT="10" diff --git a/queue.c b/queue.c new file mode 100644 -index 0000000..1ef8688 +index 0000000..65ca5f9 --- /dev/null +++ b/queue.c -@@ -0,0 +1,121 @@ +@@ -0,0 +1,120 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -201,12 +201,11 @@ index 0000000..1ef8688 + +int pop(struct link_queue *queue) +{ -+ struct queue_node *tmp = NULL; -+ -+ if (is_empty(queue)) { ++ if (queue == NULL || is_empty(queue)) { + return -1; + } + ++ struct queue_node *tmp = NULL; + tmp = queue->head; + queue->head = queue->head->next; + free(tmp); @@ -217,7 +216,7 @@ index 0000000..1ef8688 + +struct queue_node* front(struct link_queue *queue) +{ -+ if (is_empty(queue)) { ++ if (queue == NULL) { + return NULL; + } + @@ -230,7 +229,7 @@ index 0000000..1ef8688 + node = (struct queue_node*) malloc(sizeof(struct queue_node)); + + if (node != NULL) { -+ node->time = time; ++ node->time = time; + node->value = value; + node->next = NULL; + } @@ -287,16 +286,16 @@ index 0000000..b60aa81 +#endif \ No newline at end of file diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 2f170e2..9f9302e 100644 +index 2f170e2..23f97cd 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c -@@ -20,6 +20,44 @@ +@@ -20,6 +20,45 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-report.h" +#include "ras-cpu-isolation.h" + -+ ++#ifdef HAVE_CPU_FAULT_ISOLATION +static int is_core_failure(unsigned long value) +{ + /* @@ -332,10 +331,11 @@ index 2f170e2..9f9302e 100644 + } + return num; +} ++#endif int ras_arm_event_handler(struct trace_seq *s, struct pevent_record *record, -@@ -78,6 +116,41 @@ int ras_arm_event_handler(struct trace_seq *s, +@@ -78,6 +117,41 @@ int ras_arm_event_handler(struct trace_seq *s, ev.psci_state = val; trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); @@ -379,10 +379,10 @@ index 2f170e2..9f9302e 100644 ras_store_arm_record(ras, &ev); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..a3edd6d +index 0000000..aa24f4f --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,483 @@ +@@ -0,0 +1,491 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -410,28 +410,28 @@ index 0000000..a3edd6d + +static struct cpu_info *cpu_infos = NULL; +static unsigned int ncores, cpu_nums, cores_per_socket, cores_per_die; -+static unsigned int sockets = 1, dies = 1; ++static unsigned int sockets, dies = 1; +static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; +static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; +static const char *node_path = "/sys/devices/system/node/possible"; + +static const struct param normal_units[] = { -+ { "", 1 }, -+ {} ++ { "", 1 }, ++ {} +}; + +static const struct param cycle_units[] = { -+ { "d", 24 * 60 * 60 }, -+ { "h", 60 * 60 }, -+ { "m", 60 }, -+ { "s", 1 }, -+ {} ++ { "d", 24 * 60 * 60 }, ++ { "h", 60 * 60 }, ++ { "m", 60 }, ++ { "s", 1 }, ++ {} +}; + +static struct isolation_param threshold = { -+ .name = "CPU_CE_THRESHOLD", ++ .name = "CPU_CE_THRESHOLD", + .units = normal_units, -+ .value = 18, ++ .value = 18, + .limit = 50 +}; + @@ -451,21 +451,21 @@ index 0000000..a3edd6d + +static const char *cpu_state[] = { + [CPU_OFFLINE] = "offline", -+ [CPU_ONLINE] = "online", -+ [CPU_OFFLINE_FAILED] = "offline-failed", ++ [CPU_ONLINE] = "online", ++ [CPU_OFFLINE_FAILED] = "offline-failed", + [CPU_UNKNOWN] = "unknown" +}; + +static int open_sys_file(unsigned cpu, int __oflag, const char *format) +{ + int fd; -+ char buf[MAX_PATH_LEN]; ++ char buf[MAX_PATH_LEN] = ""; + snprintf(buf, sizeof(buf), format, cpu); -+ fd = open(buf, __oflag); ++ fd = open(buf, __oflag); + -+ if (fd == -1) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf); -+ return -1; ++ if (fd == -1) { ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, buf); ++ return -1; + } + + return fd; @@ -473,32 +473,39 @@ index 0000000..a3edd6d + +static void get_sockets(void) +{ -+ int fd; ++ int fd, j; + char buf[MAX_BUF_LEN] = ""; -+ char pre_buf[MAX_BUF_LEN] = ""; + cores_per_socket = ncores; ++ struct cpu_set *cpu_sets = calloc(sizeof(*cpu_sets), ncores); ++ ++ if (!cpu_sets) { ++ log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__); ++ return; ++ } + + for (int i = 0; i < ncores; ++i) { + fd = open_sys_file(i, O_RDONLY, core_siblings_list_path); + if (fd == -1) { + continue; + } ++ memset(buf, '\0', strlen(buf)); + if (read(fd, buf, sizeof(buf)) <= 0) { + close(fd); + continue; + } -+ if (strlen(pre_buf) > 0) { -+ if (strcmp(pre_buf, buf) != 0) { -+ sockets++; -+ strcpy(pre_buf, buf); ++ for (j = 0; j < sockets; ++j) { ++ if (strcmp(cpu_sets[j].buf, buf) == 0) { ++ break; + } + } -+ else { -+ strcpy(pre_buf, buf); ++ if (j == sockets) { ++ strcpy(cpu_sets[sockets].buf, buf); ++ sockets++; + } + close(fd); + } + ++ free(cpu_sets); + cores_per_socket = ncores / sockets; +} + @@ -506,6 +513,7 @@ index 0000000..a3edd6d +{ + int fd, begin, end; + char buf[20] = ""; ++ cores_per_die = ncores; + fd = open(node_path, O_RDONLY); + + if (fd == -1) { @@ -563,7 +571,7 @@ index 0000000..a3edd6d + } + + env_size = strlen(env); -+ char* unit = NULL; ++ char *unit = NULL; + unit = env + env_size - 1; + + if (isalpha(*unit)) { @@ -586,16 +594,16 @@ index 0000000..a3edd6d + + if (has_unit) { + for (const struct param *units = config->units; units->name; units++) { -+ /* value character and unit character are both valid */ -+ if (!strcasecmp(unit, units->name)) { -+ if (*value > ULONG_MAX / units->value) { ++ /* value character and unit character are both valid */ ++ if (!strcasecmp(unit, units->name)) { ++ if (*value > ULONG_MAX / units->value) { + log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); + return -1; + } + *value = (*value) * units->value; + return 0; -+ } -+ } ++ } ++ } + log(TERM, LOG_ERR, "Invalid unit %s\n", unit); + return -1; + } @@ -618,7 +626,7 @@ index 0000000..a3edd6d + check_config(config); +} + -+void ras_error_count_init(unsigned int cpus) ++void ras_error_count_init(unsigned cpus) +{ + init_cpu_info(cpus); + init_config(&threshold); @@ -666,20 +674,20 @@ index 0000000..a3edd6d + int fd, rc; + char buf[2]; + cpu_infos[cpu].state = CPU_OFFLINE_FAILED; -+ fd = open_sys_file(cpu, O_RDWR, cpu_path_format); ++ fd = open_sys_file(cpu, O_RDWR, cpu_path_format); + + if (fd == -1) { + return HANDLE_FAILED; + } + + strcpy(buf, "0"); -+ rc = write(fd, buf, strlen(buf)); ++ rc = write(fd, buf, strlen(buf)); + close(fd); + + if (rc < 0) { -+ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); ++ log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); + return HANDLE_FAILED; -+ } ++ } + /* check wthether the cpu is isolated successfully */ + else { + int num = 0; @@ -720,7 +728,7 @@ index 0000000..a3edd6d + if (cpu_infos[cpu].ce_nums >= threshold.value) { + log(TERM, LOG_INFO, "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", + threshold.value, cpu); -+ *ret = do_cpu_offline(cpu); ++ *ret = do_cpu_offline(cpu); + } +} + @@ -780,19 +788,19 @@ index 0000000..a3edd6d + +static unsigned long get_bit_value(int64_t value, int offset, int size) +{ -+ if (size <= 0 || offset < 0) { -+ return 0; -+ } ++ if (size <= 0 || offset < 0) { ++ return 0; ++ } + -+ value >>= offset; -+ unsigned long res = 0; -+ int i = 0; ++ value >>= offset; ++ unsigned long res = 0; ++ int i = 0; + -+ while (i < size) { -+ res |= (value & (0x1 << (i++))); -+ } ++ while (i < size) { ++ res |= (value & (0x1 << (i++))); ++ } + -+ return res; ++ return res; +} + +static unsigned get_cpu_index(int64_t mpidr) @@ -809,7 +817,7 @@ index 0000000..a3edd6d + die_id = get_bit_value(mpidr, 19, 2); + cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die; + -+ return cpu; ++ return cpu; +} + +void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr) @@ -862,16 +870,16 @@ index 0000000..a3edd6d + } + + log(TERM, LOG_INFO, "Result of offlining cpu %d: %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); ++ cpu, cpu_state[cpu_infos[cpu].state]); + + return; +} diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h new file mode 100644 -index 0000000..5ad06cf +index 0000000..c8dec50 --- /dev/null +++ b/ras-cpu-isolation.h -@@ -0,0 +1,71 @@ +@@ -0,0 +1,74 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -937,6 +945,9 @@ index 0000000..5ad06cf + enum error_type err_type; +}; + ++struct cpu_set { ++ char buf[MAX_BUF_LEN]; ++}; + +void ras_error_count_init(unsigned cpus); +void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); diff --git a/0007-add-trace-print-and-add-sqlite-store.patch b/0007-add-trace-print-and-add-sqlite-store.patch index 8db631f..51f6f01 100644 --- a/0007-add-trace-print-and-add-sqlite-store.patch +++ b/0007-add-trace-print-and-add-sqlite-store.patch @@ -1,6 +1,6 @@ -From 921f9d9983c02e35d4fab148bceb55451f764965 Mon Sep 17 00:00:00 2001 +From afabf09e8888dde365ee1b9d7af9c29524073e6a Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Sat, 22 May 2021 17:07:22 +0800 +Date: Mon, 24 May 2021 22:43:10 +0800 Subject: [PATCH 2/2] add trace print of new information and add it to sqilte Since we add new information of the event, we add trace print and store it to @@ -8,28 +8,29 @@ Sqlite. Signed-off-by: Luo Shengwei --- - ras-arm-handler.c | 9 +++++++++ - ras-record.c | 8 ++++++++ - 2 files changed, 17 insertions(+) + ras-arm-handler.c | 10 ++++++++++ + ras-record.c | 8 ++++++++ + 2 files changed, 18 insertions(+) diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 9f9302e..58b33f1 100644 +index 23f97cd..ef6d88d 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c -@@ -22,6 +22,12 @@ - #include "ras-report.h" +@@ -23,6 +23,13 @@ #include "ras-cpu-isolation.h" + #ifdef HAVE_CPU_FAULT_ISOLATION +static void trace_print_hex(struct trace_seq *s, const uint8_t *buf, int buf_len) +{ + for (int i = 0; i < buf_len; ++i) { + trace_seq_printf(s, "%2.2x", buf[i]); + } +} - ++ static int is_core_failure(unsigned long value) { -@@ -135,6 +141,7 @@ int ras_arm_event_handler(struct trace_seq *s, + /* +@@ -136,6 +143,7 @@ int ras_arm_event_handler(struct trace_seq *s, case GHES_SEV_PANIC: ev.severity = "Fatal"; } @@ -37,7 +38,7 @@ index 9f9302e..58b33f1 100644 if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { int len, nums; -@@ -142,6 +149,8 @@ int ras_arm_event_handler(struct trace_seq *s, +@@ -143,6 +151,8 @@ int ras_arm_event_handler(struct trace_seq *s, if (!ev.error_info) return -1; ev.length = len; -- Gitee From a1d2bd8e3baa34019c1e0e33820aafbfed4ec93d Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 09:22:08 +0800 Subject: [PATCH 05/15] fix divide by zero bug in [get_sockets] function --- 0006-add-cpu-online-fault-isolation.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index f5cbe14..07c2a16 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -475,7 +475,7 @@ index 0000000..aa24f4f +{ + int fd, j; + char buf[MAX_BUF_LEN] = ""; -+ cores_per_socket = ncores; ++ + struct cpu_set *cpu_sets = calloc(sizeof(*cpu_sets), ncores); + + if (!cpu_sets) { @@ -506,7 +506,7 @@ index 0000000..aa24f4f + } + + free(cpu_sets); -+ cores_per_socket = ncores / sockets; ++ cores_per_socket = (sockets > 0 ? ncores / sockets : ncores); +} + +static void get_dies(void) -- Gitee From bcc6d148f1fce16fc7abebd06c0e99804792ee07 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 09:23:48 +0800 Subject: [PATCH 06/15] fix --- 0006-add-cpu-online-fault-isolation.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 07c2a16..46c3044 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -475,7 +475,7 @@ index 0000000..aa24f4f +{ + int fd, j; + char buf[MAX_BUF_LEN] = ""; -+ ++ cores_per_socket = ncores; + struct cpu_set *cpu_sets = calloc(sizeof(*cpu_sets), ncores); + + if (!cpu_sets) { -- Gitee From 12cf828e889e9eaa8ecb7c2393498b69728be3b9 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 09:22:08 +0800 Subject: [PATCH 07/15] fix divide by zero bug in [get_sockets] function fix --- 0006-add-cpu-online-fault-isolation.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index f5cbe14..46c3044 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -506,7 +506,7 @@ index 0000000..aa24f4f + } + + free(cpu_sets); -+ cores_per_socket = ncores / sockets; ++ cores_per_socket = (sockets > 0 ? ncores / sockets : ncores); +} + +static void get_dies(void) -- Gitee From f281d33cd06141665fcb3363a4400d5c019f9aa6 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 15:42:09 +0800 Subject: [PATCH 08/15] modify according to review --- 0006-add-cpu-online-fault-isolation.patch | 344 +++++++++--------- ...add-trace-print-and-add-sqlite-store.patch | 10 +- 2 files changed, 187 insertions(+), 167 deletions(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 46c3044..518ea3e 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -1,6 +1,6 @@ -From 4597fe55384ff2ffceca071f203710a0a5e95f14 Mon Sep 17 00:00:00 2001 +From 73853ffd21b312176b30ea95b77b0643cfd5004b Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Mon, 24 May 2021 22:42:42 +0800 +Date: Tue, 25 May 2021 15:38:35 +0800 Subject: [PATCH 1/2] add cpu online fault isolation Add cpu online fault isolation, when CE/UCE occurs, we choose to offline @@ -11,15 +11,15 @@ Signed-off-by: Luo Shengwei .travis.yml | 2 +- Makefile.am | 6 +- configure.ac | 11 + - misc/rasdaemon.env | 15 ++ - queue.c | 120 +++++++++++ - queue.h | 42 ++++ - ras-arm-handler.c | 74 +++++++ - ras-cpu-isolation.c | 491 ++++++++++++++++++++++++++++++++++++++++++++ - ras-cpu-isolation.h | 74 +++++++ - ras-events.c | 8 + + misc/rasdaemon.env | 16 ++ + queue.c | 126 +++++++++++ + queue.h | 43 ++++ + ras-arm-handler.c | 72 +++++++ + ras-cpu-isolation.c | 501 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.h | 76 +++++++ + ras-events.c | 10 + ras-record.h | 5 + - 11 files changed, 846 insertions(+), 2 deletions(-) + 11 files changed, 866 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c @@ -88,16 +88,17 @@ index 2d6c59c..a682bb9 100644 + CPU fault isolation : $USE_CPU_FAULT_ISOLATION EOF diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..7498992 100644 +index 12fd766..ad6a96f 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -27,3 +27,18 @@ PAGE_CE_THRESHOLD="50" +@@ -27,3 +27,19 @@ PAGE_CE_THRESHOLD="50" # soft-then-hard First try to soft offline, then try hard offlining. # Note: default offline choice is "soft". PAGE_CE_ACTION="soft" + +# CPU Online Fault Isolation +# Specify the threshold of corrected errors. ++CPU_ISOLATION_ENABLE="yes" +# +# Format: +# [0-9]+[unit] @@ -112,10 +113,10 @@ index 12fd766..7498992 100644 +CPU_ISOLATION_LIMIT="10" diff --git a/queue.c b/queue.c new file mode 100644 -index 0000000..65ca5f9 +index 0000000..92f3d3c --- /dev/null +++ b/queue.c -@@ -0,0 +1,120 @@ +@@ -0,0 +1,126 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -176,14 +177,22 @@ index 0000000..65ca5f9 + free(tmp); + } + -+ free(queue); ++ queue->head = NULL; ++ queue->tail = NULL; ++ queue->size = 0; +} + -+int push(struct link_queue *queue, struct queue_node *node) -+{ -+ if (!queue || !node) { -+ return -1; ++void free_queue(struct link_queue *queue) { ++ clear_queue(queue); ++ ++ if (queue) { ++ free(queue); + } ++} ++ ++/* It should be guranteed that the param is not NULL */ ++void push(struct link_queue *queue, struct queue_node *node) ++{ + /* there is no element in the queue */ + if (queue->head == NULL) { + queue->head = node; @@ -195,8 +204,6 @@ index 0000000..65ca5f9 + + queue->tail = node; + (queue->size)++; -+ -+ return 0; +} + +int pop(struct link_queue *queue) @@ -238,10 +245,10 @@ index 0000000..65ca5f9 +} diff --git a/queue.h b/queue.h new file mode 100644 -index 0000000..b60aa81 +index 0000000..9684c58 --- /dev/null +++ b/queue.h -@@ -0,0 +1,42 @@ +@@ -0,0 +1,43 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -277,7 +284,8 @@ index 0000000..b60aa81 +int is_empty(struct link_queue *queue); +struct link_queue* init_queue(void); +void clear_queue(struct link_queue *queue); -+int push(struct link_queue *queue, struct queue_node *node); ++void free_queue(struct link_queue *queue); ++void push(struct link_queue *queue, struct queue_node *node); +int pop(struct link_queue *queue); +struct queue_node* front(struct link_queue *queue); +struct queue_node* node_create(time_t time, unsigned value); @@ -286,10 +294,10 @@ index 0000000..b60aa81 +#endif \ No newline at end of file diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 2f170e2..23f97cd 100644 +index 2f170e2..8a0c7af 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c -@@ -20,6 +20,45 @@ +@@ -20,6 +20,43 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-report.h" @@ -313,21 +321,19 @@ index 2f170e2..23f97cd 100644 + * the length of struct processor error information is 32, the byte + * length of the Flags field is 1, and the byte offset is 7 in the struct. + */ -+ int pei_err_size = 32; -+ int field_size = 1; + int cur_offset = 7; + unsigned long value; + int num = 0; -+ if (len % pei_err_size != 0) { ++ if (len % PEI_ERR_SIZE != 0) { + log(TERM, LOG_ERR, "the event data does not match to the ARM Processor Error Information Structure\n"); + return num; + } + while (cur_offset < len) { -+ value = pevent_read_number(event->pevent, data+cur_offset, field_size); ++ value = pevent_read_number(event->pevent, data+cur_offset, FLAGS_SIZE); + if (is_core_failure(value)) { + num++; + } -+ cur_offset += pei_err_size; ++ cur_offset += PEI_ERR_SIZE; + } + return num; +} @@ -335,7 +341,7 @@ index 2f170e2..23f97cd 100644 int ras_arm_event_handler(struct trace_seq *s, struct pevent_record *record, -@@ -78,6 +117,41 @@ int ras_arm_event_handler(struct trace_seq *s, +@@ -78,6 +115,41 @@ int ras_arm_event_handler(struct trace_seq *s, ev.psci_state = val; trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); @@ -379,10 +385,10 @@ index 2f170e2..23f97cd 100644 ras_store_arm_record(ras, &ev); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..aa24f4f +index 0000000..8be4c08 --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,491 @@ +@@ -0,0 +1,501 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -409,22 +415,22 @@ index 0000000..aa24f4f +#include "ras-cpu-isolation.h" + +static struct cpu_info *cpu_infos = NULL; -+static unsigned int ncores, cpu_nums, cores_per_socket, cores_per_die; ++static unsigned int ncores, cores_per_socket, cores_per_die; +static unsigned int sockets, dies = 1; +static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; +static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; +static const char *node_path = "/sys/devices/system/node/possible"; + +static const struct param normal_units[] = { -+ { "", 1 }, ++ { "", 1 }, + {} +}; + +static const struct param cycle_units[] = { -+ { "d", 24 * 60 * 60 }, -+ { "h", 60 * 60 }, -+ { "m", 60 }, -+ { "s", 1 }, ++ { "d", 24 * 60 * 60 }, ++ { "h", 60 * 60 }, ++ { "m", 60 }, ++ { "s", 1 }, + {} +}; + @@ -437,9 +443,7 @@ index 0000000..aa24f4f + +static struct isolation_param cpu_limit = { + .name = "CPU_ISOLATION_LIMIT", -+ .units = normal_units, -+ .value = 10, -+ .limit = 30 ++ .units = normal_units +}; + +static struct isolation_param cycle = { @@ -471,16 +475,16 @@ index 0000000..aa24f4f + return fd; +} + -+static void get_sockets(void) ++static int get_sockets(void) +{ + int fd, j; + char buf[MAX_BUF_LEN] = ""; + cores_per_socket = ncores; -+ struct cpu_set *cpu_sets = calloc(sizeof(*cpu_sets), ncores); ++ struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores); + + if (!cpu_sets) { + log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__); -+ return; ++ return -1; + } + + for (int i = 0; i < ncores; ++i) { @@ -506,10 +510,12 @@ index 0000000..aa24f4f + } + + free(cpu_sets); -+ cores_per_socket = (sockets > 0 ? ncores / sockets : ncores); ++ cores_per_socket = sockets > 0 ? ncores / sockets : ncores; ++ ++ return 0; +} + -+static void get_dies(void) ++static int get_dies(void) +{ + int fd, begin, end; + char buf[20] = ""; @@ -517,7 +523,7 @@ index 0000000..aa24f4f + fd = open(node_path, O_RDONLY); + + if (fd == -1) { -+ return; ++ return -1; + } + + if (read(fd, buf, sizeof(buf))) { @@ -528,38 +534,70 @@ index 0000000..aa24f4f + + close(fd); + cores_per_die = ncores / dies; ++ ++ return 0; ++} ++ ++static int get_cpu_status(unsigned cpu) ++{ ++ int fd; ++ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); ++ ++ if (fd == -1) { ++ return CPU_UNKNOWN; ++ } ++ ++ int num; ++ ++ if (read(fd, &num, 1) <= 0) { ++ num = CPU_UNKNOWN; ++ } ++ else { ++ num = num - '0'; ++ } ++ ++ close(fd); ++ ++ return num; +} + -+static void init_cpu_info(unsigned int cpus) ++static int init_cpu_info(unsigned int cpus) +{ -+ ncores = sysconf(_SC_NPROCESSORS_CONF); -+ cpu_nums = cpus; -+ cpu_infos = calloc(sizeof(*cpu_infos), cpus); ++ ncores = cpus; ++ cpu_infos = (struct cpu_info *) malloc(sizeof(*cpu_infos) * cpus); + + if (!cpu_infos) { + log(TERM, LOG_ERR, "Failed to allocate memory for cpu infos in %s.\n", __func__); -+ return; ++ return -1; + } + + for (unsigned int i = 0; i < cpus; ++i) { -+ cpu_infos[i].state = CPU_ONLINE; ++ cpu_infos[i].state = get_cpu_status(i); + cpu_infos[i].ce_queue = init_queue(); ++ if (cpu_infos[i].ce_queue == NULL) { ++ log(TERM, LOG_ERR, "Failed to allocate memory for cpu ce queue in %s.\n", __func__); ++ return -1; ++ } + } + /* set limit of offlined cpu limit according to number of cpu */ + cpu_limit.limit = cpus / 3; -+ get_sockets(); -+ get_dies(); ++ cpu_limit.value = cpus / 3; ++ ++ if (get_sockets() < 0 || get_dies() < 0) { ++ log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n"); ++ return -1; ++ } ++ ++ return 0; +} + +static void check_config(struct isolation_param *config) +{ -+ if (config->value >= config->limit) { ++ if (config->value > config->limit) { + log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", + config->value, config->limit); + config->value = config->limit; + } -+ -+ return; +} + +static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value) @@ -626,47 +664,39 @@ index 0000000..aa24f4f + check_config(config); +} + -+void ras_error_count_init(unsigned cpus) ++static int check_config_status(void) ++{ ++ char *env = getenv("CPU_ISOLATION_ENABLE"); ++ ++ if (env == NULL || strcasecmp(env, "yes")) { ++ log(TERM, LOG_ERR, "Config of cpu fault isolation is not on, exit\n"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++int ras_error_count_init(unsigned cpus) +{ -+ init_cpu_info(cpus); ++ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { ++ return -1; ++ } ++ + init_config(&threshold); + init_config(&cpu_limit); + init_config(&cycle); ++ ++ return 0; +} + +void cpu_infos_free(void) +{ + if (cpu_infos) { -+ for (int i = 0; i < cpu_nums; ++i) { -+ clear_queue(cpu_infos[i].ce_queue); ++ for (int i = 0; i < ncores; ++i) { ++ free_queue(cpu_infos[i].ce_queue); + } + free(cpu_infos); + } -+ -+ return; -+} -+ -+static int get_cpu_status(unsigned cpu) -+{ -+ int fd; -+ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); -+ -+ if (fd == -1) { -+ return CPU_UNKNOWN; -+ } -+ -+ int num = 0; -+ -+ if (read(fd, &num, 1) <= 0) { -+ num = CPU_UNKNOWN; -+ } -+ else { -+ num = num - '0'; -+ } -+ -+ close(fd); -+ -+ return num; +} + +static int do_cpu_offline(unsigned cpu) @@ -682,34 +712,28 @@ index 0000000..aa24f4f + + strcpy(buf, "0"); + rc = write(fd, buf, strlen(buf)); -+ close(fd); + -+ if (rc < 0) { ++ if (rc < 0) { + log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); ++ close(fd); + return HANDLE_FAILED; + } ++ ++ close(fd); + /* check wthether the cpu is isolated successfully */ ++ cpu_infos[cpu].state = get_cpu_status(cpu); ++ ++ if (cpu_infos[cpu].state == CPU_OFFLINE) { ++ return HANDLE_SUCCEED; ++ } + else { -+ int num = 0; -+ num = get_cpu_status(cpu); -+ if (num == CPU_OFFLINE) { -+ return HANDLE_SUCCEED; -+ } -+ else { -+ return HANDLE_FAILED; -+ } -+ ++ return HANDLE_FAILED; + } +} + -+static void do_ce_handler(unsigned cpu, int *ret) ++static int do_ce_handler(unsigned cpu) +{ + struct link_queue *queue = cpu_infos[cpu].ce_queue; -+ -+ if (queue == NULL) { -+ return; -+ } -+ + unsigned tmp; + /* + * Since we just count all error numbers in setted cycle, we store the time @@ -718,41 +742,47 @@ index 0000000..aa24f4f + * exceeds setted cycle, we pop the beginning time and error until the period + * from new beginning time to ending time is less than cycle. + */ -+ while (queue && queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { ++ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { + tmp = queue->head->value; + if (pop(queue) == 0) { + cpu_infos[cpu].ce_nums -= tmp; + } + } + -+ if (cpu_infos[cpu].ce_nums >= threshold.value) { -+ log(TERM, LOG_INFO, "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", ++ if (cpu_infos[cpu].ce_nums >= threshold.value) { ++ log(TERM, LOG_INFO, "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", + threshold.value, cpu); -+ *ret = do_cpu_offline(cpu); -+ } ++ return do_cpu_offline(cpu); ++ } ++ return HANDLE_NOTHING; +} + -+static void do_uce_handler(unsigned cpu, int *ret) ++static int do_uce_handler(unsigned cpu) +{ + if (cpu_infos[cpu].uce_nums > 0) { + log(TERM, LOG_INFO, "Uncorrected Errors occured, try to offline cpu%d\n", cpu); -+ *ret = do_cpu_offline(cpu); ++ return do_cpu_offline(cpu); + } ++ return HANDLE_NOTHING; +} + -+static void error_handler(unsigned cpu, struct error_info *err_info, int *ret) ++static int error_handler(unsigned cpu, struct error_info *err_info) +{ ++ int ret = HANDLE_NOTHING; ++ + switch (err_info->err_type) + { + case CE: -+ do_ce_handler(cpu, ret); ++ ret = do_ce_handler(cpu); + break; + case UCE: -+ do_uce_handler(cpu, ret); ++ ret = do_uce_handler(cpu); + break; + default: + break; + } ++ ++ return ret; +} + +static void record_error_info(unsigned cpu, struct error_info *err_info) @@ -763,18 +793,11 @@ index 0000000..aa24f4f + { + struct queue_node *node = NULL; + node = node_create(err_info->time, err_info->nums); -+ /* if the queue is still NULL, try malloc again */ -+ if (cpu_infos[cpu].ce_queue == NULL) { -+ cpu_infos[cpu].ce_queue = init_queue(); -+ } -+ if (push(cpu_infos[cpu].ce_queue, node) < 0) { -+ /* when the queue is NULL and node is not NULL, free it */ -+ if (node != NULL) { -+ free(node); -+ } -+ log(TERM, LOG_ERR, "Fail to push node to queue\n"); ++ if (node == NULL) { ++ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); + return; + } ++ push(cpu_infos[cpu].ce_queue, node); + cpu_infos[cpu].ce_nums += err_info->nums; + break; + } @@ -786,12 +809,8 @@ index 0000000..aa24f4f + } +} + -+static unsigned long get_bit_value(int64_t value, int offset, int size) ++static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size) +{ -+ if (size <= 0 || offset < 0) { -+ return 0; -+ } -+ + value >>= offset; + unsigned long res = 0; + int i = 0; @@ -807,6 +826,7 @@ index 0000000..aa24f4f +{ + unsigned core_id, socket_id, die_id, cpu; + /* ++ * Adapt to certain BIOS + * In the MPIDR: + * bit 8:15: core id + * bit 19:20: die_id @@ -822,64 +842,60 @@ index 0000000..aa24f4f + +void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr) +{ -+ int cur_cpu_status; + unsigned cpu; -+ int ret = HANDLE_NOTHING; ++ int ret; + + if (!cpu_infos) { ++ log(TERM, LOG_ERR, "Since the cpu_infos which record cpu information is NULL, stop here\n"); + return; + } + + cpu = get_cpu_index(mpidr); + -+ if (cpu >= cpu_nums) { ++ if (cpu >= ncores) { ++ log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); + return; + } + + log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); ++ cpu_infos[cpu].state = get_cpu_status(cpu); + -+ if (cpu_infos[cpu].state == CPU_OFFLINE) { -+ /* user may online the offlined cpu */ -+ cur_cpu_status = get_cpu_status(cpu); -+ if (cur_cpu_status != CPU_ONLINE) { -+ log(TERM, LOG_INFO, "cpu%d is already offlined, ignore\n", cpu); -+ return; -+ } -+ cpu_infos[cpu].state = CPU_ONLINE; ++ if (cpu_infos[cpu].state != CPU_ONLINE) { ++ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu); ++ return; + } + + record_error_info(cpu, err_info); + /* Since user may change cpu state, we get current offlined cpu numbers every recording time. */ -+ if (cpu_nums - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { ++ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { + log(TERM, LOG_WARNING, "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", + cpu_limit.value); + return; + } + -+ error_handler(cpu, err_info, &ret); -+ /* do nothing */ ++ ret = error_handler(cpu, err_info); ++ + if (ret == HANDLE_NOTHING) { -+ return; ++ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); + } -+ -+ if (ret == HANDLE_SUCCEED) { -+ cpu_infos[cpu].state = CPU_OFFLINE; -+ struct link_queue *tmp = cpu_infos[cpu].ce_queue; -+ clear_queue(tmp); -+ cpu_infos[cpu].ce_queue = init_queue(); ++ else if (ret == HANDLE_SUCCEED) { ++ log(TERM, LOG_INFO, "Offline cpu %d succeed, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); ++ clear_queue(cpu_infos[cpu].ce_queue); ++ } ++ else { ++ log(TERM, LOG_INFO, "Offline cpu %d fail, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); + } -+ -+ log(TERM, LOG_INFO, "Result of offlining cpu %d: %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); + + return; +} diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h new file mode 100644 -index 0000000..c8dec50 +index 0000000..9f19d01 --- /dev/null +++ b/ras-cpu-isolation.h -@@ -0,0 +1,74 @@ +@@ -0,0 +1,76 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -901,6 +917,8 @@ index 0000000..c8dec50 + +#define MAX_PATH_LEN 100 +#define MAX_BUF_LEN 1024 ++#define PEI_ERR_SIZE 32 ++#define FLAGS_SIZE 1 + +struct param { + char *name; @@ -949,14 +967,14 @@ index 0000000..c8dec50 + char buf[MAX_BUF_LEN]; +}; + -+void ras_error_count_init(unsigned cpus); ++int ras_error_count_init(unsigned cpus); +void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); +void cpu_infos_free(void); + +#endif \ No newline at end of file diff --git a/ras-events.c b/ras-events.c -index 471d25d..31c4170 100644 +index 471d25d..2609c7b 100644 --- a/ras-events.c +++ b/ras-events.c @@ -40,6 +40,7 @@ @@ -967,18 +985,20 @@ index 471d25d..31c4170 100644 /* * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -874,6 +875,10 @@ int handle_ras_events(int record_events) +@@ -874,6 +875,12 @@ int handle_ras_events(int record_events) cpus = get_num_cpus(ras); +#ifdef HAVE_CPU_FAULT_ISOLATION -+ ras_error_count_init(cpus); ++ if (ras_error_count_init(cpus) < 0) { ++ goto err; ++ } +#endif + #ifdef HAVE_MCE rc = register_mce_handler(ras, cpus); if (rc) -@@ -990,6 +995,9 @@ err: +@@ -990,6 +997,9 @@ err: } free(ras); } diff --git a/0007-add-trace-print-and-add-sqlite-store.patch b/0007-add-trace-print-and-add-sqlite-store.patch index 51f6f01..cd415f3 100644 --- a/0007-add-trace-print-and-add-sqlite-store.patch +++ b/0007-add-trace-print-and-add-sqlite-store.patch @@ -1,6 +1,6 @@ -From afabf09e8888dde365ee1b9d7af9c29524073e6a Mon Sep 17 00:00:00 2001 +From 0afe0eb14aefc9ae544e1a0fe4d2559fd804aec0 Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Mon, 24 May 2021 22:43:10 +0800 +Date: Tue, 25 May 2021 15:39:05 +0800 Subject: [PATCH 2/2] add trace print of new information and add it to sqilte Since we add new information of the event, we add trace print and store it to @@ -13,7 +13,7 @@ Signed-off-by: Luo Shengwei 2 files changed, 18 insertions(+) diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 23f97cd..ef6d88d 100644 +index 8a0c7af..b4bcc6e 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -23,6 +23,13 @@ @@ -30,7 +30,7 @@ index 23f97cd..ef6d88d 100644 static int is_core_failure(unsigned long value) { /* -@@ -136,6 +143,7 @@ int ras_arm_event_handler(struct trace_seq *s, +@@ -134,6 +141,7 @@ int ras_arm_event_handler(struct trace_seq *s, case GHES_SEV_PANIC: ev.severity = "Fatal"; } @@ -38,7 +38,7 @@ index 23f97cd..ef6d88d 100644 if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { int len, nums; -@@ -143,6 +151,8 @@ int ras_arm_event_handler(struct trace_seq *s, +@@ -141,6 +149,8 @@ int ras_arm_event_handler(struct trace_seq *s, if (!ev.error_info) return -1; ev.length = len; -- Gitee From 439e07927e2756433842064808157d6c2b1fb45a Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 15:49:58 +0800 Subject: [PATCH 09/15] modify according to review --- 0006-add-cpu-online-fault-isolation.patch | 344 ++++++++++++---------- 1 file changed, 182 insertions(+), 162 deletions(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 46c3044..518ea3e 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -1,6 +1,6 @@ -From 4597fe55384ff2ffceca071f203710a0a5e95f14 Mon Sep 17 00:00:00 2001 +From 73853ffd21b312176b30ea95b77b0643cfd5004b Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Mon, 24 May 2021 22:42:42 +0800 +Date: Tue, 25 May 2021 15:38:35 +0800 Subject: [PATCH 1/2] add cpu online fault isolation Add cpu online fault isolation, when CE/UCE occurs, we choose to offline @@ -11,15 +11,15 @@ Signed-off-by: Luo Shengwei .travis.yml | 2 +- Makefile.am | 6 +- configure.ac | 11 + - misc/rasdaemon.env | 15 ++ - queue.c | 120 +++++++++++ - queue.h | 42 ++++ - ras-arm-handler.c | 74 +++++++ - ras-cpu-isolation.c | 491 ++++++++++++++++++++++++++++++++++++++++++++ - ras-cpu-isolation.h | 74 +++++++ - ras-events.c | 8 + + misc/rasdaemon.env | 16 ++ + queue.c | 126 +++++++++++ + queue.h | 43 ++++ + ras-arm-handler.c | 72 +++++++ + ras-cpu-isolation.c | 501 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.h | 76 +++++++ + ras-events.c | 10 + ras-record.h | 5 + - 11 files changed, 846 insertions(+), 2 deletions(-) + 11 files changed, 866 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c @@ -88,16 +88,17 @@ index 2d6c59c..a682bb9 100644 + CPU fault isolation : $USE_CPU_FAULT_ISOLATION EOF diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..7498992 100644 +index 12fd766..ad6a96f 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -27,3 +27,18 @@ PAGE_CE_THRESHOLD="50" +@@ -27,3 +27,19 @@ PAGE_CE_THRESHOLD="50" # soft-then-hard First try to soft offline, then try hard offlining. # Note: default offline choice is "soft". PAGE_CE_ACTION="soft" + +# CPU Online Fault Isolation +# Specify the threshold of corrected errors. ++CPU_ISOLATION_ENABLE="yes" +# +# Format: +# [0-9]+[unit] @@ -112,10 +113,10 @@ index 12fd766..7498992 100644 +CPU_ISOLATION_LIMIT="10" diff --git a/queue.c b/queue.c new file mode 100644 -index 0000000..65ca5f9 +index 0000000..92f3d3c --- /dev/null +++ b/queue.c -@@ -0,0 +1,120 @@ +@@ -0,0 +1,126 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -176,14 +177,22 @@ index 0000000..65ca5f9 + free(tmp); + } + -+ free(queue); ++ queue->head = NULL; ++ queue->tail = NULL; ++ queue->size = 0; +} + -+int push(struct link_queue *queue, struct queue_node *node) -+{ -+ if (!queue || !node) { -+ return -1; ++void free_queue(struct link_queue *queue) { ++ clear_queue(queue); ++ ++ if (queue) { ++ free(queue); + } ++} ++ ++/* It should be guranteed that the param is not NULL */ ++void push(struct link_queue *queue, struct queue_node *node) ++{ + /* there is no element in the queue */ + if (queue->head == NULL) { + queue->head = node; @@ -195,8 +204,6 @@ index 0000000..65ca5f9 + + queue->tail = node; + (queue->size)++; -+ -+ return 0; +} + +int pop(struct link_queue *queue) @@ -238,10 +245,10 @@ index 0000000..65ca5f9 +} diff --git a/queue.h b/queue.h new file mode 100644 -index 0000000..b60aa81 +index 0000000..9684c58 --- /dev/null +++ b/queue.h -@@ -0,0 +1,42 @@ +@@ -0,0 +1,43 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -277,7 +284,8 @@ index 0000000..b60aa81 +int is_empty(struct link_queue *queue); +struct link_queue* init_queue(void); +void clear_queue(struct link_queue *queue); -+int push(struct link_queue *queue, struct queue_node *node); ++void free_queue(struct link_queue *queue); ++void push(struct link_queue *queue, struct queue_node *node); +int pop(struct link_queue *queue); +struct queue_node* front(struct link_queue *queue); +struct queue_node* node_create(time_t time, unsigned value); @@ -286,10 +294,10 @@ index 0000000..b60aa81 +#endif \ No newline at end of file diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 2f170e2..23f97cd 100644 +index 2f170e2..8a0c7af 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c -@@ -20,6 +20,45 @@ +@@ -20,6 +20,43 @@ #include "ras-record.h" #include "ras-logger.h" #include "ras-report.h" @@ -313,21 +321,19 @@ index 2f170e2..23f97cd 100644 + * the length of struct processor error information is 32, the byte + * length of the Flags field is 1, and the byte offset is 7 in the struct. + */ -+ int pei_err_size = 32; -+ int field_size = 1; + int cur_offset = 7; + unsigned long value; + int num = 0; -+ if (len % pei_err_size != 0) { ++ if (len % PEI_ERR_SIZE != 0) { + log(TERM, LOG_ERR, "the event data does not match to the ARM Processor Error Information Structure\n"); + return num; + } + while (cur_offset < len) { -+ value = pevent_read_number(event->pevent, data+cur_offset, field_size); ++ value = pevent_read_number(event->pevent, data+cur_offset, FLAGS_SIZE); + if (is_core_failure(value)) { + num++; + } -+ cur_offset += pei_err_size; ++ cur_offset += PEI_ERR_SIZE; + } + return num; +} @@ -335,7 +341,7 @@ index 2f170e2..23f97cd 100644 int ras_arm_event_handler(struct trace_seq *s, struct pevent_record *record, -@@ -78,6 +117,41 @@ int ras_arm_event_handler(struct trace_seq *s, +@@ -78,6 +115,41 @@ int ras_arm_event_handler(struct trace_seq *s, ev.psci_state = val; trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); @@ -379,10 +385,10 @@ index 2f170e2..23f97cd 100644 ras_store_arm_record(ras, &ev); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..aa24f4f +index 0000000..8be4c08 --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,491 @@ +@@ -0,0 +1,501 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -409,22 +415,22 @@ index 0000000..aa24f4f +#include "ras-cpu-isolation.h" + +static struct cpu_info *cpu_infos = NULL; -+static unsigned int ncores, cpu_nums, cores_per_socket, cores_per_die; ++static unsigned int ncores, cores_per_socket, cores_per_die; +static unsigned int sockets, dies = 1; +static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; +static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; +static const char *node_path = "/sys/devices/system/node/possible"; + +static const struct param normal_units[] = { -+ { "", 1 }, ++ { "", 1 }, + {} +}; + +static const struct param cycle_units[] = { -+ { "d", 24 * 60 * 60 }, -+ { "h", 60 * 60 }, -+ { "m", 60 }, -+ { "s", 1 }, ++ { "d", 24 * 60 * 60 }, ++ { "h", 60 * 60 }, ++ { "m", 60 }, ++ { "s", 1 }, + {} +}; + @@ -437,9 +443,7 @@ index 0000000..aa24f4f + +static struct isolation_param cpu_limit = { + .name = "CPU_ISOLATION_LIMIT", -+ .units = normal_units, -+ .value = 10, -+ .limit = 30 ++ .units = normal_units +}; + +static struct isolation_param cycle = { @@ -471,16 +475,16 @@ index 0000000..aa24f4f + return fd; +} + -+static void get_sockets(void) ++static int get_sockets(void) +{ + int fd, j; + char buf[MAX_BUF_LEN] = ""; + cores_per_socket = ncores; -+ struct cpu_set *cpu_sets = calloc(sizeof(*cpu_sets), ncores); ++ struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores); + + if (!cpu_sets) { + log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__); -+ return; ++ return -1; + } + + for (int i = 0; i < ncores; ++i) { @@ -506,10 +510,12 @@ index 0000000..aa24f4f + } + + free(cpu_sets); -+ cores_per_socket = (sockets > 0 ? ncores / sockets : ncores); ++ cores_per_socket = sockets > 0 ? ncores / sockets : ncores; ++ ++ return 0; +} + -+static void get_dies(void) ++static int get_dies(void) +{ + int fd, begin, end; + char buf[20] = ""; @@ -517,7 +523,7 @@ index 0000000..aa24f4f + fd = open(node_path, O_RDONLY); + + if (fd == -1) { -+ return; ++ return -1; + } + + if (read(fd, buf, sizeof(buf))) { @@ -528,38 +534,70 @@ index 0000000..aa24f4f + + close(fd); + cores_per_die = ncores / dies; ++ ++ return 0; ++} ++ ++static int get_cpu_status(unsigned cpu) ++{ ++ int fd; ++ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); ++ ++ if (fd == -1) { ++ return CPU_UNKNOWN; ++ } ++ ++ int num; ++ ++ if (read(fd, &num, 1) <= 0) { ++ num = CPU_UNKNOWN; ++ } ++ else { ++ num = num - '0'; ++ } ++ ++ close(fd); ++ ++ return num; +} + -+static void init_cpu_info(unsigned int cpus) ++static int init_cpu_info(unsigned int cpus) +{ -+ ncores = sysconf(_SC_NPROCESSORS_CONF); -+ cpu_nums = cpus; -+ cpu_infos = calloc(sizeof(*cpu_infos), cpus); ++ ncores = cpus; ++ cpu_infos = (struct cpu_info *) malloc(sizeof(*cpu_infos) * cpus); + + if (!cpu_infos) { + log(TERM, LOG_ERR, "Failed to allocate memory for cpu infos in %s.\n", __func__); -+ return; ++ return -1; + } + + for (unsigned int i = 0; i < cpus; ++i) { -+ cpu_infos[i].state = CPU_ONLINE; ++ cpu_infos[i].state = get_cpu_status(i); + cpu_infos[i].ce_queue = init_queue(); ++ if (cpu_infos[i].ce_queue == NULL) { ++ log(TERM, LOG_ERR, "Failed to allocate memory for cpu ce queue in %s.\n", __func__); ++ return -1; ++ } + } + /* set limit of offlined cpu limit according to number of cpu */ + cpu_limit.limit = cpus / 3; -+ get_sockets(); -+ get_dies(); ++ cpu_limit.value = cpus / 3; ++ ++ if (get_sockets() < 0 || get_dies() < 0) { ++ log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n"); ++ return -1; ++ } ++ ++ return 0; +} + +static void check_config(struct isolation_param *config) +{ -+ if (config->value >= config->limit) { ++ if (config->value > config->limit) { + log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", + config->value, config->limit); + config->value = config->limit; + } -+ -+ return; +} + +static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value) @@ -626,47 +664,39 @@ index 0000000..aa24f4f + check_config(config); +} + -+void ras_error_count_init(unsigned cpus) ++static int check_config_status(void) ++{ ++ char *env = getenv("CPU_ISOLATION_ENABLE"); ++ ++ if (env == NULL || strcasecmp(env, "yes")) { ++ log(TERM, LOG_ERR, "Config of cpu fault isolation is not on, exit\n"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++int ras_error_count_init(unsigned cpus) +{ -+ init_cpu_info(cpus); ++ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { ++ return -1; ++ } ++ + init_config(&threshold); + init_config(&cpu_limit); + init_config(&cycle); ++ ++ return 0; +} + +void cpu_infos_free(void) +{ + if (cpu_infos) { -+ for (int i = 0; i < cpu_nums; ++i) { -+ clear_queue(cpu_infos[i].ce_queue); ++ for (int i = 0; i < ncores; ++i) { ++ free_queue(cpu_infos[i].ce_queue); + } + free(cpu_infos); + } -+ -+ return; -+} -+ -+static int get_cpu_status(unsigned cpu) -+{ -+ int fd; -+ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); -+ -+ if (fd == -1) { -+ return CPU_UNKNOWN; -+ } -+ -+ int num = 0; -+ -+ if (read(fd, &num, 1) <= 0) { -+ num = CPU_UNKNOWN; -+ } -+ else { -+ num = num - '0'; -+ } -+ -+ close(fd); -+ -+ return num; +} + +static int do_cpu_offline(unsigned cpu) @@ -682,34 +712,28 @@ index 0000000..aa24f4f + + strcpy(buf, "0"); + rc = write(fd, buf, strlen(buf)); -+ close(fd); + -+ if (rc < 0) { ++ if (rc < 0) { + log(TERM, LOG_ERR, "cpu%d offline failed, errno:%d\n", cpu, errno); ++ close(fd); + return HANDLE_FAILED; + } ++ ++ close(fd); + /* check wthether the cpu is isolated successfully */ ++ cpu_infos[cpu].state = get_cpu_status(cpu); ++ ++ if (cpu_infos[cpu].state == CPU_OFFLINE) { ++ return HANDLE_SUCCEED; ++ } + else { -+ int num = 0; -+ num = get_cpu_status(cpu); -+ if (num == CPU_OFFLINE) { -+ return HANDLE_SUCCEED; -+ } -+ else { -+ return HANDLE_FAILED; -+ } -+ ++ return HANDLE_FAILED; + } +} + -+static void do_ce_handler(unsigned cpu, int *ret) ++static int do_ce_handler(unsigned cpu) +{ + struct link_queue *queue = cpu_infos[cpu].ce_queue; -+ -+ if (queue == NULL) { -+ return; -+ } -+ + unsigned tmp; + /* + * Since we just count all error numbers in setted cycle, we store the time @@ -718,41 +742,47 @@ index 0000000..aa24f4f + * exceeds setted cycle, we pop the beginning time and error until the period + * from new beginning time to ending time is less than cycle. + */ -+ while (queue && queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { ++ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { + tmp = queue->head->value; + if (pop(queue) == 0) { + cpu_infos[cpu].ce_nums -= tmp; + } + } + -+ if (cpu_infos[cpu].ce_nums >= threshold.value) { -+ log(TERM, LOG_INFO, "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", ++ if (cpu_infos[cpu].ce_nums >= threshold.value) { ++ log(TERM, LOG_INFO, "Corrected Errors exceeded threshold %ld, try to offline cpu%d\n", + threshold.value, cpu); -+ *ret = do_cpu_offline(cpu); -+ } ++ return do_cpu_offline(cpu); ++ } ++ return HANDLE_NOTHING; +} + -+static void do_uce_handler(unsigned cpu, int *ret) ++static int do_uce_handler(unsigned cpu) +{ + if (cpu_infos[cpu].uce_nums > 0) { + log(TERM, LOG_INFO, "Uncorrected Errors occured, try to offline cpu%d\n", cpu); -+ *ret = do_cpu_offline(cpu); ++ return do_cpu_offline(cpu); + } ++ return HANDLE_NOTHING; +} + -+static void error_handler(unsigned cpu, struct error_info *err_info, int *ret) ++static int error_handler(unsigned cpu, struct error_info *err_info) +{ ++ int ret = HANDLE_NOTHING; ++ + switch (err_info->err_type) + { + case CE: -+ do_ce_handler(cpu, ret); ++ ret = do_ce_handler(cpu); + break; + case UCE: -+ do_uce_handler(cpu, ret); ++ ret = do_uce_handler(cpu); + break; + default: + break; + } ++ ++ return ret; +} + +static void record_error_info(unsigned cpu, struct error_info *err_info) @@ -763,18 +793,11 @@ index 0000000..aa24f4f + { + struct queue_node *node = NULL; + node = node_create(err_info->time, err_info->nums); -+ /* if the queue is still NULL, try malloc again */ -+ if (cpu_infos[cpu].ce_queue == NULL) { -+ cpu_infos[cpu].ce_queue = init_queue(); -+ } -+ if (push(cpu_infos[cpu].ce_queue, node) < 0) { -+ /* when the queue is NULL and node is not NULL, free it */ -+ if (node != NULL) { -+ free(node); -+ } -+ log(TERM, LOG_ERR, "Fail to push node to queue\n"); ++ if (node == NULL) { ++ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); + return; + } ++ push(cpu_infos[cpu].ce_queue, node); + cpu_infos[cpu].ce_nums += err_info->nums; + break; + } @@ -786,12 +809,8 @@ index 0000000..aa24f4f + } +} + -+static unsigned long get_bit_value(int64_t value, int offset, int size) ++static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size) +{ -+ if (size <= 0 || offset < 0) { -+ return 0; -+ } -+ + value >>= offset; + unsigned long res = 0; + int i = 0; @@ -807,6 +826,7 @@ index 0000000..aa24f4f +{ + unsigned core_id, socket_id, die_id, cpu; + /* ++ * Adapt to certain BIOS + * In the MPIDR: + * bit 8:15: core id + * bit 19:20: die_id @@ -822,64 +842,60 @@ index 0000000..aa24f4f + +void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr) +{ -+ int cur_cpu_status; + unsigned cpu; -+ int ret = HANDLE_NOTHING; ++ int ret; + + if (!cpu_infos) { ++ log(TERM, LOG_ERR, "Since the cpu_infos which record cpu information is NULL, stop here\n"); + return; + } + + cpu = get_cpu_index(mpidr); + -+ if (cpu >= cpu_nums) { ++ if (cpu >= ncores) { ++ log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); + return; + } + + log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); ++ cpu_infos[cpu].state = get_cpu_status(cpu); + -+ if (cpu_infos[cpu].state == CPU_OFFLINE) { -+ /* user may online the offlined cpu */ -+ cur_cpu_status = get_cpu_status(cpu); -+ if (cur_cpu_status != CPU_ONLINE) { -+ log(TERM, LOG_INFO, "cpu%d is already offlined, ignore\n", cpu); -+ return; -+ } -+ cpu_infos[cpu].state = CPU_ONLINE; ++ if (cpu_infos[cpu].state != CPU_ONLINE) { ++ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu); ++ return; + } + + record_error_info(cpu, err_info); + /* Since user may change cpu state, we get current offlined cpu numbers every recording time. */ -+ if (cpu_nums - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { ++ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { + log(TERM, LOG_WARNING, "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", + cpu_limit.value); + return; + } + -+ error_handler(cpu, err_info, &ret); -+ /* do nothing */ ++ ret = error_handler(cpu, err_info); ++ + if (ret == HANDLE_NOTHING) { -+ return; ++ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); + } -+ -+ if (ret == HANDLE_SUCCEED) { -+ cpu_infos[cpu].state = CPU_OFFLINE; -+ struct link_queue *tmp = cpu_infos[cpu].ce_queue; -+ clear_queue(tmp); -+ cpu_infos[cpu].ce_queue = init_queue(); ++ else if (ret == HANDLE_SUCCEED) { ++ log(TERM, LOG_INFO, "Offline cpu %d succeed, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); ++ clear_queue(cpu_infos[cpu].ce_queue); ++ } ++ else { ++ log(TERM, LOG_INFO, "Offline cpu %d fail, the state is %s\n", ++ cpu, cpu_state[cpu_infos[cpu].state]); + } -+ -+ log(TERM, LOG_INFO, "Result of offlining cpu %d: %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); + + return; +} diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h new file mode 100644 -index 0000000..c8dec50 +index 0000000..9f19d01 --- /dev/null +++ b/ras-cpu-isolation.h -@@ -0,0 +1,74 @@ +@@ -0,0 +1,76 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -901,6 +917,8 @@ index 0000000..c8dec50 + +#define MAX_PATH_LEN 100 +#define MAX_BUF_LEN 1024 ++#define PEI_ERR_SIZE 32 ++#define FLAGS_SIZE 1 + +struct param { + char *name; @@ -949,14 +967,14 @@ index 0000000..c8dec50 + char buf[MAX_BUF_LEN]; +}; + -+void ras_error_count_init(unsigned cpus); ++int ras_error_count_init(unsigned cpus); +void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); +void cpu_infos_free(void); + +#endif \ No newline at end of file diff --git a/ras-events.c b/ras-events.c -index 471d25d..31c4170 100644 +index 471d25d..2609c7b 100644 --- a/ras-events.c +++ b/ras-events.c @@ -40,6 +40,7 @@ @@ -967,18 +985,20 @@ index 471d25d..31c4170 100644 /* * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -874,6 +875,10 @@ int handle_ras_events(int record_events) +@@ -874,6 +875,12 @@ int handle_ras_events(int record_events) cpus = get_num_cpus(ras); +#ifdef HAVE_CPU_FAULT_ISOLATION -+ ras_error_count_init(cpus); ++ if (ras_error_count_init(cpus) < 0) { ++ goto err; ++ } +#endif + #ifdef HAVE_MCE rc = register_mce_handler(ras, cpus); if (rc) -@@ -990,6 +995,9 @@ err: +@@ -990,6 +997,9 @@ err: } free(ras); } -- Gitee From 25d9849af1f24b71c85f14107d26a22f9a307401 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 16:35:48 +0800 Subject: [PATCH 10/15] add decription to env config --- 0006-add-cpu-online-fault-isolation.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 518ea3e..5eafe53 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -97,7 +97,7 @@ index 12fd766..ad6a96f 100644 PAGE_CE_ACTION="soft" + +# CPU Online Fault Isolation -+# Specify the threshold of corrected errors. ++# whether to enable cpu online fault isolation (yes|no). +CPU_ISOLATION_ENABLE="yes" +# +# Format: -- Gitee From ff0192450152b3db302b802ac6fd4e3bbe53938f Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 16:57:13 +0800 Subject: [PATCH 11/15] remove else in do_cpu_offline function --- 0006-add-cpu-online-fault-isolation.patch | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 5eafe53..6fdf48c 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -726,9 +726,9 @@ index 0000000..8be4c08 + if (cpu_infos[cpu].state == CPU_OFFLINE) { + return HANDLE_SUCCEED; + } -+ else { -+ return HANDLE_FAILED; -+ } ++ ++ return HANDLE_FAILED; ++ +} + +static int do_ce_handler(unsigned cpu) -- Gitee From 1cb431979a99ca4c08f8c131fb8d9755dc96ef2f Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 17:48:02 +0800 Subject: [PATCH 12/15] remove default enable-cpu-fault-isolation and add description in changelog --- rasdaemon.spec | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rasdaemon.spec b/rasdaemon.spec index 39bc449..0a1ccb9 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -44,7 +44,7 @@ autoheader libtoolize --automake --copy --debug --force automake --add-missing %ifarch %{arm} aarch64 -%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-cpu-fault-isolation +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm %else %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror %endif @@ -73,7 +73,8 @@ rm INSTALL %{buildroot}/usr/include/*.h %changelog * Fri May 21 2021 luoshengwei - 0.6.6-4 -- add cpu online fault isolation +- add cpu online fault isolation, user can enable this function +- by configure --enable-cpu-fault-isolation * Sat May 15 2021 xujing<17826839720@163.com> - 0.6.6-3 - Type:bugfix -- Gitee From ca38f8a0b987ba18cd21295257a537b103aa04a4 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 18:55:27 +0800 Subject: [PATCH 13/15] add ce/uce description --- 0006-add-cpu-online-fault-isolation.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 6fdf48c..d0fce79 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -349,7 +349,7 @@ index 2f170e2..8a0c7af 100644 + /* record cpu error */ + if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) + return -1; -+ /* error severity defined from include/acpi/ghes.h */ ++ /* refer to UEFI_2_9_2021_03_18 specification chapter N2.2 Table N-5 */ + switch (val) { + case GHES_SEV_NO: + ev.severity = "Informational"; -- Gitee From ab846ddbc838e7910dcd86c022d1c832921898c4 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 20:14:05 +0800 Subject: [PATCH 14/15] modify according to review v2 --- 0006-add-cpu-online-fault-isolation.patch | 59 +++++++++---------- ...add-trace-print-and-add-sqlite-store.patch | 6 +- rasdaemon.spec | 2 +- 3 files changed, 31 insertions(+), 36 deletions(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index d0fce79..4484052 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -1,6 +1,6 @@ -From 73853ffd21b312176b30ea95b77b0643cfd5004b Mon Sep 17 00:00:00 2001 +From 94f9581a6b398f178fcabf0fde2cce7eebb15ea7 Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Tue, 25 May 2021 15:38:35 +0800 +Date: Tue, 25 May 2021 20:05:49 +0800 Subject: [PATCH 1/2] add cpu online fault isolation Add cpu online fault isolation, when CE/UCE occurs, we choose to offline @@ -11,15 +11,15 @@ Signed-off-by: Luo Shengwei .travis.yml | 2 +- Makefile.am | 6 +- configure.ac | 11 + - misc/rasdaemon.env | 16 ++ + misc/rasdaemon.env | 17 ++ queue.c | 126 +++++++++++ queue.h | 43 ++++ ras-arm-handler.c | 72 +++++++ - ras-cpu-isolation.c | 501 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.c | 497 ++++++++++++++++++++++++++++++++++++++++++++ ras-cpu-isolation.h | 76 +++++++ - ras-events.c | 10 + + ras-events.c | 8 + ras-record.h | 5 + - 11 files changed, 866 insertions(+), 2 deletions(-) + 11 files changed, 861 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c @@ -88,17 +88,18 @@ index 2d6c59c..a682bb9 100644 + CPU fault isolation : $USE_CPU_FAULT_ISOLATION EOF diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..ad6a96f 100644 +index 12fd766..3191d03 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -27,3 +27,19 @@ PAGE_CE_THRESHOLD="50" +@@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50" # soft-then-hard First try to soft offline, then try hard offlining. # Note: default offline choice is "soft". PAGE_CE_ACTION="soft" + +# CPU Online Fault Isolation -+# whether to enable cpu online fault isolation (yes|no). -+CPU_ISOLATION_ENABLE="yes" ++# Whether to enable cpu online fault isolation (yes|no). ++CPU_ISOLATION_ENABLE="no" ++# Specify the threshold of CE numbers. +# +# Format: +# [0-9]+[unit] @@ -294,7 +295,7 @@ index 0000000..9684c58 +#endif \ No newline at end of file diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 2f170e2..8a0c7af 100644 +index 2f170e2..f9baa51 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -20,6 +20,43 @@ @@ -385,10 +386,10 @@ index 2f170e2..8a0c7af 100644 ras_store_arm_record(ras, &ev); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..8be4c08 +index 0000000..153c9b4 --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,501 @@ +@@ -0,0 +1,497 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -417,6 +418,7 @@ index 0000000..8be4c08 +static struct cpu_info *cpu_infos = NULL; +static unsigned int ncores, cores_per_socket, cores_per_die; +static unsigned int sockets, dies = 1; ++static unsigned int enabled = 1; +static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; +static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; +static const char *node_path = "/sys/devices/system/node/possible"; @@ -540,15 +542,13 @@ index 0000000..8be4c08 + +static int get_cpu_status(unsigned cpu) +{ -+ int fd; ++ int fd, num; + fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); + + if (fd == -1) { + return CPU_UNKNOWN; + } + -+ int num; -+ + if (read(fd, &num, 1) <= 0) { + num = CPU_UNKNOWN; + } @@ -669,24 +669,22 @@ index 0000000..8be4c08 + char *env = getenv("CPU_ISOLATION_ENABLE"); + + if (env == NULL || strcasecmp(env, "yes")) { -+ log(TERM, LOG_ERR, "Config of cpu fault isolation is not on, exit\n"); ++ log(TERM, LOG_ERR, "Config of cpu fault isolation is not on\n"); + return -1; + } + + return 0; +} + -+int ras_error_count_init(unsigned cpus) ++void ras_error_count_init(unsigned cpus) +{ + if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { -+ return -1; ++ enabled = 0; + } + + init_config(&threshold); + init_config(&cpu_limit); + init_config(&cycle); -+ -+ return 0; +} + +void cpu_infos_free(void) @@ -728,7 +726,6 @@ index 0000000..8be4c08 + } + + return HANDLE_FAILED; -+ +} + +static int do_ce_handler(unsigned cpu) @@ -845,8 +842,8 @@ index 0000000..8be4c08 + unsigned cpu; + int ret; + -+ if (!cpu_infos) { -+ log(TERM, LOG_ERR, "Since the cpu_infos which record cpu information is NULL, stop here\n"); ++ if (enabled == 0) { ++ log(TERM, LOG_INFO, "The cpu fault isolation is disabled, return\n"); + return; + } + @@ -892,7 +889,7 @@ index 0000000..8be4c08 +} diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h new file mode 100644 -index 0000000..9f19d01 +index 0000000..a7d3fdb --- /dev/null +++ b/ras-cpu-isolation.h @@ -0,0 +1,76 @@ @@ -967,14 +964,14 @@ index 0000000..9f19d01 + char buf[MAX_BUF_LEN]; +}; + -+int ras_error_count_init(unsigned cpus); ++void ras_error_count_init(unsigned cpus); +void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); +void cpu_infos_free(void); + +#endif \ No newline at end of file diff --git a/ras-events.c b/ras-events.c -index 471d25d..2609c7b 100644 +index 471d25d..31c4170 100644 --- a/ras-events.c +++ b/ras-events.c @@ -40,6 +40,7 @@ @@ -985,20 +982,18 @@ index 471d25d..2609c7b 100644 /* * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -874,6 +875,12 @@ int handle_ras_events(int record_events) +@@ -874,6 +875,10 @@ int handle_ras_events(int record_events) cpus = get_num_cpus(ras); +#ifdef HAVE_CPU_FAULT_ISOLATION -+ if (ras_error_count_init(cpus) < 0) { -+ goto err; -+ } ++ ras_error_count_init(cpus); +#endif + #ifdef HAVE_MCE rc = register_mce_handler(ras, cpus); if (rc) -@@ -990,6 +997,9 @@ err: +@@ -990,6 +995,9 @@ err: } free(ras); } diff --git a/0007-add-trace-print-and-add-sqlite-store.patch b/0007-add-trace-print-and-add-sqlite-store.patch index cd415f3..e78d0b7 100644 --- a/0007-add-trace-print-and-add-sqlite-store.patch +++ b/0007-add-trace-print-and-add-sqlite-store.patch @@ -1,6 +1,6 @@ -From 0afe0eb14aefc9ae544e1a0fe4d2559fd804aec0 Mon Sep 17 00:00:00 2001 +From 57640072aead2e00037749d66f05fc26e3fe3071 Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Tue, 25 May 2021 15:39:05 +0800 +Date: Tue, 25 May 2021 20:07:26 +0800 Subject: [PATCH 2/2] add trace print of new information and add it to sqilte Since we add new information of the event, we add trace print and store it to @@ -13,7 +13,7 @@ Signed-off-by: Luo Shengwei 2 files changed, 18 insertions(+) diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 8a0c7af..b4bcc6e 100644 +index f9baa51..fd5c541 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -23,6 +23,13 @@ diff --git a/rasdaemon.spec b/rasdaemon.spec index 0a1ccb9..0e10558 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -44,7 +44,7 @@ autoheader libtoolize --automake --copy --debug --force automake --add-missing %ifarch %{arm} aarch64 -%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm %else %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror %endif -- Gitee From 69666cca89938334bad54cb6c98e667ce017b5c4 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Tue, 25 May 2021 16:35:48 +0800 Subject: [PATCH 15/15] add decription to env config remove else in do_cpu_offline function remove default enable-cpu-fault-isolation and add description in changelog add ce/uce description modify according to review v2 --- 0006-add-cpu-online-fault-isolation.patch | 65 +++++++++---------- ...add-trace-print-and-add-sqlite-store.patch | 6 +- rasdaemon.spec | 5 +- 3 files changed, 36 insertions(+), 40 deletions(-) diff --git a/0006-add-cpu-online-fault-isolation.patch b/0006-add-cpu-online-fault-isolation.patch index 518ea3e..4484052 100644 --- a/0006-add-cpu-online-fault-isolation.patch +++ b/0006-add-cpu-online-fault-isolation.patch @@ -1,6 +1,6 @@ -From 73853ffd21b312176b30ea95b77b0643cfd5004b Mon Sep 17 00:00:00 2001 +From 94f9581a6b398f178fcabf0fde2cce7eebb15ea7 Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Tue, 25 May 2021 15:38:35 +0800 +Date: Tue, 25 May 2021 20:05:49 +0800 Subject: [PATCH 1/2] add cpu online fault isolation Add cpu online fault isolation, when CE/UCE occurs, we choose to offline @@ -11,15 +11,15 @@ Signed-off-by: Luo Shengwei .travis.yml | 2 +- Makefile.am | 6 +- configure.ac | 11 + - misc/rasdaemon.env | 16 ++ + misc/rasdaemon.env | 17 ++ queue.c | 126 +++++++++++ queue.h | 43 ++++ ras-arm-handler.c | 72 +++++++ - ras-cpu-isolation.c | 501 ++++++++++++++++++++++++++++++++++++++++++++ + ras-cpu-isolation.c | 497 ++++++++++++++++++++++++++++++++++++++++++++ ras-cpu-isolation.h | 76 +++++++ - ras-events.c | 10 + + ras-events.c | 8 + ras-record.h | 5 + - 11 files changed, 866 insertions(+), 2 deletions(-) + 11 files changed, 861 insertions(+), 2 deletions(-) create mode 100644 queue.c create mode 100644 queue.h create mode 100644 ras-cpu-isolation.c @@ -88,17 +88,18 @@ index 2d6c59c..a682bb9 100644 + CPU fault isolation : $USE_CPU_FAULT_ISOLATION EOF diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..ad6a96f 100644 +index 12fd766..3191d03 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -27,3 +27,19 @@ PAGE_CE_THRESHOLD="50" +@@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50" # soft-then-hard First try to soft offline, then try hard offlining. # Note: default offline choice is "soft". PAGE_CE_ACTION="soft" + +# CPU Online Fault Isolation -+# Specify the threshold of corrected errors. -+CPU_ISOLATION_ENABLE="yes" ++# Whether to enable cpu online fault isolation (yes|no). ++CPU_ISOLATION_ENABLE="no" ++# Specify the threshold of CE numbers. +# +# Format: +# [0-9]+[unit] @@ -294,7 +295,7 @@ index 0000000..9684c58 +#endif \ No newline at end of file diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 2f170e2..8a0c7af 100644 +index 2f170e2..f9baa51 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -20,6 +20,43 @@ @@ -349,7 +350,7 @@ index 2f170e2..8a0c7af 100644 + /* record cpu error */ + if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) + return -1; -+ /* error severity defined from include/acpi/ghes.h */ ++ /* refer to UEFI_2_9_2021_03_18 specification chapter N2.2 Table N-5 */ + switch (val) { + case GHES_SEV_NO: + ev.severity = "Informational"; @@ -385,10 +386,10 @@ index 2f170e2..8a0c7af 100644 ras_store_arm_record(ras, &ev); diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c new file mode 100644 -index 0000000..8be4c08 +index 0000000..153c9b4 --- /dev/null +++ b/ras-cpu-isolation.c -@@ -0,0 +1,501 @@ +@@ -0,0 +1,497 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * @@ -417,6 +418,7 @@ index 0000000..8be4c08 +static struct cpu_info *cpu_infos = NULL; +static unsigned int ncores, cores_per_socket, cores_per_die; +static unsigned int sockets, dies = 1; ++static unsigned int enabled = 1; +static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; +static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; +static const char *node_path = "/sys/devices/system/node/possible"; @@ -540,15 +542,13 @@ index 0000000..8be4c08 + +static int get_cpu_status(unsigned cpu) +{ -+ int fd; ++ int fd, num; + fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); + + if (fd == -1) { + return CPU_UNKNOWN; + } + -+ int num; -+ + if (read(fd, &num, 1) <= 0) { + num = CPU_UNKNOWN; + } @@ -669,24 +669,22 @@ index 0000000..8be4c08 + char *env = getenv("CPU_ISOLATION_ENABLE"); + + if (env == NULL || strcasecmp(env, "yes")) { -+ log(TERM, LOG_ERR, "Config of cpu fault isolation is not on, exit\n"); ++ log(TERM, LOG_ERR, "Config of cpu fault isolation is not on\n"); + return -1; + } + + return 0; +} + -+int ras_error_count_init(unsigned cpus) ++void ras_error_count_init(unsigned cpus) +{ + if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { -+ return -1; ++ enabled = 0; + } + + init_config(&threshold); + init_config(&cpu_limit); + init_config(&cycle); -+ -+ return 0; +} + +void cpu_infos_free(void) @@ -726,9 +724,8 @@ index 0000000..8be4c08 + if (cpu_infos[cpu].state == CPU_OFFLINE) { + return HANDLE_SUCCEED; + } -+ else { -+ return HANDLE_FAILED; -+ } ++ ++ return HANDLE_FAILED; +} + +static int do_ce_handler(unsigned cpu) @@ -845,8 +842,8 @@ index 0000000..8be4c08 + unsigned cpu; + int ret; + -+ if (!cpu_infos) { -+ log(TERM, LOG_ERR, "Since the cpu_infos which record cpu information is NULL, stop here\n"); ++ if (enabled == 0) { ++ log(TERM, LOG_INFO, "The cpu fault isolation is disabled, return\n"); + return; + } + @@ -892,7 +889,7 @@ index 0000000..8be4c08 +} diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h new file mode 100644 -index 0000000..9f19d01 +index 0000000..a7d3fdb --- /dev/null +++ b/ras-cpu-isolation.h @@ -0,0 +1,76 @@ @@ -967,14 +964,14 @@ index 0000000..9f19d01 + char buf[MAX_BUF_LEN]; +}; + -+int ras_error_count_init(unsigned cpus); ++void ras_error_count_init(unsigned cpus); +void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); +void cpu_infos_free(void); + +#endif \ No newline at end of file diff --git a/ras-events.c b/ras-events.c -index 471d25d..2609c7b 100644 +index 471d25d..31c4170 100644 --- a/ras-events.c +++ b/ras-events.c @@ -40,6 +40,7 @@ @@ -985,20 +982,18 @@ index 471d25d..2609c7b 100644 /* * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -874,6 +875,12 @@ int handle_ras_events(int record_events) +@@ -874,6 +875,10 @@ int handle_ras_events(int record_events) cpus = get_num_cpus(ras); +#ifdef HAVE_CPU_FAULT_ISOLATION -+ if (ras_error_count_init(cpus) < 0) { -+ goto err; -+ } ++ ras_error_count_init(cpus); +#endif + #ifdef HAVE_MCE rc = register_mce_handler(ras, cpus); if (rc) -@@ -990,6 +997,9 @@ err: +@@ -990,6 +995,9 @@ err: } free(ras); } diff --git a/0007-add-trace-print-and-add-sqlite-store.patch b/0007-add-trace-print-and-add-sqlite-store.patch index cd415f3..e78d0b7 100644 --- a/0007-add-trace-print-and-add-sqlite-store.patch +++ b/0007-add-trace-print-and-add-sqlite-store.patch @@ -1,6 +1,6 @@ -From 0afe0eb14aefc9ae544e1a0fe4d2559fd804aec0 Mon Sep 17 00:00:00 2001 +From 57640072aead2e00037749d66f05fc26e3fe3071 Mon Sep 17 00:00:00 2001 From: Lostwayzxc -Date: Tue, 25 May 2021 15:39:05 +0800 +Date: Tue, 25 May 2021 20:07:26 +0800 Subject: [PATCH 2/2] add trace print of new information and add it to sqilte Since we add new information of the event, we add trace print and store it to @@ -13,7 +13,7 @@ Signed-off-by: Luo Shengwei 2 files changed, 18 insertions(+) diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 8a0c7af..b4bcc6e 100644 +index f9baa51..fd5c541 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -23,6 +23,13 @@ diff --git a/rasdaemon.spec b/rasdaemon.spec index 39bc449..0e10558 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -44,7 +44,7 @@ autoheader libtoolize --automake --copy --debug --force automake --add-missing %ifarch %{arm} aarch64 -%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-cpu-fault-isolation +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm %else %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror %endif @@ -73,7 +73,8 @@ rm INSTALL %{buildroot}/usr/include/*.h %changelog * Fri May 21 2021 luoshengwei - 0.6.6-4 -- add cpu online fault isolation +- add cpu online fault isolation, user can enable this function +- by configure --enable-cpu-fault-isolation * Sat May 15 2021 xujing<17826839720@163.com> - 0.6.6-3 - Type:bugfix -- Gitee