diff --git a/bugfix-modify-the-way-counting-cpu-logical-index.patch b/bugfix-modify-the-way-counting-cpu-logical-index.patch new file mode 100644 index 0000000000000000000000000000000000000000..bd6cd441100075474b3a30d3275025f1cbf99511 --- /dev/null +++ b/bugfix-modify-the-way-counting-cpu-logical-index.patch @@ -0,0 +1,234 @@ +From b82767ec717976223134d4e279f874352e7910c9 Mon Sep 17 00:00:00 2001 +From: Lostwayzxc +Date: Wed, 24 Nov 2021 09:43:52 +0800 +Subject: [PATCH] modify the way counting cpu logical index + +It's hard to count cpu logical index according to the mpidr in the userspace, +so the index will be counted in the kernel before reported to userspace now. + +Related patches: +0006-add-cpu-online-fault-isolation.patch +0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch + +--- + ras-arm-handler.c | 8 ++- + ras-cpu-isolation.c | 127 ++------------------------------------------ + ras-cpu-isolation.h | 6 +-- + 3 files changed, 11 insertions(+), 130 deletions(-) + +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index 8a229b4..47f9a57 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -124,6 +124,12 @@ int ras_arm_event_handler(struct trace_seq *s, + trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); + + #ifdef HAVE_CPU_FAULT_ISOLATION ++ int cpu; ++ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0) ++ return -1; ++ cpu = val; ++ trace_seq_printf(s, "\n cpu: %d", cpu); ++ + /* record cpu error */ + if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) + return -1; +@@ -156,7 +162,7 @@ int ras_arm_event_handler(struct trace_seq *s, + nums = count_errors(event, ev.error_info, len); + if (nums > 0) { + struct error_info err_info = {nums, now, val}; +- ras_record_cpu_error(&err_info, ev.mpidr); ++ ras_record_cpu_error(&err_info, cpu); + } + } + #endif +diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c +index b1643c4..bca7e0b 100644 +--- a/ras-cpu-isolation.c ++++ b/ras-cpu-isolation.c +@@ -24,13 +24,9 @@ 
+ #include "ras-cpu-isolation.h" + + static struct cpu_info *cpu_infos = NULL; +-static unsigned int ncores, cores_per_socket, cores_per_die; +-static unsigned int cores_per_cluster = 4; +-static unsigned int sockets, dies = 1; ++static unsigned int ncores; + static unsigned int enabled = 1; + static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; +-static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; +-static const char *node_path = "/sys/devices/system/node/possible"; + + static const struct param normal_units[] = { + { "", 1 }, +@@ -86,69 +82,6 @@ static int open_sys_file(unsigned cpu, int __oflag, const char *format) + return fd; + } + +-static int get_sockets(void) +-{ +- int fd, j; +- char buf[MAX_BUF_LEN] = ""; +- cores_per_socket = ncores; +- struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores); +- +- if (!cpu_sets) { +- log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__); +- return -1; +- } +- +- for (int i = 0; i < ncores; ++i) { +- fd = open_sys_file(i, O_RDONLY, core_siblings_list_path); +- if (fd == -1) { +- continue; +- } +- memset(buf, '\0', strlen(buf)); +- if (read(fd, buf, sizeof(buf)) <= 0) { +- close(fd); +- continue; +- } +- for (j = 0; j < sockets; ++j) { +- if (strcmp(cpu_sets[j].buf, buf) == 0) { +- break; +- } +- } +- if (j == sockets) { +- strcpy(cpu_sets[sockets].buf, buf); +- sockets++; +- } +- close(fd); +- } +- +- free(cpu_sets); +- cores_per_socket = sockets > 0 ? ncores / sockets : ncores; +- +- return 0; +-} +- +-static int get_dies(void) +-{ +- int fd, begin, end; +- char buf[20] = ""; +- cores_per_die = ncores; +- fd = open(node_path, O_RDONLY); +- +- if (fd == -1) { +- return -1; +- } +- +- if (read(fd, buf, sizeof(buf))) { +- if (sscanf(buf, "%d-%d", &begin, &end) == 2) { +- dies = end > begin ? 
end - begin + 1 : 1; +- } +- } +- +- close(fd); +- cores_per_die = ncores / dies; +- +- return 0; +-} +- + static int get_cpu_status(unsigned cpu) + { + int fd, num; +@@ -190,11 +123,6 @@ static int init_cpu_info(unsigned cpus) + cpu_limit.limit = cpus - 1; + cpu_limit.value = 0; + +- if (get_sockets() < 0 || get_dies() < 0) { +- log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n"); +- return -1; +- } +- + return 0; + } + +@@ -418,64 +346,15 @@ static void record_error_info(unsigned cpu, struct error_info *err_info) + } + } + +-static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size) ++void ras_record_cpu_error(struct error_info *err_info, int cpu) + { +- value >>= offset; +- unsigned long res = 0; +- int i = 0; +- +- while (i < size) { +- res |= (value & (0x1 << (i++))); +- } +- +- return res; +-} +- +-static unsigned get_cpu_index(int64_t mpidr) +-{ +- unsigned core_id, cluster_id, socket_id, die_id, cpu; +- /* +- * Adapt to certain BIOS +- * In the MPIDR: +- * bit 8:15: core id +- * bit 16:18: cluster id +- * bit 19:20: die_id +- * bit 21:22: socket_id +- */ +- core_id = get_bit_value(mpidr, 8, 8); +- cluster_id = get_bit_value(mpidr, 16, 3); +- socket_id = get_bit_value(mpidr, 21, 2); +- die_id = get_bit_value(mpidr, 19, 2); +- +- /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3, +- * it means TotemB. When cores per die equal to cores per socket, it means +- * that there is only one die in the socket, in case that the only die is +- * TotemB in CPU 1620s, we set die id to 0 directly. +- */ +- if (cores_per_die == cores_per_socket) { +- die_id = 0; +- } +- else { +- die_id = (die_id == 1 ? 
0:1); +- } +- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die + +- cluster_id * cores_per_cluster; +- +- return cpu; +-} +- +-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr) +-{ +- unsigned cpu; + int ret; + + if (enabled == 0) { + return; + } + +- cpu = get_cpu_index(mpidr); +- +- if (cpu >= ncores) { ++ if (cpu >= ncores || cpu < 0) { + log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); + return; + } +diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h +index a7d3fdb..95dedc1 100644 +--- a/ras-cpu-isolation.h ++++ b/ras-cpu-isolation.h +@@ -65,12 +65,8 @@ struct error_info { + enum error_type err_type; + }; + +-struct cpu_set { +- char buf[MAX_BUF_LEN]; +-}; +- + void ras_error_count_init(unsigned cpus); +-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); ++void ras_record_cpu_error(struct error_info *err_info, int cpu); + void cpu_infos_free(void); + + #endif +\ No newline at end of file +-- +2.27.0 + diff --git a/rasdaemon.spec b/rasdaemon.spec index a4924ec670d91c02c1a58bbf21c24f7e5a359d6f..81843b03c842fa2a1ef8984dab0e03b42ef0fcc4 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.6 -Release: 6 +Release: 7 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -35,6 +35,7 @@ Patch13: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch Patch14: 0006-add-cpu-online-fault-isolation.patch Patch15: 0007-add-trace-print-and-add-sqlite-store.patch Patch16: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch +Patch17: bugfix-modify-the-way-counting-cpu-logical-index.patch %description The rasdaemon program is a daemon which monitors the platform @@ -81,6 +82,13 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || : 
%changelog +* Wed Dec 1 2021 luoshengwei - 0.6.6-7 +- Type:bugfix +- ID:NA +- SUG:NA +- DESC: Since the CPU logical index is now computed in the kernel, remove +- the related code from rasdaemon. + * Wed Oct 27 2021 luoshengwei - 0.6.6-6 - Type:feature - ID:NA