diff --git a/0021-rasdaemon-add-rbtree-support-for-page-record.patch b/0021-rasdaemon-add-rbtree-support-for-page-record.patch deleted file mode 100644 index 4a76a899096c15ab6614fa872d76f344b8970caa..0000000000000000000000000000000000000000 --- a/0021-rasdaemon-add-rbtree-support-for-page-record.patch +++ /dev/null @@ -1,584 +0,0 @@ -From 27794f4a5ff1453490bbcd805ad8e5b54516f015 Mon Sep 17 00:00:00 2001 -From: wuyun -Date: Sat, 20 Jun 2020 20:26:21 +0800 -Subject: [PATCH] rasdaemon: add rbtree support for page record - -commit 5fd96f457262052f7d06435af8a49689ffb6ffcf upstream - -The rbtree is very efficient for recording and querying fault page info. - -Signed-off-by: wuyun -Signed-off-by: lvying6 -Signed-off-by: Mauro Carvalho Chehab -Signed-off-by: Bixuan Cui ---- - rbtree.c | 384 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - rbtree.h | 165 +++++++++++++++++++++++++++ - 2 files changed, 549 insertions(+) - create mode 100644 rbtree.c - create mode 100644 rbtree.h - -diff --git a/rbtree.c b/rbtree.c -new file mode 100644 -index 0000000..d9b1bd4 ---- /dev/null -+++ b/rbtree.c -@@ -0,0 +1,384 @@ -+/* -+ Red Black Trees -+ (C) 1999 Andrea Arcangeli -+ (C) 2002 David Woodhouse -+ Taken from the Linux 2.6.30 source with some minor modificatons. -+ -+ This program is free software; you can redistribute it and/or modify -+ it under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 2 of the License, or -+ (at your option) any later version. -+ -+ This program is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU General Public License for more details. 
-+ -+ You should have received a copy of the GNU General Public License -+ along with this program; if not, write to the Free Software -+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ -+ linux/lib/rbtree.c -+*/ -+ -+#include "rbtree.h" -+ -+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) -+{ -+ struct rb_node *right = node->rb_right; -+ struct rb_node *parent = rb_parent(node); -+ -+ if ((node->rb_right = right->rb_left)) -+ rb_set_parent(right->rb_left, node); -+ right->rb_left = node; -+ -+ rb_set_parent(right, parent); -+ -+ if (parent) -+ { -+ if (node == parent->rb_left) -+ parent->rb_left = right; -+ else -+ parent->rb_right = right; -+ } -+ else -+ root->rb_node = right; -+ rb_set_parent(node, right); -+} -+ -+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) -+{ -+ struct rb_node *left = node->rb_left; -+ struct rb_node *parent = rb_parent(node); -+ -+ if ((node->rb_left = left->rb_right)) -+ rb_set_parent(left->rb_right, node); -+ left->rb_right = node; -+ -+ rb_set_parent(left, parent); -+ -+ if (parent) -+ { -+ if (node == parent->rb_right) -+ parent->rb_right = left; -+ else -+ parent->rb_left = left; -+ } -+ else -+ root->rb_node = left; -+ rb_set_parent(node, left); -+} -+ -+void rb_insert_color(struct rb_node *node, struct rb_root *root) -+{ -+ struct rb_node *parent, *gparent; -+ -+ while ((parent = rb_parent(node)) && rb_is_red(parent)) -+ { -+ gparent = rb_parent(parent); -+ -+ if (parent == gparent->rb_left) -+ { -+ { -+ register struct rb_node *uncle = gparent->rb_right; -+ if (uncle && rb_is_red(uncle)) -+ { -+ rb_set_black(uncle); -+ rb_set_black(parent); -+ rb_set_red(gparent); -+ node = gparent; -+ continue; -+ } -+ } -+ -+ if (parent->rb_right == node) -+ { -+ struct rb_node *tmp; -+ __rb_rotate_left(parent, root); -+ tmp = parent; -+ parent = node; -+ node = tmp; -+ } -+ -+ rb_set_black(parent); -+ rb_set_red(gparent); -+ __rb_rotate_right(gparent, root); -+ } 
else { -+ { -+ struct rb_node *uncle = gparent->rb_left; -+ if (uncle && rb_is_red(uncle)) -+ { -+ rb_set_black(uncle); -+ rb_set_black(parent); -+ rb_set_red(gparent); -+ node = gparent; -+ continue; -+ } -+ } -+ -+ if (parent->rb_left == node) -+ { -+ struct rb_node *tmp; -+ __rb_rotate_right(parent, root); -+ tmp = parent; -+ parent = node; -+ node = tmp; -+ } -+ -+ rb_set_black(parent); -+ rb_set_red(gparent); -+ __rb_rotate_left(gparent, root); -+ } -+ } -+ -+ rb_set_black(root->rb_node); -+} -+ -+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, -+ struct rb_root *root) -+{ -+ struct rb_node *other; -+ -+ while ((!node || rb_is_black(node)) && node != root->rb_node) -+ { -+ if (parent->rb_left == node) -+ { -+ other = parent->rb_right; -+ if (rb_is_red(other)) -+ { -+ rb_set_black(other); -+ rb_set_red(parent); -+ __rb_rotate_left(parent, root); -+ other = parent->rb_right; -+ } -+ if ((!other->rb_left || rb_is_black(other->rb_left)) && -+ (!other->rb_right || rb_is_black(other->rb_right))) -+ { -+ rb_set_red(other); -+ node = parent; -+ parent = rb_parent(node); -+ } -+ else -+ { -+ if (!other->rb_right || rb_is_black(other->rb_right)) -+ { -+ rb_set_black(other->rb_left); -+ rb_set_red(other); -+ __rb_rotate_right(other, root); -+ other = parent->rb_right; -+ } -+ rb_set_color(other, rb_color(parent)); -+ rb_set_black(parent); -+ rb_set_black(other->rb_right); -+ __rb_rotate_left(parent, root); -+ node = root->rb_node; -+ break; -+ } -+ } -+ else -+ { -+ other = parent->rb_left; -+ if (rb_is_red(other)) -+ { -+ rb_set_black(other); -+ rb_set_red(parent); -+ __rb_rotate_right(parent, root); -+ other = parent->rb_left; -+ } -+ if ((!other->rb_left || rb_is_black(other->rb_left)) && -+ (!other->rb_right || rb_is_black(other->rb_right))) -+ { -+ rb_set_red(other); -+ node = parent; -+ parent = rb_parent(node); -+ } -+ else -+ { -+ if (!other->rb_left || rb_is_black(other->rb_left)) -+ { -+ rb_set_black(other->rb_right); -+ 
rb_set_red(other); -+ __rb_rotate_left(other, root); -+ other = parent->rb_left; -+ } -+ rb_set_color(other, rb_color(parent)); -+ rb_set_black(parent); -+ rb_set_black(other->rb_left); -+ __rb_rotate_right(parent, root); -+ node = root->rb_node; -+ break; -+ } -+ } -+ } -+ if (node) -+ rb_set_black(node); -+} -+ -+void rb_erase(struct rb_node *node, struct rb_root *root) -+{ -+ struct rb_node *child, *parent; -+ int color; -+ -+ if (!node->rb_left) -+ child = node->rb_right; -+ else if (!node->rb_right) -+ child = node->rb_left; -+ else -+ { -+ struct rb_node *old = node, *left; -+ -+ node = node->rb_right; -+ while ((left = node->rb_left) != NULL) -+ node = left; -+ child = node->rb_right; -+ parent = rb_parent(node); -+ color = rb_color(node); -+ -+ if (child) -+ rb_set_parent(child, parent); -+ if (parent == old) { -+ parent->rb_right = child; -+ parent = node; -+ } else -+ parent->rb_left = child; -+ -+ node->rb_parent_color = old->rb_parent_color; -+ node->rb_right = old->rb_right; -+ node->rb_left = old->rb_left; -+ -+ if (rb_parent(old)) -+ { -+ if (rb_parent(old)->rb_left == old) -+ rb_parent(old)->rb_left = node; -+ else -+ rb_parent(old)->rb_right = node; -+ } else -+ root->rb_node = node; -+ -+ rb_set_parent(old->rb_left, node); -+ if (old->rb_right) -+ rb_set_parent(old->rb_right, node); -+ goto color; -+ } -+ -+ parent = rb_parent(node); -+ color = rb_color(node); -+ -+ if (child) -+ rb_set_parent(child, parent); -+ if (parent) -+ { -+ if (parent->rb_left == node) -+ parent->rb_left = child; -+ else -+ parent->rb_right = child; -+ } -+ else -+ root->rb_node = child; -+ -+ color: -+ if (color == RB_BLACK) -+ __rb_erase_color(child, parent, root); -+} -+ -+/* -+ * This function returns the first node (in sort order) of the tree. 
-+ */ -+struct rb_node *rb_first(const struct rb_root *root) -+{ -+ struct rb_node *n; -+ -+ n = root->rb_node; -+ if (!n) -+ return NULL; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+} -+ -+struct rb_node *rb_last(const struct rb_root *root) -+{ -+ struct rb_node *n; -+ -+ n = root->rb_node; -+ if (!n) -+ return NULL; -+ while (n->rb_right) -+ n = n->rb_right; -+ return n; -+} -+ -+struct rb_node *rb_next(const struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+ if (rb_parent(node) == node) -+ return NULL; -+ -+ /* If we have a right-hand child, go down and then left as far -+ as we can. */ -+ if (node->rb_right) { -+ node = node->rb_right; -+ while (node->rb_left) -+ node=node->rb_left; -+ return (struct rb_node *)node; -+ } -+ -+ /* No right-hand children. Everything down and left is -+ smaller than us, so any 'next' node must be in the general -+ direction of our parent. Go up the tree; any time the -+ ancestor is a right-hand child of its parent, keep going -+ up. First time it's a left-hand child of its parent, said -+ parent is our 'next' node. */ -+ while ((parent = rb_parent(node)) && node == parent->rb_right) -+ node = parent; -+ -+ return parent; -+} -+ -+struct rb_node *rb_prev(const struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+ if (rb_parent(node) == node) -+ return NULL; -+ -+ /* If we have a left-hand child, go down and then right as far -+ as we can. */ -+ if (node->rb_left) { -+ node = node->rb_left; -+ while (node->rb_right) -+ node=node->rb_right; -+ return (struct rb_node *)node; -+ } -+ -+ /* No left-hand children. 
Go up till we find an ancestor which -+ is a right-hand child of its parent */ -+ while ((parent = rb_parent(node)) && node == parent->rb_left) -+ node = parent; -+ -+ return parent; -+} -+ -+void rb_replace_node(struct rb_node *victim, struct rb_node *new, -+ struct rb_root *root) -+{ -+ struct rb_node *parent = rb_parent(victim); -+ -+ /* Set the surrounding nodes to point to the replacement */ -+ if (parent) { -+ if (victim == parent->rb_left) -+ parent->rb_left = new; -+ else -+ parent->rb_right = new; -+ } else { -+ root->rb_node = new; -+ } -+ if (victim->rb_left) -+ rb_set_parent(victim->rb_left, new); -+ if (victim->rb_right) -+ rb_set_parent(victim->rb_right, new); -+ -+ /* Copy the pointers/colour from the victim to the replacement */ -+ *new = *victim; -+} -diff --git a/rbtree.h b/rbtree.h -new file mode 100644 -index 0000000..a8a0459 ---- /dev/null -+++ b/rbtree.h -@@ -0,0 +1,165 @@ -+/* -+ Red Black Trees -+ (C) 1999 Andrea Arcangeli -+ Taken from the Linux 2.6.30 source. -+ -+ This program is free software; you can redistribute it and/or modify -+ it under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 2 of the License, or -+ (at your option) any later version. -+ -+ This program is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with this program; if not, write to the Free Software -+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ -+ linux/include/linux/rbtree.h -+ -+ To use rbtrees you'll have to implement your own insert and search cores. -+ This will avoid us to use callbacks and to drop drammatically performances. 
-+ I know it's not the cleaner way, but in C (not in C++) to get -+ performances and genericity... -+ -+ Some example of insert and search follows here. The search is a plain -+ normal search over an ordered tree. The insert instead must be implemented -+ int two steps: as first thing the code must insert the element in -+ order as a red leaf in the tree, then the support library function -+ rb_insert_color() must be called. Such function will do the -+ not trivial work to rebalance the rbtree if necessary. -+ -+----------------------------------------------------------------------- -+static inline struct page * rb_search_page_cache(struct inode * inode, -+ unsigned long offset) -+{ -+ struct rb_node * n = inode->i_rb_page_cache.rb_node; -+ struct page * page; -+ -+ while (n) -+ { -+ page = rb_entry(n, struct page, rb_page_cache); -+ -+ if (offset < page->offset) -+ n = n->rb_left; -+ else if (offset > page->offset) -+ n = n->rb_right; -+ else -+ return page; -+ } -+ return NULL; -+} -+ -+static inline struct page * __rb_insert_page_cache(struct inode * inode, -+ unsigned long offset, -+ struct rb_node * node) -+{ -+ struct rb_node ** p = &inode->i_rb_page_cache.rb_node; -+ struct rb_node * parent = NULL; -+ struct page * page; -+ -+ while (*p) -+ { -+ parent = *p; -+ page = rb_entry(parent, struct page, rb_page_cache); -+ -+ if (offset < page->offset) -+ p = &(*p)->rb_left; -+ else if (offset > page->offset) -+ p = &(*p)->rb_right; -+ else -+ return page; -+ } -+ -+ rb_link_node(node, parent, p); -+ -+ return NULL; -+} -+ -+static inline struct page * rb_insert_page_cache(struct inode * inode, -+ unsigned long offset, -+ struct rb_node * node) -+{ -+ struct page * ret; -+ if ((ret = __rb_insert_page_cache(inode, offset, node))) -+ goto out; -+ rb_insert_color(node, &inode->i_rb_page_cache); -+ out: -+ return ret; -+} -+----------------------------------------------------------------------- -+*/ -+ -+#ifndef _LINUX_RBTREE_H -+#define _LINUX_RBTREE_H -+ -+#include 
-+ -+#define container_of(ptr, type, member) ({ \ -+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ -+ (type *)( (char *)__mptr - offsetof(type,member) );}) -+ -+struct rb_node -+{ -+ unsigned long rb_parent_color; -+#define RB_RED 0 -+#define RB_BLACK 1 -+ struct rb_node *rb_right; -+ struct rb_node *rb_left; -+} __attribute__((aligned(sizeof(long)))); -+ /* The alignment might seem pointless, but allegedly CRIS needs it */ -+ -+struct rb_root -+{ -+ struct rb_node *rb_node; -+}; -+ -+ -+#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) -+#define rb_color(r) ((r)->rb_parent_color & 1) -+#define rb_is_red(r) (!rb_color(r)) -+#define rb_is_black(r) rb_color(r) -+#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) -+#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) -+ -+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -+{ -+ rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; -+} -+static inline void rb_set_color(struct rb_node *rb, int color) -+{ -+ rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; -+} -+ -+#define RB_ROOT (struct rb_root) { NULL, } -+#define rb_entry(ptr, type, member) container_of(ptr, type, member) -+ -+#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -+#define RB_EMPTY_NODE(node) (rb_parent(node) == node) -+#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) -+ -+extern void rb_insert_color(struct rb_node *, struct rb_root *); -+extern void rb_erase(struct rb_node *, struct rb_root *); -+ -+/* Find logical next and previous nodes in a tree */ -+extern struct rb_node *rb_next(const struct rb_node *); -+extern struct rb_node *rb_prev(const struct rb_node *); -+extern struct rb_node *rb_first(const struct rb_root *); -+extern struct rb_node *rb_last(const struct rb_root *); -+ -+/* Fast replacement of a single node without remove/rebalance/add/rebalance */ -+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, -+ 
struct rb_root *root); -+ -+static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, -+ struct rb_node ** rb_link) -+{ -+ node->rb_parent_color = (unsigned long )parent; -+ node->rb_left = node->rb_right = NULL; -+ -+ *rb_link = node; -+} -+ -+#endif /* _LINUX_RBTREE_H */ --- -1.8.3.1 - diff --git a/0022-rasdaemon-add-support-for-memory-Corrected-Error-pre.patch b/0022-rasdaemon-add-support-for-memory-Corrected-Error-pre.patch deleted file mode 100644 index 531865f9bb7d68138fb61c07b50ce997fce24181..0000000000000000000000000000000000000000 --- a/0022-rasdaemon-add-support-for-memory-Corrected-Error-pre.patch +++ /dev/null @@ -1,646 +0,0 @@ -From c62d0466b0e69ac8c724c9d917000f18aa147aae Mon Sep 17 00:00:00 2001 -From: wuyun -Date: Sat, 20 Jun 2020 20:26:22 +0800 -Subject: [PATCH] rasdaemon: add support for memory Corrected Error predictive failure analysis - -commit 9ae6b70effb8adc9572debc800b8e16173f74bb8 upstream - -Memory Corrected Error was corrected by hardware. These errors do not -require immediate software actions, but are still reported for -accounting and predictive failure analysis. - -Based on statistical results, some actions can be taken to prevent -Corrected Error from evoluting to Uncorrected Error. 
- -Signed-off-by: wuyun -Signed-off-by: lvying6 -Signed-off-by: Mauro Carvalho Chehab -Signed-off-by: Bixuan Cui ---- - Makefile.am | 7 +- - configure.ac | 12 ++ - man/rasdaemon.1.in | 7 + - misc/rasdaemon.env | 29 ++++ - misc/rasdaemon.service.in | 1 + - misc/rasdaemon.spec.in | 4 +- - ras-events.c | 6 + - ras-mc-handler.c | 7 + - ras-page-isolation.c | 332 ++++++++++++++++++++++++++++++++++++++ - ras-page-isolation.h | 66 ++++++++ - 10 files changed, 468 insertions(+), 3 deletions(-) - create mode 100644 misc/rasdaemon.env - create mode 100644 ras-page-isolation.c - create mode 100644 ras-page-isolation.h - -diff --git a/Makefile.am b/Makefile.am -index fccdeba..dc30ae7 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -1,6 +1,6 @@ - ACLOCAL_AMFLAGS=-I m4 - SUBDIRS = libtrace util man --SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in -+SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in misc/rasdaemon.env - SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) - EXTRA_DIST = $(SYSTEMD_SERVICES_IN) - -@@ -51,13 +51,16 @@ endif - if WITH_HISI_NS_DECODE - rasdaemon_SOURCES += non-standard-hisi_hip07.c - endif -+if WITH_MEMORY_CE_PFA -+ rasdaemon_SOURCES += rbtree.c ras-page-isolation.c -+endif - - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ -- ras-memory-failure-handler.h -+ ras-memory-failure-handler.h rbtree.h ras-page-isolation.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that -diff --git a/configure.ac b/configure.ac -index 8be33d9..1f95459 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -108,6 +108,17 @@ AS_IF([test "x$enable_hisi_ns_decode" = "xyes"], [ - ]) - 
AM_CONDITIONAL([WITH_HISI_NS_DECODE], [test x$enable_hisi_ns_decode = xyes]) - -+AC_ARG_ENABLE([memory_ce_pfa], -+ AS_HELP_STRING([--enable-memory-ce-pfa], [enable memory Corrected Error predictive failure analysis])) -+ -+AS_IF([test "x$enable_memory_ce_pfa" = "xyes"], [ -+ AC_DEFINE(HAVE_MEMORY_CE_PFA,1,"have memory corrected error predictive failure analysis") -+ AC_SUBST([WITH_MEMORY_CE_PFA]) -+]) -+AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes]) -+AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"]) -+ -+ - test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc - - CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" -@@ -138,4 +149,5 @@ compile time options summary - HIP07 SAS HW errors : $enable_hisi_ns_decode - ARM events : $enable_arm - Memory Failure : $USE_MEMORY_FAILURE -+ Memory CE PFA : $enable_memory_ce_pfa - EOF -diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in -index 834df16..833c8e1 100644 ---- a/man/rasdaemon.1.in -+++ b/man/rasdaemon.1.in -@@ -62,6 +62,13 @@ feature. - .BI "--version" - Print the program version and exit. - -+.SH CONFIG FILE -+ -+The \fBrasdaemon\fR program supports a config file to set rasdaemon systemd service -+environment variables. By default the config file is read from /etc/sysconfig/rasdaemon. -+ -+The general format is environmentname=value. -+ - .SH SEE ALSO - \fBras-mc-ctl\fR(8) - -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -new file mode 100644 -index 0000000..12fd766 ---- /dev/null -+++ b/misc/rasdaemon.env -@@ -0,0 +1,29 @@ -+# Page Isolation -+# Note: Run-time configuration is unsupported, service restart needed. -+# Note: this file should be installed at /etc/sysconfig/rasdaemon -+ -+# Specify the threshold of isolating buggy pages. -+# -+# Format: -+# [0-9]+[unit] -+# Notice: please make sure match this format, rasdaemon will use default value for exception input cases. 
-+# -+# Supported units: -+# PAGE_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default is in hour -+# PAGE_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none -+# -+# The two configs will only take no effect when PAGE_CE_ACTION is "off". -+PAGE_CE_REFRESH_CYCLE="24h" -+PAGE_CE_THRESHOLD="50" -+ -+# Specify the internal action in rasdaemon to exceeding a page error threshold. -+# -+# off no action -+# account only account errors -+# soft try to soft-offline page without killing any processes -+# This requires an uptodate kernel. Might not be successfull. -+# hard try to hard-offline page by killing processes -+# Requires an uptodate kernel. Might not be successfull. -+# soft-then-hard First try to soft offline, then try hard offlining. -+# Note: default offline choice is "soft". -+PAGE_CE_ACTION="soft" -diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in -index be9ad5a..e73a08a 100644 ---- a/misc/rasdaemon.service.in -+++ b/misc/rasdaemon.service.in -@@ -3,6 +3,7 @@ Description=RAS daemon to log the RAS events - After=syslog.target - - [Service] -+EnvironmentFile=/etc/sysconfig/rasdaemon - ExecStart=@sbindir@/rasdaemon -f -r - ExecStartPost=@sbindir@/rasdaemon --enable - ExecStop=@sbindir@/rasdaemon --disable -diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in -index 82fae30..f5faffe 100644 ---- a/misc/rasdaemon.spec.in -+++ b/misc/rasdaemon.spec.in -@@ -36,12 +36,13 @@ an utility for reporting current error counts from the EDAC sysfs files. 
- %setup -q - - %build --%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm -+%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-ce-pfa - - make %{?_smp_mflags} - - %install - make install DESTDIR=%{buildroot} -+install -D -p -m 0644 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} - install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service - install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service - rm INSTALL %{buildroot}/usr/include/*.h -@@ -54,6 +55,7 @@ rm INSTALL %{buildroot}/usr/include/*.h - %{_unitdir}/*.service - %{_sharedstatedir}/rasdaemon - %{_sysconfdir}/ras/dimm_labels.d -+%config(noreplace) %{_sysconfdir}/sysconfig/%{name} - - %changelog - -diff --git a/ras-events.c b/ras-events.c -index 27ac1ab..5113c32 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -36,6 +36,7 @@ - #include "ras-memory-failure-handler.h" - #include "ras-record.h" - #include "ras-logger.h" -+#include "ras-page-isolation.h" - - /* - * Polling time, if read() doesn't block. 
Currently, trace_pipe_raw never -@@ -673,6 +674,11 @@ int handle_ras_events(int record_events) - ras->page_size = page_size; - ras->record_events = record_events; - -+#ifdef HAVE_MEMORY_CE_PFA -+ /* FIXME: enable memory isolation unconditionally */ -+ ras_page_account_init(); -+#endif -+ - rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event", - ras_mc_event_handler); - if (!rc) -diff --git a/ras-mc-handler.c b/ras-mc-handler.c -index deb7e05..42b05cd 100644 ---- a/ras-mc-handler.c -+++ b/ras-mc-handler.c -@@ -23,6 +23,7 @@ - #include "ras-mc-handler.h" - #include "ras-record.h" - #include "ras-logger.h" -+#include "ras-page-isolation.h" - #include "ras-report.h" - - int ras_mc_event_handler(struct trace_seq *s, -@@ -183,6 +184,12 @@ int ras_mc_event_handler(struct trace_seq *s, - - ras_store_mc_event(ras, &ev); - -+#ifdef HAVE_MEMORY_CE_PFA -+ /* Account page corrected errors */ -+ if (!strcmp(ev.error_type, "Corrected")) -+ ras_record_page_error(ev.address, ev.error_count, now); -+#endif -+ - #ifdef HAVE_ABRT_REPORT - /* Report event to ABRT */ - ras_report_mc_event(ras, &ev); -diff --git a/ras-page-isolation.c b/ras-page-isolation.c -new file mode 100644 -index 0000000..50e4406 ---- /dev/null -+++ b/ras-page-isolation.c -@@ -0,0 +1,332 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include "ras-logger.h" -+#include "ras-page-isolation.h" -+ -+#define PARSED_ENV_LEN 50 -+static const struct config threshold_units[] = { -+ { "m", 1000 }, -+ { "k", 1000 }, -+ { "", 1 }, -+ {} -+}; -+ -+static const struct config cycle_units[] = { -+ { "d", 24 }, -+ { "h", 60 }, -+ { "m", 60 }, -+ { "s", 1 }, -+ {} -+}; -+ -+static struct isolation threshold = { -+ .name = "PAGE_CE_THRESHOLD", -+ .units = threshold_units, -+ .env = "50", -+ .unit = "", -+}; -+ -+static struct isolation cycle = { -+ .name = "PAGE_CE_REFRESH_CYCLE", -+ .units = cycle_units, -+ .env = "24h", -+ .unit = "h", -+}; -+ -+static const char *kernel_offline[] = { -+ [OFFLINE_SOFT] = "/sys/devices/system/memory/soft_offline_page", -+ [OFFLINE_HARD] = "/sys/devices/system/memory/hard_offline_page", -+ [OFFLINE_SOFT_THEN_HARD] = "/sys/devices/system/memory/soft_offline_page", -+}; -+ -+static const struct config offline_choice[] = { -+ { "off", OFFLINE_OFF }, -+ { "account", OFFLINE_ACCOUNT }, -+ { "soft", OFFLINE_SOFT }, -+ { "hard", OFFLINE_HARD }, -+ { "soft-then-hard", OFFLINE_SOFT_THEN_HARD }, -+ {} -+}; -+ -+static const char *page_state[] = { -+ [PAGE_ONLINE] = "online", -+ [PAGE_OFFLINE] = "offlined", -+ [PAGE_OFFLINE_FAILED] = "offline-failed", -+}; -+ -+static enum otype offline = OFFLINE_SOFT; -+static struct rb_root page_records; -+ -+static void page_offline_init(void) -+{ -+ const char *env = "PAGE_CE_ACTION"; -+ char *choice = getenv(env); -+ const struct config *c = NULL; -+ int matched = 0; -+ -+ if (choice) { -+ for (c = offline_choice; c->name; c++) { -+ if (!strcasecmp(choice, c->name)) { -+ offline = c->val; -+ matched = 1; -+ break; -+ } -+ } -+ } -+ -+ if (!matched) -+ log(TERM, LOG_INFO, "Improper %s, set to default soft\n", env); -+ -+ if (offline > OFFLINE_ACCOUNT && access(kernel_offline[offline], W_OK)) { -+ log(TERM, LOG_INFO, "Kernel does not support page offline interface\n"); -+ offline = 
OFFLINE_ACCOUNT; -+ } -+ -+ log(TERM, LOG_INFO, "Page offline choice on Corrected Errors is %s\n", -+ offline_choice[offline].name); -+} -+ -+static void parse_isolation_env(struct isolation *config) -+{ -+ char *env = getenv(config->name); -+ char *unit = NULL; -+ const struct config *units = NULL; -+ int i, no_unit; -+ int valid = 0; -+ int unit_matched = 0; -+ unsigned long value, tmp; -+ -+ /* check if env is vaild */ -+ if (env && strlen(env)) { -+ /* All the character before unit must be digit */ -+ for (i = 0; i < strlen(env) - 1; i++) { -+ if (!isdigit(env[i])) -+ goto parse; -+ } -+ if (sscanf(env, "%lu", &value) < 1 || !value) -+ goto parse; -+ /* check if the unit is vaild */ -+ unit = env + strlen(env) - 1; -+ /* no unit, all the character are value character */ -+ if (isdigit(*unit)) { -+ valid = 1; -+ no_unit = 1; -+ goto parse; -+ } -+ for (units = config->units; units->name; units++) { -+ /* value character and unit character are both valid */ -+ if (!strcasecmp(unit, units->name)) { -+ valid = 1; -+ no_unit = 0; -+ break; -+ } -+ } -+ } -+ -+parse: -+ /* if invalid, use default env */ -+ if (valid) { -+ config->env = env; -+ if (!no_unit) -+ config->unit = unit; -+ } else { -+ log(TERM, LOG_INFO, "Improper %s, set to default %s.\n", -+ config->name, config->env); -+ } -+ -+ /* if env value string is greater than ulong_max, truncate the last digit */ -+ sscanf(config->env, "%lu", &value); -+ for (units = config->units; units->name; units++) { -+ if (!strcasecmp(config->unit, units->name)) -+ unit_matched = 1; -+ if (unit_matched) { -+ tmp = value; -+ value *= units->val; -+ if (tmp != 0 && value / tmp != units->val) -+ config->overflow = true; -+ } -+ } -+ config->val = value; -+ /* In order to output value and unit perfectly */ -+ config->unit = no_unit ? 
config->unit : ""; -+} -+ -+static void parse_env_string(struct isolation *config, char *str) -+{ -+ int i; -+ -+ if (config->overflow) { -+ /* when overflow, use basic unit */ -+ for (i = 0; config->units[i].name; i++) ; -+ sprintf(str, "%lu%s", config->val, config->units[i-1].name); -+ log(TERM, LOG_INFO, "%s is set overflow(%s), truncate it\n", -+ config->name, config->env); -+ } else { -+ sprintf(str, "%s%s", config->env, config->unit); -+ } -+} -+ -+static void page_isolation_init(void) -+{ -+ char threshold_string[PARSED_ENV_LEN]; -+ char cycle_string[PARSED_ENV_LEN]; -+ /** -+ * It's unnecessary to parse threshold configuration when offline -+ * choice is off. -+ */ -+ if (offline == OFFLINE_OFF) -+ return; -+ -+ parse_isolation_env(&threshold); -+ parse_isolation_env(&cycle); -+ parse_env_string(&threshold, threshold_string); -+ parse_env_string(&cycle, cycle_string); -+ log(TERM, LOG_INFO, "Threshold of memory Corrected Errors is %s / %s\n", -+ threshold_string, cycle_string); -+} -+ -+void ras_page_account_init(void) -+{ -+ page_offline_init(); -+ page_isolation_init(); -+} -+ -+static int do_page_offline(unsigned long long addr, enum otype type) -+{ -+ FILE *offline_file; -+ int err; -+ -+ offline_file = fopen(kernel_offline[type], "w"); -+ if (!offline_file) -+ return -1; -+ -+ fprintf(offline_file, "%#llx", addr); -+ err = ferror(offline_file) ? -1 : 0; -+ fclose(offline_file); -+ -+ return err; -+} -+ -+static void page_offline(struct page_record *pr) -+{ -+ unsigned long long addr = pr->addr; -+ int ret; -+ -+ /* Offlining page is not required */ -+ if (offline <= OFFLINE_ACCOUNT) -+ return; -+ -+ /* Ignore offlined pages */ -+ if (pr->offlined != PAGE_ONLINE) -+ return; -+ -+ /* Time to silence this noisy page */ -+ if (offline == OFFLINE_SOFT_THEN_HARD) { -+ ret = do_page_offline(addr, OFFLINE_SOFT); -+ if (ret < 0) -+ ret = do_page_offline(addr, OFFLINE_HARD); -+ } else { -+ ret = do_page_offline(addr, offline); -+ } -+ -+ pr->offlined = ret < 0 ? 
PAGE_OFFLINE_FAILED : PAGE_OFFLINE; -+ -+ log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n", -+ addr, page_state[pr->offlined]); -+} -+ -+static void page_record(struct page_record *pr, unsigned count, time_t time) -+{ -+ unsigned long period = time - pr->start; -+ unsigned long tolerate; -+ -+ if (period >= cycle.val) { -+ /** -+ * Since we don't refresh automatically, it is possible that the period -+ * between two occurences will be longer than the pre-configured refresh cycle. -+ * In this case, we tolerate the frequency of the whole period up to -+ * the pre-configured threshold. -+ */ -+ tolerate = (period / (double)cycle.val) * threshold.val; -+ pr->count -= (tolerate > pr->count) ? pr->count : tolerate; -+ pr->start = time; -+ pr->excess = 0; -+ } -+ -+ pr->count += count; -+ if (pr->count >= threshold.val) { -+ log(TERM, LOG_INFO, "Corrected Errors at %#llx exceeded threshold\n", pr->addr); -+ -+ /** -+ * Backup ce count of current cycle to enable next round, which actually -+ * should never happen if we can disable overflow completely in the same -+ * time unit (but sadly we can't). 
-+ */ -+ pr->excess += pr->count; -+ pr->count = 0; -+ page_offline(pr); -+ } -+} -+ -+static struct page_record *page_lookup_insert(unsigned long long addr) -+{ -+ struct rb_node **entry = &page_records.rb_node; -+ struct rb_node *parent = NULL; -+ struct page_record *pr = NULL, *find = NULL; -+ -+ while (*entry) { -+ parent = *entry; -+ pr = rb_entry(parent, struct page_record, entry); -+ if (addr == pr->addr) { -+ return pr; -+ } else if (addr < pr->addr) { -+ entry = &(*entry)->rb_left; -+ } else { -+ entry = &(*entry)->rb_right; -+ } -+ } -+ -+ find = calloc(1, sizeof(struct page_record)); -+ if (!find) { -+ log(TERM, LOG_ERR, "No memory for page records\n"); -+ return NULL; -+ } -+ -+ find->addr = addr; -+ rb_link_node(&find->entry, parent, entry); -+ rb_insert_color(&find->entry, &page_records); -+ -+ return find; -+} -+ -+void ras_record_page_error(unsigned long long addr, unsigned count, time_t time) -+{ -+ struct page_record *pr = NULL; -+ -+ if (offline == OFFLINE_OFF) -+ return; -+ -+ pr = page_lookup_insert(addr & PAGE_MASK); -+ if (pr) { -+ if (!pr->start) -+ pr->start = time; -+ page_record(pr, count, time); -+ } -+} -diff --git a/ras-page-isolation.h b/ras-page-isolation.h -new file mode 100644 -index 0000000..3d03cef ---- /dev/null -+++ b/ras-page-isolation.h -@@ -0,0 +1,66 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+*/ -+ -+#ifndef __RAS_PAGE_ISOLATION_H -+#define __RAS_PAGE_ISOLATION_H -+ -+#include -+#include -+#include "rbtree.h" -+ -+#define PAGE_SHIFT 12 -+#define PAGE_SIZE (1 << PAGE_SHIFT) -+#define PAGE_MASK (~(PAGE_SIZE-1)) -+ -+struct config { -+ char *name; -+ unsigned long val; -+}; -+ -+enum otype { -+ OFFLINE_OFF, -+ OFFLINE_ACCOUNT, -+ OFFLINE_SOFT, -+ OFFLINE_HARD, -+ OFFLINE_SOFT_THEN_HARD, -+}; -+ -+enum pstate { -+ PAGE_ONLINE, -+ PAGE_OFFLINE, -+ PAGE_OFFLINE_FAILED, -+}; -+ -+struct page_record { -+ struct rb_node entry; -+ unsigned long long addr; -+ time_t start; -+ enum pstate offlined; -+ unsigned long count; -+ unsigned long excess; -+}; -+ -+struct isolation { -+ char *name; -+ char *env; -+ const struct config *units; -+ unsigned long val; -+ bool overflow; -+ char *unit; -+}; -+ -+void ras_page_account_init(void); -+void ras_record_page_error(unsigned long long addr, unsigned count, time_t time); -+ -+#endif --- -2.27.0 - diff --git a/0023-rasdaemon-Add-notification-support-when-page-goes-of.patch b/0023-rasdaemon-Add-notification-support-when-page-goes-of.patch deleted file mode 100644 index 9132d67fba2e2dc2f869b09943f4c298c6207c6f..0000000000000000000000000000000000000000 --- a/0023-rasdaemon-Add-notification-support-when-page-goes-of.patch +++ /dev/null @@ -1,259 +0,0 @@ -From 07c3c72d18e5c7da2109b5afa918966733039f13 Mon Sep 17 00:00:00 2001 -From: Bixuan Cui -Date: Sun, 5 Jun 2022 02:10:24 +0800 -Subject: [PATCH] rasdaemon: Add notification support when page goes offline for Memory Corrected Error - -When the page goes offline, it may affect the user's processes. -The user needs to do some special actions (such as restarting the -process) before or after going offline. - -So add page-ce-offline-pre-notice and page-ce-offline-post-notice -to env file of rasdaemon for notifying the user when doing page -offline. 
- -Signed-off-by: Bixuan Cui ---- - Makefile.am | 2 +- - misc/notices/page-ce-offline-post-notice | 17 +++++ - misc/notices/page-ce-offline-pre-notice | 17 +++++ - misc/rasdaemon.env | 4 ++ - misc/rasdaemon.spec.in | 3 + - ras-page-isolation.c | 90 ++++++++++++++++++++++++ - 6 files changed, 132 insertions(+), 1 deletion(-) - create mode 100755 misc/notices/page-ce-offline-post-notice - create mode 100755 misc/notices/page-ce-offline-pre-notice - -diff --git a/Makefile.am b/Makefile.am -index de76301..701b120 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -1,6 +1,6 @@ - ACLOCAL_AMFLAGS=-I m4 - SUBDIRS = libtrace util man --SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in misc/rasdaemon.env -+SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in misc/rasdaemon.env misc/notices - SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) - EXTRA_DIST = $(SYSTEMD_SERVICES_IN) - -diff --git a/misc/notices/page-ce-offline-post-notice b/misc/notices/page-ce-offline-post-notice -new file mode 100755 -index 0000000..d78b1b0 ---- /dev/null -+++ b/misc/notices/page-ce-offline-post-notice -@@ -0,0 +1,17 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon after a page goes offline. -+ -+cd `dirname $0` -+ -+[ -x ./page-ce-offline-post-notice.local ] && . ./page-ce-offline-post-notice.local $1 -+ -+if [ -d page-ce-offline-post-notice.extern ] -+then -+ ls page-ce-offline-post-notice.extern | -+ while read item -+ do -+ [ -x ./page-ce-offline-post-notice.extern/$item ] && . ./page-ce-offline-post-notice.extern/$item $1 -+ done -+fi -+ -+exit 0 -diff --git a/misc/notices/page-ce-offline-pre-notice b/misc/notices/page-ce-offline-pre-notice -new file mode 100755 -index 0000000..d1038a3 ---- /dev/null -+++ b/misc/notices/page-ce-offline-pre-notice -@@ -0,0 +1,17 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon before a page goes offline. 
-+ -+cd `dirname $0` -+ -+[ -x ./page-ce-offline-pre-notice.local ] && . ./page-ce-offline-pre-notice.local $1 -+ -+if [ -d page-ce-offline-pre-notice.extern ] -+then -+ ls page-ce-offline-pre-notice.extern | -+ while read item -+ do -+ [ -x ./page-ce-offline-pre-notice.extern/$item ] && . ./page-ce-offline-pre-notice.extern/$item $1 -+ done -+fi -+ -+exit 0 -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..713875a 100644 ---- a/misc/rasdaemon.env -+++ b/misc/rasdaemon.env -@@ -27,3 +27,7 @@ PAGE_CE_THRESHOLD="50" - # soft-then-hard First try to soft offline, then try hard offlining. - # Note: default offline choice is "soft". - PAGE_CE_ACTION="soft" -+ -+# Notices script when doing memory offline -+PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" -+PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" -diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in -index eff9794..f690575 100644 ---- a/misc/rasdaemon.spec.in -+++ b/misc/rasdaemon.spec.in -@@ -45,6 +45,8 @@ make install DESTDIR=%{buildroot} - install -D -p -m 0644 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} - install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service - install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service -+install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ -+install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ - rm INSTALL %{buildroot}/usr/include/*.h - - %files -@@ -56,6 +58,7 @@ rm INSTALL %{buildroot}/usr/include/*.h - %{_sharedstatedir}/rasdaemon - %{_sysconfdir}/ras/dimm_labels.d - %config(noreplace) %{_sysconfdir}/sysconfig/%{name} -+%config(noreplace) %{_sysconfdir}/rasdaemon_notices/* - - %changelog - -diff --git a/ras-page-isolation.c b/ras-page-isolation.c -index 50e4406..f4f3bc1 100644 ---- a/ras-page-isolation.c -+++ b/ras-page-isolation.c -@@ -17,9 +17,13 @@ - #include - #include - #include -+#include -+#include -+#include 
- #include "ras-logger.h" - #include "ras-page-isolation.h" - -+#define MAX_PATH_LEN 64 - #define PARSED_ENV_LEN 50 - static const struct config threshold_units[] = { - { "m", 1000 }, -@@ -73,6 +77,8 @@ static const char *page_state[] = { - - static enum otype offline = OFFLINE_SOFT; - static struct rb_root page_records; -+static char pre_notice[MAX_PATH_LEN]; -+static char post_notice[MAX_PATH_LEN]; - - static void page_offline_init(void) - { -@@ -202,16 +208,94 @@ static void page_isolation_init(void) - threshold_string, cycle_string); - } - -+static void page_notice_init(void) -+{ -+ char *notice_root = "/etc/rasdaemon_notices"; -+ char *pre_re = getenv("PAGE_CE_OFFLINE_PRE_NOTICE"); -+ char *post_re = getenv("PAGE_CE_OFFLINE_POST_NOTICE"); -+ -+ if (offline <= OFFLINE_ACCOUNT) -+ return; -+ -+ snprintf(pre_notice, sizeof(pre_notice), "%s/%s", notice_root, pre_re); -+ if (access(pre_notice, R_OK|X_OK) < 0) -+ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", pre_notice); -+ -+ snprintf(post_notice, sizeof(post_notice), "%s/%s", notice_root, post_re); -+ if (access(post_notice, R_OK|X_OK) < 0) -+ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", post_notice); -+} -+ - void ras_page_account_init(void) - { - page_offline_init(); - page_isolation_init(); -+ page_notice_init(); -+} -+ -+static void finish_child(pid_t child, int status) -+{ -+ if (WIFEXITED(status) && WEXITSTATUS(status)) { -+ log(TERM, LOG_INFO, "notice exited with status %d\n", WEXITSTATUS(status)); -+ } else if (WIFSIGNALED(status)) { -+ log(TERM, LOG_INFO,"notice died with signal %s\n", strsignal(WTERMSIG(status))); -+ } -+ -+ return; -+} -+ -+static void __run_notice(char *argv[], char **env) -+{ -+ pid_t child; -+ int status; -+ -+ child = fork(); -+ if (child < 0) { -+ log(TERM, LOG_ERR, "Cannot create process for offline notice"); -+ return; -+ } -+ if (child == 0) { -+ execve(argv[0], argv, env); -+ _exit(127); -+ } -+ else { -+ waitpid(child, &status, 0); -+ finish_child(child, 
status); -+ } -+} -+ -+static void run_notice(char *argv[]) -+{ -+ int MAX_ENV = 20; -+ char *env[MAX_ENV]; -+ int ei = 0; -+ int i; -+ -+ asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin"); -+ env[ei] = NULL; -+ assert(ei < MAX_ENV); -+ -+ __run_notice(argv, env); -+ -+ for (i = 0; i < ei; i++) -+ free(env[i]); - } - - static int do_page_offline(unsigned long long addr, enum otype type) - { - FILE *offline_file; - int err; -+ char *args; -+ char *argv[] = { -+ NULL, -+ NULL, -+ NULL, -+ }; -+ -+ asprintf(&args, "%llu", addr); -+ argv[0] = (char*)&pre_notice; -+ argv[1] = args; -+ run_notice(argv); - - offline_file = fopen(kernel_offline[type], "w"); - if (!offline_file) -@@ -221,6 +305,11 @@ static int do_page_offline(unsigned long long addr, enum otype type) - err = ferror(offline_file) ? -1 : 0; - fclose(offline_file); - -+ argv[0] = (char*)&post_notice; -+ run_notice(argv); -+ -+ free(args); -+ - return err; - } - -@@ -329,4 +418,5 @@ void ras_record_page_error(unsigned long long addr, unsigned count, time_t time) - pr->start = time; - page_record(pr, count, time); - } -+ - } --- -2.27.0 - diff --git a/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch b/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch deleted file mode 100644 index 852eb4fd6e82af46103ce03f3af53fefd4578a45..0000000000000000000000000000000000000000 --- a/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch +++ /dev/null @@ -1,85 +0,0 @@ -commit 0862a096c3a1d0f993703ab3299f1ddfadf53d7f -Author: Shiju Jose -Date: Tue Aug 11 13:31:46 2020 +0100 - - rasdaemon: ras-mc-ctl: Add ARM processor error information - - Add supporting ARM processor error in the ras-mc-ctl tool. 
- - Signed-off-by: Shiju Jose - Signed-off-by: Mauro Carvalho Chehab - ---- - util/ras-mc-ctl.in | 40 ++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 40 insertions(+) - ---- rasdaemon-0.6.1.orig/util/ras-mc-ctl.in 2021-10-06 14:14:25.000440090 -0400 -+++ rasdaemon-0.6.1/util/ras-mc-ctl.in 2021-10-06 14:15:59.995598590 -0400 -@@ -1124,6 +1124,7 @@ sub summary - my ($query, $query_handle, $out); - my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg); - my ($etype, $severity, $etype_string, $severity_string); -+ my ($affinity, $mpidr); - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -@@ -1159,6 +1160,22 @@ sub summary - } - $query_handle->finish; - -+ # ARM processor arm_event errors -+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($affinity, $mpidr, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count errors\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events summary:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; -+ - # extlog errors - $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; - $query_handle = $dbh->prepare($query); -@@ -1202,6 +1219,7 @@ sub errors - my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); - my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location); - my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); -+ my ($error_count, $affinity, $mpidr, $r_state, $psci_state); - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -@@ -1241,6 +1259,28 @@ sub errors - } - 
$query_handle->finish; - -+ # ARM processor arm_event errors -+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "error_count=$error_count, " if ($error_count); -+ $out .= "affinity_level=$affinity, "; -+ $out .= sprintf "mpidr=0x%x, ", $mpidr; -+ $out .= sprintf "running_state=0x%x, ", $r_state; -+ $out .= sprintf "psci_state=0x%x", $psci_state; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; -+ - # Extlog errors - $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; - $query_handle = $dbh->prepare($query); diff --git a/1001-rasdaemon-avoid-multiple-definitions.patch b/1001-rasdaemon-avoid-multiple-definitions.patch deleted file mode 100644 index 109587bbd52f22b1d72102a745d8558398d44d62..0000000000000000000000000000000000000000 --- a/1001-rasdaemon-avoid-multiple-definitions.patch +++ /dev/null @@ -1,24 +0,0 @@ -commit fd982af0a307edc5d3e56011d2e045015b1efd4b -Author: Mauro Carvalho Chehab -Date: Mon Mar 30 01:22:24 2020 +0200 - - ras-record.h: define an external var as such - - Otherwise, newer versions of gcc will produce multiple symbols, - causing link breakages. 
- - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/ras-record.h b/ras-record.h -index 5311c67caf44..0d2a481c23dd 100644 ---- a/ras-record.h -+++ b/ras-record.h -@@ -25,7 +25,7 @@ - - extern long user_hz; - --struct ras_events *ras; -+extern struct ras_events *ras; - - struct ras_mc_event { - char timestamp[64]; diff --git a/16d929b024c31d54a7f8a72eab094376c7be27f5.patch b/16d929b024c31d54a7f8a72eab094376c7be27f5.patch deleted file mode 100644 index ab66f52ad79b592fe6da87bd3a4124ad2106ec5b..0000000000000000000000000000000000000000 --- a/16d929b024c31d54a7f8a72eab094376c7be27f5.patch +++ /dev/null @@ -1,32 +0,0 @@ -commit 16d929b024c31d54a7f8a72eab094376c7be27f5 -Author: Mauro Carvalho Chehab -Date: Wed May 26 10:20:39 2021 +0200 - - Makefile.am: fix build header rules - - non-standard-hisilicon.h was added twice; - ras-memory-failure-handler.h is missing. - - Due to that, the tarball becomes incomplete, causing build - errors. - - While here, also adjust .travis.yml to use --enable-all. 
- - Signed-off-by: Mauro Carvalho Chehab - ---- - Makefile.am | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - ---- a/Makefile.am 2021-10-13 13:27:53.402685179 -0400 -+++ b/Makefile.am 2021-10-13 13:28:11.664525173 -0400 -@@ -54,7 +54,8 @@ rasdaemon_LDADD = -lpthread $(SQLITE3_LI - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ -- ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h -+ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ -+ ras-memory-failure-handler.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that diff --git a/1ff5f3d2a0fcd48add9462567c30fe0e14585fb4.patch b/1ff5f3d2a0fcd48add9462567c30fe0e14585fb4.patch new file mode 100644 index 0000000000000000000000000000000000000000..99a9ba608c6fc6886c53b316064f1565cf78bd73 --- /dev/null +++ b/1ff5f3d2a0fcd48add9462567c30fe0e14585fb4.patch @@ -0,0 +1,32 @@ +commit 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4 +Author: Matt Whitlock +Date: Wed Jun 9 10:25:18 2021 -0400 + + configure.ac: fix SYSCONFDEFDIR default value + + configure.ac was using AC_ARG_WITH incorrectly, yielding a generated configure script like: + + # Check whether --with-sysconfdefdir was given. + if test "${with_sysconfdefdir+set}" = set; then : + withval=$with_sysconfdefdir; SYSCONFDEFDIR=$withval + else + "/etc/sysconfig" + fi + + This commit fixes the default case so that the SYSCONFDEFDIR variable is assigned the value "/etc/sysconfig" rather than trying to execute "/etc/sysconfig" as a command. 
+ + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/configure.ac b/configure.ac +index f7d1947..33b81fe 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -172,7 +172,7 @@ AC_SUBST([RASSTATEDIR]) + AC_ARG_WITH(sysconfdefdir, + AC_HELP_STRING([--with-sysconfdefdir=DIR], [rasdaemon environment file dir]), + [SYSCONFDEFDIR=$withval], +- ["/etc/sysconfig"]) ++ [SYSCONFDEFDIR=/etc/sysconfig]) + AC_SUBST([SYSCONFDEFDIR]) + + AC_DEFINE([RAS_DB_FNAME], ["ras-mc_event.db"], [ras events database]) diff --git a/2290d65b97311dd5736838f1e285355f7f357046.patch b/2290d65b97311dd5736838f1e285355f7f357046.patch deleted file mode 100644 index 0710974a6350130c1b0827b6fa1dfe264d80868d..0000000000000000000000000000000000000000 --- a/2290d65b97311dd5736838f1e285355f7f357046.patch +++ /dev/null @@ -1,538 +0,0 @@ -commit 2290d65b97311dd5736838f1e285355f7f357046 -Author: Shiju Jose -Date: Mon Mar 8 16:57:26 2021 +0000 - - rasdaemon: add support for memory_failure events - - Add support to log the memory_failure kernel trace - events. 
- - Example rasdaemon log and SQLite DB output for the - memory_failure event, - ================================================= - rasdaemon: memory_failure_event store: 0x126ce8f8 - rasdaemon: register inserted at db - <...>-785 [000] 0.000024: memory_failure_event: 2020-10-02 13:27:13 -0400 pfn=0x204000000 page_type=free buddy page action_result=Delayed - - CREATE TABLE memory_failure_event (id INTEGER PRIMARY KEY, timestamp TEXT, pfn TEXT, page_type TEXT, action_result TEXT); - INSERT INTO memory_failure_event VALUES(1,'2020-10-02 13:27:13 -0400','0x204000000','free buddy page','Delayed'); - ================================================== - - Signed-off-by: Shiju Jose - Signed-off-by: Mauro Carvalho Chehab - ---- - Makefile.am | 4 - ras-events.c | 15 +++ - ras-memory-failure-handler.c | 179 +++++++++++++++++++++++++++++++++++++++++++ - ras-memory-failure-handler.h | 25 ++++++ - ras-record.c | 56 +++++++++++++ - ras-record.h | 13 +++ - ras-report.c | 68 ++++++++++++++++ - ras-report.h | 5 - - 8 files changed, 364 insertions(+), 1 deletion(-) - ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ b/ras-memory-failure-handler.c 2021-10-14 16:31:36.840657728 -0400 -@@ -0,0 +1,179 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ */ -+ -+#include -+#include -+#include -+#include "libtrace/kbuffer.h" -+#include "ras-memory-failure-handler.h" -+#include "ras-record.h" -+#include "ras-logger.h" -+#include "ras-report.h" -+ -+/* Memory failure - various types of pages */ -+enum mf_action_page_type { -+ MF_MSG_KERNEL, -+ MF_MSG_KERNEL_HIGH_ORDER, -+ MF_MSG_SLAB, -+ MF_MSG_DIFFERENT_COMPOUND, -+ MF_MSG_POISONED_HUGE, -+ MF_MSG_HUGE, -+ MF_MSG_FREE_HUGE, -+ MF_MSG_NON_PMD_HUGE, -+ MF_MSG_UNMAP_FAILED, -+ MF_MSG_DIRTY_SWAPCACHE, -+ MF_MSG_CLEAN_SWAPCACHE, -+ MF_MSG_DIRTY_MLOCKED_LRU, -+ MF_MSG_CLEAN_MLOCKED_LRU, -+ MF_MSG_DIRTY_UNEVICTABLE_LRU, -+ MF_MSG_CLEAN_UNEVICTABLE_LRU, -+ MF_MSG_DIRTY_LRU, -+ MF_MSG_CLEAN_LRU, -+ MF_MSG_TRUNCATED_LRU, -+ MF_MSG_BUDDY, -+ MF_MSG_BUDDY_2ND, -+ MF_MSG_DAX, -+ MF_MSG_UNSPLIT_THP, -+ MF_MSG_UNKNOWN, -+}; -+ -+/* Action results for various types of pages */ -+enum mf_action_result { -+ MF_IGNORED, /* Error: cannot be handled */ -+ MF_FAILED, /* Error: handling failed */ -+ MF_DELAYED, /* Will be handled later */ -+ MF_RECOVERED, /* Successfully recovered */ -+}; -+ -+/* memory failure page types */ -+static const struct { -+ int type; -+ const char *page_type; -+} mf_page_type[] = { -+ { MF_MSG_KERNEL, "reserved kernel page" }, -+ { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"}, -+ { MF_MSG_SLAB, "kernel slab page"}, -+ { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"}, -+ { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"}, -+ { MF_MSG_HUGE, "huge page"}, -+ { MF_MSG_FREE_HUGE, "free huge page"}, -+ { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"}, -+ { MF_MSG_UNMAP_FAILED, "unmapping failed page"}, -+ { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"}, -+ { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"}, -+ { MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page"}, -+ { MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page"}, -+ { MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page"}, -+ { MF_MSG_CLEAN_UNEVICTABLE_LRU, 
"clean unevictable LRU page"}, -+ { MF_MSG_DIRTY_LRU, "dirty LRU page"}, -+ { MF_MSG_CLEAN_LRU, "clean LRU page"}, -+ { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"}, -+ { MF_MSG_BUDDY, "free buddy page"}, -+ { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"}, -+ { MF_MSG_DAX, "dax page"}, -+ { MF_MSG_UNSPLIT_THP, "unsplit thp"}, -+ { MF_MSG_UNKNOWN, "unknown page"}, -+}; -+ -+/* memory failure action results */ -+static const struct { -+ int result; -+ const char *action_result; -+} mf_action_result[] = { -+ { MF_IGNORED, "Ignored" }, -+ { MF_FAILED, "Failed" }, -+ { MF_DELAYED, "Delayed" }, -+ { MF_RECOVERED, "Recovered" }, -+}; -+ -+static const char *get_page_type(int page_type) -+{ -+ int i; -+ -+ for (i = 0; i < ARRAY_SIZE(mf_page_type); i++) -+ if (mf_page_type[i].type == page_type) -+ return mf_page_type[i].page_type; -+ -+ return "unknown page"; -+} -+ -+static const char *get_action_result(int result) -+{ -+ int i; -+ -+ for (i = 0; i < ARRAY_SIZE(mf_action_result); i++) -+ if (mf_action_result[i].result == result) -+ return mf_action_result[i].action_result; -+ -+ return "unknown"; -+} -+ -+ -+int ras_memory_failure_event_handler(struct trace_seq *s, -+ struct pevent_record *record, -+ struct event_format *event, void *context) -+{ -+ unsigned long long val; -+ struct ras_events *ras = context; -+ time_t now; -+ struct tm *tm; -+ struct ras_mf_event ev; -+ -+ /* -+ * Newer kernels (3.10-rc1 or upper) provide an uptime clock. -+ * On previous kernels, the way to properly generate an event would -+ * be to inject a fake one, measure its timestamp and diff it against -+ * gettimeofday. We won't do it here. Instead, let's use uptime, -+ * falling-back to the event report's time, if "uptime" clock is -+ * not available (legacy kernels). 
-+ */ -+ -+ if (ras->use_uptime) -+ now = record->ts/user_hz + ras->uptime_diff; -+ else -+ now = time(NULL); -+ -+ tm = localtime(&now); -+ if (tm) -+ strftime(ev.timestamp, sizeof(ev.timestamp), -+ "%Y-%m-%d %H:%M:%S %z", tm); -+ trace_seq_printf(s, "%s ", ev.timestamp); -+ -+ if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0) -+ return -1; -+ sprintf(ev.pfn, "0x%llx", val); -+ trace_seq_printf(s, "pfn=0x%llx ", val); -+ -+ if (pevent_get_field_val(s, event, "type", record, &val, 1) < 0) -+ return -1; -+ ev.page_type = get_page_type(val); -+ trace_seq_printf(s, "page_type=%s ", ev.page_type); -+ -+ if (pevent_get_field_val(s, event, "result", record, &val, 1) < 0) -+ return -1; -+ ev.action_result = get_action_result(val); -+ trace_seq_printf(s, "action_result=%s ", ev.action_result); -+ -+ /* Store data into the SQLite DB */ -+#ifdef HAVE_SQLITE3 -+ ras_store_mf_event(ras, &ev); -+#endif -+ -+#ifdef HAVE_ABRT_REPORT -+ /* Report event to ABRT */ -+ ras_report_mf_event(ras, &ev); -+#endif -+ -+ return 0; -+} ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ b/ras-memory-failure-handler.h 2021-10-14 16:31:36.840657728 -0400 -@@ -0,0 +1,25 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+*/ -+ -+#ifndef __RAS_MEMORY_FAILURE_HANDLER_H -+#define __RAS_MEMORY_FAILURE_HANDLER_H -+ -+#include "ras-events.h" -+#include "libtrace/event-parse.h" -+ -+int ras_memory_failure_event_handler(struct trace_seq *s, -+ struct pevent_record *record, -+ struct event_format *event, void *context); -+ -+#endif ---- a/ras-record.c 2018-04-25 06:19:03.000000000 -0400 -+++ b/ras-record.c 2021-10-14 16:31:36.840657728 -0400 -@@ -404,6 +404,55 @@ sqlite3_bind_text(priv->stmt_mce_record, - } - #endif - -+/* -+ * Table and functions to handle ras:memory_failure -+ */ -+ -+#ifdef HAVE_MEMORY_FAILURE -+static const struct db_fields mf_event_fields[] = { -+ { .name="id", .type="INTEGER PRIMARY KEY" }, -+ { .name="timestamp", .type="TEXT" }, -+ { .name="pfn", .type="TEXT" }, -+ { .name="page_type", .type="TEXT" }, -+ { .name="action_result", .type="TEXT" }, -+}; -+ -+static const struct db_table_descriptor mf_event_tab = { -+ .name = "memory_failure_event", -+ .fields = mf_event_fields, -+ .num_fields = ARRAY_SIZE(mf_event_fields), -+}; -+ -+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) -+{ -+ int rc; -+ struct sqlite3_priv *priv = ras->db_priv; -+ -+ if (!priv || !priv->stmt_mf_event) -+ return 0; -+ log(TERM, LOG_INFO, "memory_failure_event store: %p\n", priv->stmt_mf_event); -+ -+ sqlite3_bind_text(priv->stmt_mf_event, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mf_event, 2, ev->pfn, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mf_event, 3, ev->page_type, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mf_event, 4, ev->action_result, -1, NULL); -+ -+ rc = sqlite3_step(priv->stmt_mf_event); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to do memory_failure_event step on sqlite: error = %d\n", rc); -+ -+ rc = sqlite3_reset(priv->stmt_mf_event); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed reset memory_failure_event on sqlite: error = %d\n", -+ rc); -+ -+ log(TERM, LOG_INFO, 
"register inserted at db\n"); -+ -+ return rc; -+} -+#endif - - /* - * Generic code -@@ -567,6 +616,13 @@ usleep(10000); - rc = ras_mc_prepare_stmt(priv, &priv->stmt_arm_record, - &arm_event_tab); - #endif -+#ifdef HAVE_MEMORY_FAILURE -+ rc = ras_mc_create_table(priv, &mf_event_tab); -+ if (rc == SQLITE_OK) { -+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mf_event, -+ &mf_event_tab); -+ } -+#endif - - ras->db_priv = priv; - return 0; ---- a/ras-record.h 2018-04-25 06:19:03.000000000 -0400 -+++ b/ras-record.h 2021-10-14 16:31:36.840657728 -0400 -@@ -75,12 +75,20 @@ struct ras_arm_event { - int32_t psci_state; - }; - -+struct ras_mf_event { -+ char timestamp[64]; -+ char pfn[30]; -+ const char *page_type; -+ const char *action_result; -+}; -+ - struct ras_mc_event; - struct ras_aer_event; - struct ras_extlog_event; - struct ras_non_standard_event; - struct ras_arm_event; - struct mce_event; -+struct ras_mf_event; - - #ifdef HAVE_SQLITE3 - -@@ -104,6 +112,9 @@ struct sqlite3_priv { - #ifdef HAVE_ARM - sqlite3_stmt *stmt_arm_record; - #endif -+#ifdef HAVE_MEMORY_FAILURE -+ sqlite3_stmt *stmt_mf_event; -+#endif - }; - - int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras); -@@ -113,6 +124,7 @@ int ras_store_mce_record(struct ras_even - int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev); - int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev); - int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev); -+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); - - #else - static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; -@@ -122,6 +134,7 @@ static inline int ras_store_mce_record(s - static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; }; - static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; 
- static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; }; -+static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; - - #endif - ---- a/ras-report.c 2017-10-14 05:11:34.000000000 -0400 -+++ b/ras-report.c 2021-10-14 16:31:36.840657728 -0400 -@@ -255,6 +255,28 @@ "midr=0x%lx\n" \ - return 0; - } - -+static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev) -+{ -+ char bt_buf[MAX_BACKTRACE_SIZE]; -+ -+ if (!buf || !ev) -+ return -1; -+ -+ sprintf(bt_buf, "BACKTRACE=" \ -+ "timestamp=%s\n" \ -+ "pfn=%s\n" \ -+ "page_type=%s\n" \ -+ "action_result=%s\n", \ -+ ev->timestamp, \ -+ ev->pfn, \ -+ ev->page_type, \ -+ ev->action_result); -+ -+ strcat(buf, bt_buf); -+ -+ return 0; -+} -+ - static int commit_report_backtrace(int sockfd, int type, void *ev){ - char buf[MAX_BACKTRACE_SIZE]; - char *pbuf = buf; -@@ -283,6 +305,9 @@ memset(buf, 0, MAX_BACKTRACE_SIZE); - case ARM_EVENT: - rc = set_arm_event_backtrace(buf, (struct ras_arm_event *)ev); - break; -+ case MF_EVENT: -+ rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev); -+ break; - default: - return -1; - } -@@ -549,3 +574,46 @@ return 0; - return -1; - } - } -+ -+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) -+{ -+ char buf[MAX_MESSAGE_SIZE]; -+ int sockfd = 0; -+ int done = 0; -+ int rc = -1; -+ -+ memset(buf, 0, sizeof(buf)); -+ -+ sockfd = setup_report_socket(); -+ if (sockfd < 0) -+ return -1; -+ -+ rc = commit_report_basic(sockfd); -+ if (rc < 0) -+ goto mf_fail; -+ -+ rc = commit_report_backtrace(sockfd, MF_EVENT, ev); -+ if (rc < 0) -+ goto mf_fail; -+ -+ sprintf(buf, "ANALYZER=%s", "rasdaemon-memory_failure"); -+ rc = write(sockfd, buf, strlen(buf) + 1); -+ if (rc < strlen(buf) + 1) -+ goto mf_fail; -+ -+ sprintf(buf, "REASON=%s", "memory failure problem"); -+ rc = write(sockfd, buf, strlen(buf) + 1); -+ if (rc < strlen(buf) + 1) -+ goto mf_fail; -+ -+ done = 1; -+ 
-+mf_fail: -+ if (sockfd > 0) -+ close(sockfd); -+ -+ if (done) -+ return 0; -+ else -+ return -1; -+} ---- a/ras-report.h 2017-10-14 05:11:34.000000000 -0400 -+++ b/ras-report.h 2021-10-14 16:31:36.840657728 -0400 -@@ -34,7 +34,8 @@ enum { - MCE_EVENT, - AER_EVENT, - NON_STANDARD_EVENT, -- ARM_EVENT -+ ARM_EVENT, -+ MF_EVENT, - }; - - #ifdef HAVE_ABRT_REPORT -@@ -44,6 +45,7 @@ int ras_report_aer_event(struct ras_even - int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev); - int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev); - int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev); -+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); - - #else - -@@ -52,6 +54,7 @@ static inline int ras_report_aer_event(s - static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; }; - static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; - static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; }; -+static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; - - #endif - ---- a/Makefile.am 2018-04-25 06:21:56.000000000 -0400 -+++ b/Makefile.am 2021-10-14 16:37:42.423639762 -0400 -@@ -41,12 +41,16 @@ endif - if WITH_EXTLOG - rasdaemon_SOURCES += ras-extlog-handler.c - endif -+if WITH_MEMORY_FAILURE -+ rasdaemon_SOURCES += ras-memory-failure-handler.c -+endif - if WITH_ABRT_REPORT - rasdaemon_SOURCES += ras-report.c - endif - if WITH_HISI_NS_DECODE - rasdaemon_SOURCES += non-standard-hisi_hip07.c - endif -+ - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ---- a/ras-events.c 2021-10-14 16:31:36.730658636 -0400 -+++ b/ras-events.c 2021-10-14 16:37:11.043898809 -0400 -@@ -33,6 +33,7 @@ * Foundation, 
Inc., 51 Franklin Street, - #include "ras-arm-handler.h" - #include "ras-mce-handler.h" - #include "ras-extlog-handler.h" -+#include "ras-memory-failure-handler.h" - #include "ras-record.h" - #include "ras-logger.h" - -@@ -218,6 +219,10 @@ if (rc < 0) { - rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable); - #endif - -+#ifdef HAVE_MEMORY_FAILURE -+ rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable); -+#endif -+ - free_ras: - free(ras); - return rc; -@@ -736,6 +741,16 @@ (void)open("/sys/kernel/debug/ras/daemon - "ras", "aer_event"); - #endif - -+#ifdef HAVE_MEMORY_FAILURE -+ rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event", -+ ras_memory_failure_event_handler); -+ if (!rc) -+ num_events++; -+ else -+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", -+ "ras", "memory_failure_event"); -+#endif -+ - if (!num_events) { - log(ALL, LOG_INFO, - "Failed to trace all supported RAS events. Aborting.\n"); diff --git a/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch b/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch deleted file mode 100644 index 1b5844d975c9998a7aab647144fb0c7d80876c6d..0000000000000000000000000000000000000000 --- a/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch +++ /dev/null @@ -1,66 +0,0 @@ -commit 2a1d217660351c08eb2f8bccebf939abba2f7e69 -Author: Brian WoodsGhannam, Yazen -Date: Fri Nov 1 15:48:13 2019 +0100 - - rasdaemon: rename CPU_NAPLES cputype - - Change CPU_NAPLES to CPU_AMD_SMCA to reflect that it isn't just NAPLES - that is supported, but AMD's Scalable Machine Check Architecture (SMCA). - - [ Yazen: change family check to feature check, and change CPU name. 
] - - CC: "mchehab+samsung@kernel.org" , "Namburu, Chandu-babu" # Thread-Topic: [PATCH 1/2] rasdaemon: rename CPU_NAPLES cputype - Signed-off-by: Brian Woods - Signed-off-by: Yazen Ghannam - Cc: Chandu-babu Namburu - Signed-off-by: Mauro Carvalho Chehab - ---- - ras-mce-handler.c | 10 ++++++---- - ras-mce-handler.h | 2 +- - 2 files changed, 7 insertions(+), 5 deletions(-) - ---- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-05-26 15:16:24.699096556 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-05-26 15:18:06.543162745 -0400 -@@ -55,7 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series - [CPU_KNIGHTS_LANDING] = "Knights Landing", - [CPU_KNIGHTS_MILL] = "Knights Mill", - [CPU_SKYLAKE_XEON] = "Skylake server", -- [CPU_NAPLES] = "AMD Family 17h Zen1" -+ [CPU_AMD_SMCA] = "AMD Scalable MCA", - }; - - static enum cputype select_intel_cputype(struct ras_events *ras) -@@ -191,8 +191,10 @@ ret = 0; - if (!strcmp(mce->vendor, "AuthenticAMD")) { - if (mce->family == 15) - mce->cputype = CPU_K8; -- if (mce->family == 23) -- mce->cputype = CPU_NAPLES; -+ if (strstr(mce->processor_flags, "smca")) { -+ mce->cputype = CPU_AMD_SMCA; -+ goto ret; -+ } - if (mce->family > 23) { - log(ALL, LOG_INFO, - "Can't parse MCE for this AMD CPU yet %d\n", -@@ -435,7 +437,7 @@ if (pevent_get_field_val(s, event, "ipid - case CPU_K8: - rc = parse_amd_k8_event(ras, &e); - break; -- case CPU_NAPLES: -+ case CPU_AMD_SMCA: - rc = parse_amd_smca_event(ras, &e); - break; - default: /* All other CPU types are Intel */ ---- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-05-26 15:17:15.409631590 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-05-26 15:18:20.102038424 -0400 -@@ -50,7 +50,7 @@ enum cputype { - CPU_KNIGHTS_LANDING, - CPU_KNIGHTS_MILL, - CPU_SKYLAKE_XEON, -- CPU_NAPLES, -+ CPU_AMD_SMCA, - }; - - struct mce_event { diff --git a/2b37a26dcec389723f75d69d3da9c2f15f6c317d.patch b/2b37a26dcec389723f75d69d3da9c2f15f6c317d.patch new file mode 100644 index 
0000000000000000000000000000000000000000..eb45db0c742249cc08ba267cfcb04b4decb4f5c0 --- /dev/null +++ b/2b37a26dcec389723f75d69d3da9c2f15f6c317d.patch @@ -0,0 +1,63 @@ +commit 2b37a26dcec389723f75d69d3da9c2f15f6c317d +Author: Mauro Carvalho Chehab +Date: Wed May 26 12:41:27 2021 +0200 + + ci.yml: Fix the job for it to run on a single arch + + There were some issues on the previous content. Fix them, in + order to allow it to build on a single architecture. + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml +index 5b3e757..747a844 100644 +--- a/.github/workflows/ci.yml ++++ b/.github/workflows/ci.yml +@@ -1,34 +1,23 @@ + name: CI + +-# Should run only on branches and PR, as "on_tag.yml" will handle tags + on: ++ workflow_dispatch: + push: +- branches: master test + pull_request: +- branches: master + + jobs: +- +-# +-# Linux +-# + Ubuntu: + name: Ubuntu +- runs-on: ubuntu-20.04 +- strategy: +- matrix: +- arch: [x64_64, aarch64, armv7, ppc64le] ++ runs-on: ubuntu-latest + steps: +- - uses: actions/checkout@v2 +- with: +- arch: ${{ matrix.arch }} +- - name: prepare +- run: | +- sudo apt-get update +- sudo apt-get install -y build-essential sqlite3 +- - name: build +- run: | +- autoreconf -vfi +- ./configure --enable-all +- make +- sudo make install ++ - uses: actions/checkout@v2 ++ - name: prepare ++ run: | ++ sudo apt-get update ++ sudo apt-get install -y build-essential sqlite3 ++ - name: build ++ run: | ++ autoreconf -vfi ++ ./configure --enable-all ++ make ++ sudo make install diff --git a/2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch b/2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch new file mode 100644 index 0000000000000000000000000000000000000000..c2a9376e45ab2c678a1f2d09af5a423a06454b16 --- /dev/null +++ b/2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch @@ -0,0 +1,44 @@ +commit 2b6a54b0d31e02e657171fd27f4e31d996756bc6 +Author: DmNosachev +Date: Thu Jul 22 10:25:38 2021 +0300 + + labels/supermicro: added 
Supermicro X10DRL, X11SPM + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 1e7761f..990fc9e 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -88,6 +88,16 @@ Vendor: Supermicro + P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1; + P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1; + P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1; ++ ++ Model: X10DRL-i ++ P1-DIMMA1: 0.0.0; ++ P1-DIMMB1: 0.1.0; ++ P1-DIMMC1: 0.2.0; ++ P1-DIMMD1: 0.3.0; ++ P2-DIMME1: 1.0.0; ++ P2-DIMMF1: 1.1.0; ++ P2-DIMMG1: 1.2.0; ++ P2-DIMMH1: 1.3.0; + + Model: X11DDW-NT, X11DDW-L + P1-DIMMA1: 0.0.0; +@@ -102,6 +112,14 @@ Vendor: Supermicro + P2-DIMMD1: 3.0.0; + P2-DIMME1: 3.1.0; + P2-DIMMF1: 3.2.0; ++ ++ Model: X11SPM-F, X11SPM-TF, X11SPM-TPF ++ DIMMA1: 0.0.0; ++ DIMMB1: 0.1.0; ++ DIMMC1: 0.2.0; ++ DIMMD1: 1.0.0; ++ DIMME1: 1.1.0; ++ DIMMF1: 1.2.0; + + Model: B1DRi + P1_DIMMA1: 0.0.0; diff --git a/50565005b10fe909c66f1c90f2feb95712427c7d.patch b/50565005b10fe909c66f1c90f2feb95712427c7d.patch new file mode 100644 index 0000000000000000000000000000000000000000..dba01162fea30d9d446568226277d7eb5049e18e --- /dev/null +++ b/50565005b10fe909c66f1c90f2feb95712427c7d.patch @@ -0,0 +1,43 @@ +commit 50565005b10fe909c66f1c90f2feb95712427c7d +Author: DmNosachev +Date: Tue Jun 29 14:07:54 2021 +0300 + + labels/supermicro: added Supermicro X11DDW-NT(-L) + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 86e4617..373de07 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -69,7 +69,7 @@ Vendor: Supermicro + P2_DIMM4B: 2.0.1; + P2_DIMM4B: 2.1.1; + +- Model: X11DPH-i ++ Model: X11DPH-i, X11DPH-T, X11DPH-TQ + P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1; + P1-DIMMB1: 0.1.0; + P1-DIMMC1: 0.2.0; +@@ -91,4 +91,18 @@ Vendor: Supermicro + P2-DIMME1: 1.0.0; P2-DIMME2: 1.0.1; + P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1; + P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1; +- P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1; +\ No newline at end of file ++ P2-DIMMH1: 1.3.0; P2-DIMMH2: 
1.3.1; ++ ++ Model: X11DDW-NT, X11DDW-L ++ P1-DIMMA1: 0.0.0; ++ P1-DIMMB1: 0.1.0; ++ P1-DIMMC1: 0.2.0; ++ P1-DIMMD1: 1.0.0; ++ P1-DIMME1: 1.1.0; ++ P1-DIMMF1: 1.2.0; ++ P2-DIMMA1: 2.0.0; ++ P2-DIMMB1: 2.1.0; ++ P2-DIMMC1: 2.2.0; ++ P2-DIMMD1: 3.0.0; ++ P2-DIMME1: 3.1.0; ++ P2-DIMMF1: 3.2.0; +\ No newline at end of file diff --git a/546cf713f667437fb6e283cc3dc090679eb47d08.patch b/546cf713f667437fb6e283cc3dc090679eb47d08.patch deleted file mode 100644 index 448b1f6498aa88aba61848e5ab0cfcdeaf1ddb2a..0000000000000000000000000000000000000000 --- a/546cf713f667437fb6e283cc3dc090679eb47d08.patch +++ /dev/null @@ -1,372 +0,0 @@ -commit 546cf713f667437fb6e283cc3dc090679eb47d08 -Author: Subhendu Saha -Date: Tue Jan 12 03:29:55 2021 -0500 - - Fix ras-mc-ctl script. - - When rasdaemon is compiled without enabling aer, mce, devlink, - etc., those tables are not created in the database file. Then - ras-mc-ctl script breaks trying to query data from non-existent - tables. - - Signed-off-by: Subhendu Saha subhends@akamai.com - Signed-off-by: Mauro Carvalho Chehab - ---- - util/ras-mc-ctl.in | 310 ++++++++++++++++++++++++++++------------------------- - 1 file changed, 168 insertions(+), 142 deletions(-) - ---- a/util/ras-mc-ctl.in 2021-10-12 13:45:43.260646935 -0400 -+++ b/util/ras-mc-ctl.in 2021-10-12 13:46:38.610158949 -0400 -@@ -41,6 +41,16 @@ my $sysconfdir = "@sysconfdir@"; - my $dmidecode = find_prog ("dmidecode"); - my $modprobe = find_prog ("modprobe") or exit (1); - -+my $has_aer = 0; -+my $has_arm = 0; -+my $has_extlog = 0; -+my $has_mce = 0; -+ -+@WITH_AER_TRUE@$has_aer = 1; -+@WITH_ARM_TRUE@$has_arm = 1; -+@WITH_EXTLOG_TRUE@$has_extlog = 1; -+@WITH_MCE_TRUE@$has_mce = 1; -+ - my %conf = (); - my %bus = (); - my %dimm_size = (); -@@ -1145,70 +1155,78 @@ sub summary - $query_handle->finish; - - # PCIe AER aer_event errors -- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg"; -- $query_handle = $dbh->prepare($query); -- 
$query_handle->execute(); -- $query_handle->bind_columns(\($err_type, $msg, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count $err_type errors: $msg\n"; -- } -- if ($out ne "") { -- print "PCIe AER events summary:\n$out\n"; -- } else { -- print "No PCIe AER errors.\n\n"; -+ if ($has_aer == 1) { -+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($err_type, $msg, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count $err_type errors: $msg\n"; -+ } -+ if ($out ne "") { -+ print "PCIe AER events summary:\n$out\n"; -+ } else { -+ print "No PCIe AER errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # ARM processor arm_event errors -- $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($affinity, $mpidr, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count errors\n"; -- } -- if ($out ne "") { -- print "ARM processor events summary:\n$out\n"; -- } else { -- print "No ARM processor errors.\n\n"; -+ if ($has_arm == 1) { -+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($affinity, $mpidr, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count errors\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events summary:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # extlog errors -- $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- 
$query_handle->bind_columns(\($etype, $severity, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $etype_string = get_extlog_type($etype); -- $severity_string = get_extlog_severity($severity); -- $out .= "\t$count $etype_string $severity_string errors\n"; -- } -- if ($out ne "") { -- print "Extlog records summary:\n$out"; -- } else { -- print "No Extlog errors.\n"; -+ if ($has_extlog == 1) { -+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($etype, $severity, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $etype_string = get_extlog_type($etype); -+ $severity_string = get_extlog_severity($severity); -+ $out .= "\t$count $etype_string $severity_string errors\n"; -+ } -+ if ($out ne "") { -+ print "Extlog records summary:\n$out"; -+ } else { -+ print "No Extlog errors.\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # MCE mce_record errors -- $query = "select error_msg, count(*) from mce_record group by error_msg"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($msg, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count $msg errors\n"; -- } -- if ($out ne "") { -- print "MCE records summary:\n$out"; -- } else { -- print "No MCE errors.\n"; -+ if ($has_mce == 1) { -+ $query = "select error_msg, count(*) from mce_record group by error_msg"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($msg, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count $msg errors\n"; -+ } -+ if ($out ne "") { -+ print "MCE records summary:\n$out"; -+ } else { -+ print "No MCE errors.\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - undef($dbh); - } -@@ -1244,105 +1262,113 @@ sub errors - $query_handle->finish; - - # 
PCIe AER aer_event errors -- $query = "select id, timestamp, err_type, err_msg from aer_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $type, $msg)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $time $type error: $msg\n"; -- } -- if ($out ne "") { -- print "PCIe AER events:\n$out\n"; -- } else { -- print "No PCIe AER errors.\n\n"; -+ if ($has_aer == 1) { -+ $query = "select id, timestamp, err_type, err_msg from aer_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $time, $type, $msg)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $time $type error: $msg\n"; -+ } -+ if ($out ne "") { -+ print "PCIe AER events:\n$out\n"; -+ } else { -+ print "No PCIe AER errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # ARM processor arm_event errors -- $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $timestamp error: "; -- $out .= "error_count=$error_count, " if ($error_count); -- $out .= "affinity_level=$affinity, "; -- $out .= sprintf "mpidr=0x%x, ", $mpidr; -- $out .= sprintf "running_state=0x%x, ", $r_state; -- $out .= sprintf "psci_state=0x%x", $psci_state; -- $out .= "\n"; -- } -- if ($out ne "") { -- print "ARM processor events:\n$out\n"; -- } else { -- print "No ARM processor errors.\n\n"; -+ if ($has_arm == 1) { -+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ 
$query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "error_count=$error_count, " if ($error_count); -+ $out .= "affinity_level=$affinity, "; -+ $out .= sprintf "mpidr=0x%x, ", $mpidr; -+ $out .= sprintf "running_state=0x%x, ", $r_state; -+ $out .= sprintf "psci_state=0x%x", $psci_state; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # Extlog errors -- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data)); -- $out = ""; -- while($query_handle->fetch()) { -- $etype_string = get_extlog_type($etype); -- $severity_string = get_extlog_severity($severity); -- $out .= "$id $timestamp error: "; -- $out .= "type=$etype_string, "; -- $out .= "severity=$severity_string, "; -- $out .= sprintf "address=0x%08x, ", $addr; -- $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id); -- $out .= "fru_text='$fru_text', "; -- $out .= get_cper_data_text($cper_data) if ($cper_data); -- $out .= "\n"; -- } -- if ($out ne "") { -- print "Extlog events:\n$out\n"; -- } else { -- print "No Extlog errors.\n\n"; -+ if ($has_extlog) { -+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $etype_string = get_extlog_type($etype); -+ $severity_string = 
get_extlog_severity($severity); -+ $out .= "$id $timestamp error: "; -+ $out .= "type=$etype_string, "; -+ $out .= "severity=$severity_string, "; -+ $out .= sprintf "address=0x%08x, ", $addr; -+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id); -+ $out .= "fru_text='$fru_text', "; -+ $out .= get_cper_data_text($cper_data) if ($cper_data); -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "Extlog events:\n$out\n"; -+ } else { -+ print "No Extlog errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # MCE mce_record errors -- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $time error: $msg"; -- $out .= ", CPU $cpuvendor" if ($cpuvendor); -- $out .= ", bank $bank_name" if ($bank_name); -- $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); -- $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); -- $out .= ", $mc_location" if ($mc_location); -- $out .= ", $user_action" if ($user_action); -- $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); -- $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus); -- $out .= sprintf ", status=0x%08x", $status if ($status); -- $out .= sprintf ", addr=0x%08x", $addr if ($addr); -- $out .= sprintf ", misc=0x%08x", $misc if ($misc); -- $out .= sprintf ", ip=0x%08x", $ip if ($ip); -- $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc); -- $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime); -- $out .= sprintf ", 
cpu=0x%08x", $cpu if ($cpu); -- $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid); -- $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid); -- $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid); -- $out .= sprintf ", cs=0x%08x", $cs if ($cs); -- $out .= sprintf ", bank=0x%08x", $bank if ($bank); -+ if ($has_mce == 1) { -+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $time error: $msg"; -+ $out .= ", CPU $cpuvendor" if ($cpuvendor); -+ $out .= ", bank $bank_name" if ($bank_name); -+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); -+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); -+ $out .= ", $mc_location" if ($mc_location); -+ $out .= ", $user_action" if ($user_action); -+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); -+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus); -+ $out .= sprintf ", status=0x%08x", $status if ($status); -+ $out .= sprintf ", addr=0x%08x", $addr if ($addr); -+ $out .= sprintf ", misc=0x%08x", $misc if ($misc); -+ $out .= sprintf ", ip=0x%08x", $ip if ($ip); -+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc); -+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime); -+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu); -+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid); -+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid); -+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid); -+ $out .= 
sprintf ", cs=0x%08x", $cs if ($cs); -+ $out .= sprintf ", bank=0x%08x", $bank if ($bank); - -- $out .= "\n"; -- } -- if ($out ne "") { -- print "MCE events:\n$out\n"; -- } else { -- print "No MCE errors.\n\n"; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "MCE events:\n$out\n"; -+ } else { -+ print "No MCE errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - undef($dbh); - } diff --git a/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch b/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch deleted file mode 100644 index 57a4e466ed319ab95060efa27f99a7e423594754..0000000000000000000000000000000000000000 --- a/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch +++ /dev/null @@ -1,149 +0,0 @@ -commit 60a91e4da4f2daf2b10143fc148a8043312b61e5 -Author: Aristeu Rozanski -Date: Wed Aug 1 16:29:58 2018 -0400 - - rasdaemon: ras-mc-ctl: add option to show error counts - - In some scenarios it might not be desirable to have a daemon running - to parse and store the errors provided by EDAC and only having the - number of CEs and UEs is enough. This patch implements this feature - as an ras-mc-ctl option. - - Signed-off-by: Aristeu Rozanski - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 38b7824..aee431a 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -50,6 +50,8 @@ my %dimm_location = (); - my %csrow_size = (); - my %rank_size = (); - my %csrow_ranks = (); -+my %dimm_ce_count = (); -+my %dimm_ue_count = (); - - my @layers; - my @max_pos; -@@ -76,6 +78,7 @@ Usage: $prog [OPTIONS...] - --layout Display the memory layout. - --summary Presents a summary of the logged errors. - --errors Shows the errors stored at the error database. -+ --error-count Shows the corrected and uncorrected error counts using sysfs. - --help This help message. 
- EOF - -@@ -83,7 +86,7 @@ parse_cmdline(); - - if ( $conf{opt}{mainboard} || $conf{opt}{print_labels} - || $conf{opt}{register_labels} || $conf{opt}{display_memory_layout} -- || $conf{opt}{guess_dimm_label}) { -+ || $conf{opt}{guess_dimm_label} || $conf{opt}{error_count}) { - - get_mainboard_info(); - -@@ -105,6 +108,9 @@ if ( $conf{opt}{mainboard} || $conf{opt}{print_labels} - if ($conf{opt}{guess_dimm_label}) { - guess_dimm_label (); - } -+ if ($conf{opt}{error_count}) { -+ display_error_count (); -+ } - } - - if ($conf{opt}{status}) { -@@ -134,6 +140,7 @@ sub parse_cmdline - $conf{opt}{guess_dimm_label} = 0; - $conf{opt}{summary} = 0; - $conf{opt}{errors} = 0; -+ $conf{opt}{error_count} = 0; - - my $rref = \$conf{opt}{report}; - my $mref = \$conf{opt}{mainboard}; -@@ -150,7 +157,8 @@ sub parse_cmdline - "status" => \$conf{opt}{status}, - "layout" => \$conf{opt}{display_memory_layout}, - "summary" => \$conf{opt}{summary}, -- "errors" => \$conf{opt}{errors} -+ "errors" => \$conf{opt}{errors}, -+ "error-count" => \$conf{opt}{error_count} - ); - - usage(1) if !$rc; -@@ -284,6 +292,30 @@ sub parse_dimm_nodes - $dimm_label_file{$str_loc} = $file; - $dimm_location{$str_loc} = $location; - -+ my $count; -+ -+ $file =~s/dimm_label/dimm_ce_count/; -+ if (-e $file) { -+ open IN, $file; -+ chomp($count = ); -+ close IN; -+ } else { -+ log_error ("dimm_ce_count not found in sysfs. Old kernel?\n"); -+ exit -1; -+ } -+ $dimm_ce_count{$str_loc} = $count; -+ -+ $file =~s/dimm_ce_count/dimm_ue_count/; -+ if (-e $file) { -+ open IN, $file; -+ chomp($count = ); -+ close IN; -+ } else { -+ log_error ("dimm_ue_count not found in sysfs. 
Old kernel?\n"); -+ exit -1; -+ } -+ $dimm_ue_count{$str_loc} = $count; -+ - return; - } - } -@@ -906,6 +938,45 @@ sub display_memory_layout - dimm_display_mem(); - } - -+sub display_error_count -+{ -+ my $sysfs_dir = "/sys/devices/system/edac/mc"; -+ my $key; -+ my $max_width = 0; -+ my %dimm_labels = (); -+ -+ find ({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir); -+ -+ if (!scalar(keys %dimm_node)) { -+ log_error ("No DIMMs found in /sys or new sysfs EDAC interface not found.\n"); -+ exit -1; -+ } -+ -+ foreach $key (keys %dimm_node) { -+ my $label_width; -+ -+ open IN, $dimm_label_file{$key}; -+ chomp(my $label = ); -+ close IN; -+ $label_width = length $label; -+ -+ if ($label_width > $max_width) { -+ $max_width = $label_width; -+ } -+ $dimm_labels{$key} = $label; -+ } -+ my $string = "Label"; -+ $string .= " " x ($max_width - length $string); -+ print($string . "\tCE\tUE\n"); -+ -+ foreach $key (keys %dimm_node) { -+ my $ce_count = $dimm_ce_count{$key}; -+ my $ue_count = $dimm_ue_count{$key}; -+ -+ print("$dimm_labels{$key}\t$ce_count\t$ue_count\n"); -+ } -+} -+ - sub find_prog - { - my ($file) = @_; diff --git a/6bc43db1b6b3d73805179c21d1dd5521e8dc0f74.patch b/6bc43db1b6b3d73805179c21d1dd5521e8dc0f74.patch new file mode 100644 index 0000000000000000000000000000000000000000..2d3bd32a232bb3e84e361a2f8d46921f509dbc57 --- /dev/null +++ b/6bc43db1b6b3d73805179c21d1dd5521e8dc0f74.patch @@ -0,0 +1,37 @@ +commit 6bc43db1b6b3d73805179c21d1dd5521e8dc0f74 +Author: DmNosachev +Date: Fri Jul 2 13:13:46 2021 +0300 + + labels/supermicro: added Supermicro X11SCA(-F) + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index b924a32..1e7761f 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -10,11 +10,7 @@ + # + + Vendor: Supermicro +- Model: A2SDi-8C-HLN4F +- DIMMA1: 0.0.0; DIMMA2: 0.0.1; +- DIMMB1: 0.1.0; DIMMB2: 0.1.1; +- +- Model: A2SDi-8C+-HLN4F ++ Model: A2SDi-8C-HLN4F, A2SDi-8C+-HLN4F + DIMMA1: 0.0.0; 
DIMMA2: 0.0.1; + DIMMB1: 0.1.0; DIMMB2: 0.1.1; + +@@ -115,4 +111,8 @@ Vendor: Supermicro + P2_DIMME1: 1.0.0; + P2_DIMMF1: 1.1.0; + P2_DIMMG1: 1.2.0; +- P2_DIMMH1: 1.3.0; +\ No newline at end of file ++ P2_DIMMH1: 1.3.0; ++ ++ Model: X11SCA, X11SCA-F ++ DIMMA1: 0.0.0, 0.1.0; DIMMA2: 0.2.0, 0.3.0; ++ DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1; +\ No newline at end of file diff --git a/738bafafdcb2e8b0ced32fff31b13754d571090b.patch b/738bafafdcb2e8b0ced32fff31b13754d571090b.patch new file mode 100644 index 0000000000000000000000000000000000000000..a3ba3248032d547870d5de44f6d0467fe92e6efc --- /dev/null +++ b/738bafafdcb2e8b0ced32fff31b13754d571090b.patch @@ -0,0 +1,610 @@ +commit 738bafafdcb2e8b0ced32fff31b13754d571090b +Author: Jason Tian +Date: Fri May 28 11:35:43 2021 +0800 + + Add error handling for Ampere-specific errors. + + Save Ampere-specific errors' decode into sqlite3 data + base and log PCIe segment, bus/device/function number + into BMC SEL. + + Signed-off-by: Jason Tian + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/non-standard-ampere.c b/non-standard-ampere.c +index 8cceb26..05b5252 100644 +--- a/non-standard-ampere.c ++++ b/non-standard-ampere.c +@@ -216,6 +216,13 @@ static const char * const err_bert_sub_type[] = { + "PMPRO Fatal", + }; + ++static char *sqlite3_table_list[] = { ++ "amp_payload0_event_tab", ++ "amp_payload1_event_tab", ++ "amp_payload2_event_tab", ++ "amp_payload3_event_tab", ++}; ++ + struct amp_ras_type_info { + int id; + const char *name; +@@ -352,6 +359,359 @@ static const char *oem_subtype_name(const struct amp_ras_type_info *info, + return "unknown"; + } + ++#ifdef HAVE_SQLITE3 ++/*key pair definition for ampere specific error payload type 0*/ ++static const struct db_fields amp_payload0_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "type", .type = "TEXT" }, ++ { .name = "subtype", .type = "TEXT" }, ++ { .name = "instance", .type = "INTEGER" 
}, ++ { .name = "socket_num", .type = "INTEGER" }, ++ { .name = "status_reg", .type = "INTEGER" }, ++ { .name = "addr_reg", .type = "INTEGER" }, ++ { .name = "misc0", .type = "INTEGER" }, ++ { .name = "misc1", .type = "INTEGER" }, ++ { .name = "misc2", .type = "INTEGER" }, ++ { .name = "misc3", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor amp_payload0_event_tab = { ++ .name = "amp_payload0_event", ++ .fields = amp_payload0_event_fields, ++ .num_fields = ARRAY_SIZE(amp_payload0_event_fields), ++}; ++ ++/*key pair definition for ampere specific error payload type 1*/ ++static const struct db_fields amp_payload1_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "type", .type = "TEXT" }, ++ { .name = "subtype", .type = "TEXT" }, ++ { .name = "instance", .type = "INTEGER" }, ++ { .name = "socket_num", .type = "INTEGER" }, ++ { .name = "uncore_err_status", .type = "INTEGER" }, ++ { .name = "uncore_err_mask", .type = "INTEGER" }, ++ { .name = "uncore_err_sev", .type = "INTEGER" }, ++ { .name = "core_err_status", .type = "INTEGER" }, ++ { .name = "core_err_mask", .type = "INTEGER" }, ++ { .name = "root_err_cmd", .type = "INTEGER" }, ++ { .name = "root_err_status", .type = "INTEGER" }, ++ { .name = "src_id", .type = "INTEGER" }, ++ { .name = "reserved1", .type = "INTEGER" }, ++ { .name = "reserverd2", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor amp_payload1_event_tab = { ++ .name = "amp_payload1_event", ++ .fields = amp_payload1_event_fields, ++ .num_fields = ARRAY_SIZE(amp_payload1_event_fields), ++}; ++ ++/*key pair definition for ampere specific error payload type 2*/ ++static const struct db_fields amp_payload2_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "type", .type = "TEXT" }, ++ { .name = "subtype", .type = "TEXT" }, ++ { .name = "instance", .type = 
"INTEGER" }, ++ { .name = "socket_num", .type = "INTEGER" }, ++ { .name = "ce_report_reg", .type = "INTEGER" }, ++ { .name = "ce_location", .type = "INTEGER" }, ++ { .name = "ce_addr", .type = "INTEGER" }, ++ { .name = "ue_report_reg", .type = "INTEGER" }, ++ { .name = "ue_location", .type = "INTEGER" }, ++ { .name = "ue_addr", .type = "INTEGER" }, ++ { .name = "reserved1", .type = "INTEGER" }, ++ { .name = "reserved2", .type = "INTEGER" }, ++ { .name = "reserved2", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor amp_payload2_event_tab = { ++ .name = "amp_payload2_event", ++ .fields = amp_payload2_event_fields, ++ .num_fields = ARRAY_SIZE(amp_payload2_event_fields), ++}; ++ ++/*key pair definition for ampere specific error payload type 3*/ ++static const struct db_fields amp_payload3_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "type", .type = "TEXT" }, ++ { .name = "subtype", .type = "TEXT" }, ++ { .name = "instance", .type = "INTEGER" }, ++ { .name = "socket_num", .type = "INTEGER" }, ++ { .name = "fw_spec_data0", .type = "INTEGER" }, ++ { .name = "fw_spec_data1", .type = "INTEGER" }, ++ { .name = "fw_spec_data2", .type = "INTEGER" }, ++ { .name = "fw_spec_data3", .type = "INTEGER" }, ++ { .name = "fw_spec_data4", .type = "INTEGER" }, ++ { .name = "fw_spec_data5", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor amp_payload3_event_tab = { ++ .name = "amp_payload3_event", ++ .fields = amp_payload3_event_fields, ++ .num_fields = ARRAY_SIZE(amp_payload3_event_fields), ++}; ++ ++/*Save data with different type into sqlite3 db*/ ++static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder, ++ enum amp_oem_data_type data_type, ++ int id, int64_t data, const char *text) ++{ ++ switch (data_type) { ++ case AMP_OEM_DATA_TYPE_INT: ++ sqlite3_bind_int(ev_decoder->stmt_dec_record, id, data); ++ break; ++ case AMP_OEM_DATA_TYPE_INT64: ++ 
sqlite3_bind_int64(ev_decoder->stmt_dec_record, id, data); ++ break; ++ case AMP_OEM_DATA_TYPE_TEXT: ++ sqlite3_bind_text(ev_decoder->stmt_dec_record, id, ++ text, -1, NULL); ++ break; ++ default: ++ break; ++ } ++} ++ ++static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder, ++ const char *name) ++{ ++ int rc; ++ ++ rc = sqlite3_step(ev_decoder->stmt_dec_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do %s step on sqlite: error = %d\n", name, rc); ++ ++ rc = sqlite3_reset(ev_decoder->stmt_dec_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to reset %s on sqlite: error = %d\n", name, rc); ++ ++ rc = sqlite3_clear_bindings(ev_decoder->stmt_dec_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to clear bindings %s on sqlite: error = %d\n", ++ name, rc); ++ ++ return rc; ++} ++ ++/*save all Ampere Specific Error Payload type 0 to sqlite3 database*/ ++static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload0_type_sec *err) ++{ ++ if (ev_decoder != NULL) { ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD0_FIELD_TYPE, 0, type_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD0_FIELD_SUB_TYPE, 0, subtype_str); ++ ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD0_FIELD_INS, INSTANCE(err->instance), NULL); ++ ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD0_FIELD_SOCKET_NUM, ++ SOCKET_NUM(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD0_FIELD_STATUS_REG, err->err_status, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_ADDR_REG, ++ err->err_addr, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_MISC0, ++ err->err_misc_0, NULL); ++ 
record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_MISC1, ++ err->err_misc_1, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_MISC2, ++ err->err_misc_2, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_MISC3, ++ err->err_misc_3, NULL); ++ store_amp_err_data(ev_decoder, "amp_payload0_event_tab"); ++ } ++} ++ ++/*save all Ampere Specific Error Payload type 1 to sqlite3 database*/ ++static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload1_type_sec *err) ++{ ++ if (ev_decoder != NULL) { ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD1_FIELD_TYPE, 0, type_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD1_FIELD_SUB_TYPE, 0, subtype_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_INS, ++ INSTANCE(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_SOCKET_NUM, ++ SOCKET_NUM(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_STATUS, ++ err->uncore_status, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_MASK, ++ err->uncore_mask, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_SEV, ++ err->uncore_sev, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_CORE_ERR_STATUS, ++ err->core_status, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_CORE_ERR_MASK, ++ err->core_mask, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_ROOT_ERR_CMD, ++ err->root_err_cmd, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_ROOT_ERR_STATUS, ++ err->root_status, NULL); ++ record_amp_data(ev_decoder, 
AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_SRC_ID, ++ err->src_id, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_RESERVED1, ++ err->reserved1, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD1_FIELD_RESERVED2, ++ err->reserved2, NULL); ++ store_amp_err_data(ev_decoder, "amp_payload1_event_tab"); ++ } ++} ++ ++/*save all Ampere Specific Error Payload type 2 to sqlite3 database*/ ++static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload2_type_sec *err) ++{ ++ if (ev_decoder != NULL) { ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD2_FIELD_TYPE, 0, type_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD2_FIELD_SUB_TYPE, 0, subtype_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_INS, INSTANCE(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_SOCKET_NUM, ++ SOCKET_NUM(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_CE_REPORT_REG, ++ err->ce_register, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_CE_LOACATION, ++ err->ce_location, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_CE_ADDR, ++ err->ce_addr, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_UE_REPORT_REG, ++ err->ue_register, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_UE_LOCATION, ++ err->ue_location, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_UE_ADDR, ++ err->ue_addr, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_RESERVED1, ++ err->reserved1, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD2_FIELD_RESERVED2, ++ 
err->reserved2, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD2_FIELD_RESERVED3, ++ err->reserved3, NULL); ++ store_amp_err_data(ev_decoder, "amp_payload2_event_tab"); ++ } ++} ++ ++/*save all Ampere Specific Error Payload type 3 to sqlite3 database*/ ++static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload3_type_sec *err) ++{ ++ if (ev_decoder != NULL) { ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD3_FIELD_TYPE, 0, type_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD3_FIELD_SUB_TYPE, 0, subtype_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD3_FIELD_INS, INSTANCE(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD3_FIELD_SOCKET_NUM, ++ SOCKET_NUM(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0, ++ err->fw_speci_data0, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1, ++ err->fw_speci_data1, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2, ++ err->fw_speci_data2, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3, ++ err->fw_speci_data3, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4, ++ err->fw_speci_data4, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5, ++ err->fw_speci_data5, NULL); ++ store_amp_err_data(ev_decoder, "amp_payload3_event_tab"); ++ } ++} ++ ++#else ++static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder, ++ enum amp_oem_data_type data_type, ++ int id, int64_t data, const char *text) ++{ ++ return 0; ++} ++ ++static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder, ++ 
const char *type_str, const char *subtype_str, ++ const struct amp_payload0_type_sec *err) ++{ ++ return 0; ++} ++ ++static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload1_type_sec *err) ++{ ++ return 0; ++} ++ ++static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload2_type_sec *err) ++{ ++ return 0; ++} ++ ++static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload3_type_sec *err) ++{ ++ return 0; ++} ++ ++static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder, char *name) ++{ ++ return 0; ++} ++#endif + + /*decode ampere specific error payload type 0, the CPU's data is save*/ + /*to sqlite by ras-arm-handler, others are saved by this function.*/ +@@ -434,6 +794,7 @@ void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder, + *p = '\0'; + } + ++ record_amp_payload0_err(ev_decoder, type_str, subtype_str, err); + i = 0; + p = NULL; + end = NULL; +@@ -517,6 +878,7 @@ static void decode_amp_payload1_err_regs(struct ras_ns_ev_decoder *ev_decoder, + *p = '\0'; + } + ++ record_amp_payload1_err(ev_decoder, type_str, subtype_str, err); + i = 0; + p = NULL; + end = NULL; +@@ -601,6 +963,7 @@ static void decode_amp_payload2_err_regs(struct ras_ns_ev_decoder *ev_decoder, + *p = '\0'; + } + ++ record_amp_payload2_err(ev_decoder, type_str, subtype_str, err); + i = 0; + p = NULL; + end = NULL; +@@ -673,6 +1036,7 @@ static void decode_amp_payload3_err_regs(struct ras_ns_ev_decoder *ev_decoder, + *p = '\0'; + } + ++ record_amp_payload3_err(ev_decoder, type_str, subtype_str, err); + i = 0; + p = NULL; + end = NULL; +@@ -687,6 +1051,38 @@ static int decode_amp_oem_type_error(struct ras_events *ras, + { + int payload_type = PAYLOAD_TYPE(event->error[0]); + ++#ifdef 
HAVE_SQLITE3 ++ struct db_table_descriptor db_tab; ++ int id = 0; ++ ++ if (payload_type == PAYLOAD_TYPE_0) { ++ db_tab = amp_payload0_event_tab; ++ id = AMP_PAYLOAD0_FIELD_TIMESTAMP; ++ } else if (payload_type == PAYLOAD_TYPE_1) { ++ db_tab = amp_payload1_event_tab; ++ id = AMP_PAYLOAD1_FIELD_TIMESTAMP; ++ } else if (payload_type == PAYLOAD_TYPE_2) { ++ db_tab = amp_payload2_event_tab; ++ id = AMP_PAYLOAD2_FIELD_TIMESTAMP; ++ } else if (payload_type == PAYLOAD_TYPE_3) { ++ db_tab = amp_payload3_event_tab; ++ id = AMP_PAYLOAD3_FIELD_TIMESTAMP; ++ } else ++ return -1; ++ ++ if (!ev_decoder->stmt_dec_record) { ++ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, ++ &db_tab) != SQLITE_OK) { ++ trace_seq_printf(s, ++ "create sql %s fail\n", ++ sqlite3_table_list[payload_type]); ++ return -1; ++ } ++ } ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ id, 0, event->timestamp); ++#endif ++ + if (payload_type == PAYLOAD_TYPE_0) { + const struct amp_payload0_type_sec *err = + (struct amp_payload0_type_sec *)event->error; +diff --git a/non-standard-ampere.h b/non-standard-ampere.h +index aacf3a8..f463c53 100644 +--- a/non-standard-ampere.h ++++ b/non-standard-ampere.h +@@ -102,6 +102,79 @@ struct amp_payload3_type_sec { + uint64_t fw_speci_data5; + }; + ++enum amp_oem_data_type { ++ AMP_OEM_DATA_TYPE_INT, ++ AMP_OEM_DATA_TYPE_INT64, ++ AMP_OEM_DATA_TYPE_TEXT, ++}; ++ ++enum { ++ AMP_PAYLOAD0_FIELD_ID, ++ AMP_PAYLOAD0_FIELD_TIMESTAMP, ++ AMP_PAYLOAD0_FIELD_TYPE, ++ AMP_PAYLOAD0_FIELD_SUB_TYPE, ++ AMP_PAYLOAD0_FIELD_INS, ++ AMP_PAYLOAD0_FIELD_SOCKET_NUM, ++ AMP_PAYLOAD0_FIELD_STATUS_REG, ++ AMP_PAYLOAD0_FIELD_ADDR_REG, ++ AMP_PAYLOAD0_FIELD_MISC0, ++ AMP_PAYLOAD0_FIELD_MISC1, ++ AMP_PAYLOAD0_FIELD_MISC2, ++ AMP_PAYLOAD0_FIELD_MISC3, ++}; ++ ++enum { ++ AMP_PAYLOAD1_FIELD_ID, ++ AMP_PAYLOAD1_FIELD_TIMESTAMP, ++ AMP_PAYLOAD1_FIELD_TYPE, ++ AMP_PAYLOAD1_FIELD_SUB_TYPE, ++ AMP_PAYLOAD1_FIELD_INS, ++ AMP_PAYLOAD1_FIELD_SOCKET_NUM, ++ 
AMP_PAYLOAD1_FIELD_UNCORE_ERR_STATUS, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_MASK, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_SEV, ++ AMP_PAYLOAD1_FIELD_CORE_ERR_STATUS, ++ AMP_PAYLOAD1_FIELD_CORE_ERR_MASK, ++ AMP_PAYLOAD1_FIELD_ROOT_ERR_CMD, ++ AMP_PAYLOAD1_FIELD_ROOT_ERR_STATUS, ++ AMP_PAYLOAD1_FIELD_SRC_ID, ++ AMP_PAYLOAD1_FIELD_RESERVED1, ++ AMP_PAYLOAD1_FIELD_RESERVED2, ++}; ++ ++enum { ++ AMP_PAYLOAD2_FIELD_ID, ++ AMP_PAYLOAD2_FIELD_TIMESTAMP, ++ AMP_PAYLOAD2_FIELD_TYPE, ++ AMP_PAYLOAD2_FIELD_SUB_TYPE, ++ AMP_PAYLOAD2_FIELD_INS, ++ AMP_PAYLOAD2_FIELD_SOCKET_NUM, ++ AMP_PAYLOAD2_FIELD_CE_REPORT_REG, ++ AMP_PAYLOAD2_FIELD_CE_LOACATION, ++ AMP_PAYLOAD2_FIELD_CE_ADDR, ++ AMP_PAYLOAD2_FIELD_UE_REPORT_REG, ++ AMP_PAYLOAD2_FIELD_UE_LOCATION, ++ AMP_PAYLOAD2_FIELD_UE_ADDR, ++ AMP_PAYLOAD2_FIELD_RESERVED1, ++ AMP_PAYLOAD2_FIELD_RESERVED2, ++ AMP_PAYLOAD2_FIELD_RESERVED3, ++}; ++ ++enum { ++ AMP_PAYLOAD3_FIELD_ID, ++ AMP_PAYLOAD3_FIELD_TIMESTAMP, ++ AMP_PAYLOAD3_FIELD_TYPE, ++ AMP_PAYLOAD3_FIELD_SUB_TYPE, ++ AMP_PAYLOAD3_FIELD_INS, ++ AMP_PAYLOAD3_FIELD_SOCKET_NUM, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5 ++}; ++ + void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, + const struct amp_payload0_type_sec *err); +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index 8ddd439..6f4cb2b 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -67,6 +67,9 @@ int ras_aer_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_aer_event ev; + char buf[BUF_LEN]; ++ char ipmi_add_sel[105]; ++ uint8_t sel_data[5]; ++ int seg, bus, dev, fn; + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
+@@ -129,15 +132,19 @@ int ras_aer_event_handler(struct trace_seq *s, + switch (severity_val) { + case HW_EVENT_AER_UNCORRECTED_NON_FATAL: + ev.error_type = "Uncorrected (Non-Fatal)"; ++ sel_data[0] = 0xca; + break; + case HW_EVENT_AER_UNCORRECTED_FATAL: + ev.error_type = "Uncorrected (Fatal)"; ++ sel_data[0] = 0xca; + break; + case HW_EVENT_AER_CORRECTED: + ev.error_type = "Corrected"; ++ sel_data[0] = 0xbf; + break; + default: + ev.error_type = "Unknown severity"; ++ sel_data[0] = 0xbf; + } + trace_seq_puts(s, ev.error_type); + +@@ -151,5 +158,29 @@ int ras_aer_event_handler(struct trace_seq *s, + ras_report_aer_event(ras, &ev); + #endif + ++#ifdef HAVE_AMP_NS_DECODE ++ /* ++ * Get PCIe AER error source seg/bus/dev/fn and save it into ++ * BMC OEM SEL, ipmitool raw 0x0a 0x44 is IPMI command-Add SEL ++ * entry, please refer IPMI specification chapter 31.6. 0xcd3a ++ * is manufacturer ID(ampere), byte 12 is sensor num(CE is 0xBF, ++ * UE is 0xCA), byte 13~14 is segment number, byte 15 is bus ++ * number, byte 16[7:3] is device number, byte 16[2:0] is ++ * function number ++ */ ++ sscanf(ev.dev_name, "%x:%x:%x.%x", &seg, &bus, &dev, &fn); ++ ++ sel_data[1] = seg & 0xff; ++ sel_data[2] = (seg & 0xff00) >> 8; ++ sel_data[3] = bus; ++ sel_data[4] = (((dev & 0x1f) << 3) | (fn & 0x7)); ++ ++ sprintf(ipmi_add_sel, ++ "ipmitool raw 0x0a 0x44 0x00 0x00 0xc0 0x00 0x00 0x00 0x00 0x3a 0xcd 0x00 0xc0 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x", ++ sel_data[0], sel_data[1], sel_data[2], sel_data[3], sel_data[4]); ++ ++ system(ipmi_add_sel); ++#endif ++ + return 0; + } diff --git a/7ccf12f5ae26a055926d175d908c7930293438c4.patch b/7ccf12f5ae26a055926d175d908c7930293438c4.patch new file mode 100644 index 0000000000000000000000000000000000000000..5a7a860ec3f654ebb980a963ca1e14c00b14e89b --- /dev/null +++ b/7ccf12f5ae26a055926d175d908c7930293438c4.patch @@ -0,0 +1,26 @@ +commit 7ccf12f5ae26a055926d175d908c7930293438c4 +Author: DmNosachev +Date: Fri Jul 23 17:28:33 2021 +0300 + + 
labels/supermicro: added Supermicro X11SCW + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 990fc9e..aea7c3c 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -133,4 +133,10 @@ Vendor: Supermicro + + Model: X11SCA, X11SCA-F + DIMMA1: 0.0.0, 0.1.0; DIMMA2: 0.2.0, 0.3.0; +- DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1; +\ No newline at end of file ++ DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1; ++ ++ Model: X11SCW-F ++ DIMMA1: 0.1.0; ++ DIMMA2: 0.0.0; ++ DIMMB1: 0.1.1; ++ DIMMB2: 0.0.1; +\ No newline at end of file diff --git a/854364ba44aee9bc5646f6537fc744b0b54aff37.patch b/854364ba44aee9bc5646f6537fc744b0b54aff37.patch deleted file mode 100644 index 91bad1b71b6810e2fb9052521cd3e2b1381f31ca..0000000000000000000000000000000000000000 --- a/854364ba44aee9bc5646f6537fc744b0b54aff37.patch +++ /dev/null @@ -1,38 +0,0 @@ -commit 854364ba44aee9bc5646f6537fc744b0b54aff37 -Author: Muralidhara M K -Date: Thu Aug 20 21:00:57 2020 +0530 - - rasdaemon: Add 8 channel decoding for SMCA systems - - Current Scalable Machine Check Architecture (SMCA) systems support up - to 8 UMC channels. - - To find the UMC channel represented by a bank, look at the 6th nibble - in the MCA_IPID[InstanceId] field. - - Signed-off-by: Muralidhara M K - [ Adjust commit message. 
] - Signed-off-by: Yazen Ghannam - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index d0b6cb6..7c619fd 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -438,15 +438,7 @@ static void amd_decode_errcode(struct mce_event *e) - */ - static int find_umc_channel(struct mce_event *e) - { -- uint32_t umc_instance_id[] = {0x50f00, 0x150f00}; -- uint32_t instance_id = EXTRACT(e->ipid, 0, 31); -- int i, channel = -1; -- -- for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++) -- if (umc_instance_id[i] == instance_id) -- channel = i; -- -- return channel; -+ return EXTRACT(e->ipid, 0, 31) >> 20; - } - /* Decode extended errors according to Scalable MCA specification */ - static void decode_smca_error(struct mce_event *e) diff --git a/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch b/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch deleted file mode 100644 index e3617fc215bad1b418060340c7e0975061e1e5fa..0000000000000000000000000000000000000000 --- a/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch +++ /dev/null @@ -1,207 +0,0 @@ -commit 8704a85d8dc3483423ec2934fee8132f85f8fdb6 -Author: Brian WoodsGhannam, Yazen -Date: Fri Nov 1 15:48:14 2019 +0100 - - rasdaemon: add support for new AMD SMCA bank types - - Going forward, the Scalable Machine Check Architecture (SMCA) has some - updated and additional bank types which show up in Zen2. The differing - bank types include: CS_V2, PSP_V2, SMU_V2, MP5, NBIO, and PCIE. The V2 - bank types replace the original bank types but have unique HWID/MCAtype - IDs from the originals so there's no conflicts between different - versions or other bank types. All of the differing bank types have new - MCE descriptions which have been added as well. 
- - CC: "mchehab+samsung@kernel.org" , "Namburu, Chandu-babu" # Thread-Topic: [PATCH 2/2] rasdaemon: add support for new AMD SMCA bank types - Signed-off-by: Brian Woods - Signed-off-by: Yazen Ghannam - Cc: Chandu-babu Namburu - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 6c3e8a5..114e786 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -49,11 +49,17 @@ enum smca_bank_types { - SMCA_FP, /* Floating Point */ - SMCA_L3_CACHE, /* L3 Cache */ - SMCA_CS, /* Coherent Slave */ -+ SMCA_CS_V2, /* Coherent Slave V2 */ - SMCA_PIE, /* Power, Interrupts, etc. */ - SMCA_UMC, /* Unified Memory Controller */ - SMCA_PB, /* Parameter Block */ - SMCA_PSP, /* Platform Security Processor */ -+ SMCA_PSP_V2, /* Platform Security Processor V2 */ - SMCA_SMU, /* System Management Unit */ -+ SMCA_SMU_V2, /* System Management Unit V2 */ -+ SMCA_MP5, /* Microprocessor 5 Unit */ -+ SMCA_NBIO, /* Northbridge IO Unit */ -+ SMCA_PCIE, /* PCI Express Unit */ - N_SMCA_BANK_TYPES - }; - -@@ -165,6 +171,23 @@ static const char * const smca_cs_mce_desc[] = { - "Atomic request parity", - "ECC error on probe filter access", - }; -+/* Coherent Slave Unit V2 */ -+static const char * const smca_cs2_mce_desc[] = { -+ "Illegal Request", -+ "Address Violation", -+ "Security Violation", -+ "Illegal Response", -+ "Unexpected Response", -+ "Request or Probe Parity Error", -+ "Read Response Parity Error", -+ "Atomic Request Parity Error", -+ "SDP read response had no match in the CS queue", -+ "Probe Filter Protocol Error", -+ "Probe Filter ECC Error", -+ "SDP read response had an unexpected RETRY error", -+ "Counter overflow error", -+ "Counter underflow error", -+}; - /* Power, Interrupt, etc.. 
*/ - static const char * const smca_pie_mce_desc[] = { - "HW assert", -@@ -189,10 +212,75 @@ static const char * const smca_pb_mce_desc[] = { - static const char * const smca_psp_mce_desc[] = { - "PSP RAM ECC or parity error", - }; -+/* Platform Security Processor V2 */ -+static const char * const smca_psp2_mce_desc[] = { -+ "High SRAM ECC or parity error", -+ "Low SRAM ECC or parity error", -+ "Instruction Cache Bank 0 ECC or parity error", -+ "Instruction Cache Bank 1 ECC or parity error", -+ "Instruction Tag Ram 0 parity error", -+ "Instruction Tag Ram 1 parity error", -+ "Data Cache Bank 0 ECC or parity error", -+ "Data Cache Bank 1 ECC or parity error", -+ "Data Cache Bank 2 ECC or parity error", -+ "Data Cache Bank 3 ECC or parity error", -+ "Data Tag Bank 0 parity error", -+ "Data Tag Bank 1 parity error", -+ "Data Tag Bank 2 parity error", -+ "Data Tag Bank 3 parity error", -+ "Dirty Data Ram parity error", -+ "TLB Bank 0 parity error", -+ "TLB Bank 1 parity error", -+ "System Hub Read Buffer ECC or parity error", -+}; - /* System Management Unit */ - static const char * const smca_smu_mce_desc[] = { - "SMU RAM ECC or parity error", - }; -+/* System Management Unit V2 */ -+static const char * const smca_smu2_mce_desc[] = { -+ "High SRAM ECC or parity error", -+ "Low SRAM ECC or parity error", -+ "Data Cache Bank A ECC or parity error", -+ "Data Cache Bank B ECC or parity error", -+ "Data Tag Cache Bank A ECC or parity error", -+ "Data Tag Cache Bank B ECC or parity error", -+ "Instruction Cache Bank A ECC or parity error", -+ "Instruction Cache Bank B ECC or parity error", -+ "Instruction Tag Cache Bank A ECC or parity error", -+ "Instruction Tag Cache Bank B ECC or parity error", -+ "System Hub Read Buffer ECC or parity error", -+}; -+/* Microprocessor 5 Unit */ -+static const char * const smca_mp5_mce_desc[] = { -+ "High SRAM ECC or parity error", -+ "Low SRAM ECC or parity error", -+ "Data Cache Bank A ECC or parity error", -+ "Data Cache Bank B ECC or 
parity error", -+ "Data Tag Cache Bank A ECC or parity error", -+ "Data Tag Cache Bank B ECC or parity error", -+ "Instruction Cache Bank A ECC or parity error", -+ "Instruction Cache Bank B ECC or parity error", -+ "Instruction Tag Cache Bank A ECC or parity error", -+ "Instruction Tag Cache Bank B ECC or parity error", -+}; -+/* Northbridge IO Unit */ -+static const char * const smca_nbio_mce_desc[] = { -+ "ECC or Parity error", -+ "PCIE error", -+ "SDP ErrEvent error", -+ "SDP Egress Poison Error", -+ "IOHC Internal Poison Error", -+}; -+/* PCI Express Unit */ -+static const char * const smca_pcie_mce_desc[] = { -+ "CCIX PER Message logging", -+ "CCIX Read Response with Status: Non-Data Error", -+ "CCIX Write Response with Status: Non-Data Error", -+ "CCIX Read Response with Status: Data Error", -+ "CCIX Non-okay write response with data error", -+}; -+ - - struct smca_mce_desc { - const char * const *descs; -@@ -208,11 +296,17 @@ static struct smca_mce_desc smca_mce_descs[] = { - [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, - [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, - [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, -+ [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, - [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, - [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, - [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, - [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, -+ [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)}, - [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, -+ [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)}, -+ [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, -+ [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)}, -+ [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)}, - }; - - struct 
smca_hwid { -@@ -235,6 +329,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* Data Fabric MCA types */ - { SMCA_CS, 0x0000002E }, -+ { SMCA_CS_V2, 0x0002002E }, - { SMCA_PIE, 0x0001002E }, - - /* Unified Memory Controller MCA type */ -@@ -245,9 +340,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* Platform Security Processor MCA type */ - { SMCA_PSP, 0x000000FF }, -+ { SMCA_PSP_V2, 0x000100FF }, - - /* System Management Unit MCA type */ - { SMCA_SMU, 0x00000001 }, -+ { SMCA_SMU_V2, 0x00010001 }, -+ -+ /* Microprocessor 5 Unit MCA type */ -+ { SMCA_MP5, 0x00020001 }, -+ -+ /* Northbridge IO Unit MCA type */ -+ { SMCA_NBIO, 0x00000018 }, -+ -+ /* PCI Express Unit MCA type */ -+ { SMCA_PCIE, 0x00000046 }, - }; - - struct smca_bank_name { -@@ -264,11 +370,17 @@ static struct smca_bank_name smca_names[] = { - [SMCA_FP] = { "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "L3 Cache" }, - [SMCA_CS] = { "Coherent Slave" }, -+ [SMCA_CS_V2] = { "Coherent Slave" }, - [SMCA_PIE] = { "Power, Interrupts, etc." 
}, - [SMCA_UMC] = { "Unified Memory Controller" }, - [SMCA_PB] = { "Parameter Block" }, - [SMCA_PSP] = { "Platform Security Processor" }, -+ [SMCA_PSP_V2] = { "Platform Security Processor" }, - [SMCA_SMU] = { "System Management Unit" }, -+ [SMCA_SMU_V2] = { "System Management Unit" }, -+ [SMCA_MP5] = { "Microprocessor 5 Unit" }, -+ [SMCA_NBIO] = { "Northbridge IO Unit" }, -+ [SMCA_PCIE] = { "PCI Express Unit" }, - }; - - static void amd_decode_errcode(struct mce_event *e) diff --git a/9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch b/9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch new file mode 100644 index 0000000000000000000000000000000000000000..5267fc8d11c9eec9e3f4a3ebdc76e8c3861475cf --- /dev/null +++ b/9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch @@ -0,0 +1,51 @@ +commit 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b +Author: Muralidhara M K +Date: Tue Jul 27 06:36:45 2021 -0500 + + rasdaemon: ras-mc-ctl: Fix script to parse dimm sizes + + Removes trailing spaces at the end of a line from + file location and fixes --layout option to parse dimm nodes + to get the size of each dimm from ras-mc-ctl. + + Issue is reported https://github.com/mchehab/rasdaemon/issues/43 + Where '> ras-mc-ctl --layout' reports all 0s + + With this change the layout option prints the correct dimm sizes + > sudo ras-mc-ctl --layout + +-----------------------------------------------+ + | mc0 | + | csrow0 | csrow1 | csrow2 | csrow3 | + ----------+-----------------------------------------------+ + ... + channel7: | 16384 MB | 0 MB | 0 MB | 0 MB | + channel6: | 16384 MB | 0 MB | 0 MB | 0 MB | + ... 
+ ----------+-----------------------------------------------+ + + Signed-off-by: Muralidhara M K + Signed-off-by: Naveen Krishna Chatradhi + Cc: Yazen Ghannam + Signed-off-by: Mauro Carvalho Chehab + Link: https://lkml.kernel.org/r/20210810183855.129076-1-nchatrad@amd.com/ + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 1e3aeb7..b22dd60 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -246,6 +246,7 @@ sub parse_dimm_nodes + if (($file =~ /max_location$/)) { + open IN, $file; + my $location = <IN>; ++ $location =~ s/\s+$//; + close IN; + my @temp = split(/ /, $location); + +@@ -288,6 +289,7 @@ sub parse_dimm_nodes + + open IN, $file; + my $location = <IN>; ++ $location =~ s/\s+$//; + close IN; + + my @pos; diff --git a/9a5baed97b21af31064d9995ffcfaac0e9d7983e.patch b/9a5baed97b21af31064d9995ffcfaac0e9d7983e.patch new file mode 100644 index 0000000000000000000000000000000000000000..1a221ea0a362ba29dd14fded727c6a9e52d6dab1 --- /dev/null +++ b/9a5baed97b21af31064d9995ffcfaac0e9d7983e.patch @@ -0,0 +1,40 @@ +commit 9a5baed97b21af31064d9995ffcfaac0e9d7983e +Author: DmNosachev +Date: Tue Jun 29 13:37:48 2021 +0300 + + labels/supermicro: supermicro db syntax + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index bfaed93..47ea05f 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -18,17 +18,17 @@ Vendor: Supermicro + DIMMA1: 0.0.0; DIMMA2: 0.0.1; + DIMMB1: 0.1.0; DIMMB2: 0.1.1; + +- Product: X10SRA-F +- DIMMA1: 0.0.0 +- DIMMA2: 0.0.1 +- DIMMB1: 0.1.0 +- DIMMB2: 0.1.1 +- DIMMC1: 1.0.0 +- DIMMC2: 1.0.1 +- DIMMD1: 1.1.0 +- DIMMD2: 1.1.1 ++ Model: X10SRA-F ++ DIMMA1: 0.0.0; ++ DIMMA2: 0.0.1; ++ DIMMB1: 0.1.0; ++ DIMMB2: 0.1.1; ++ DIMMC1: 1.0.0; ++ DIMMC2: 1.0.1; ++ DIMMD1: 1.1.0; ++ DIMMD2: 1.1.1; + +- Product: H8DGU ++ Model: H8DGU + P1_DIMM1A: 0.2.0; + P1_DIMM1A: 0.3.0; + P2_DIMM1A: 3.2.0; diff --git a/a16ca0711001957ee98f2c124abce0fa1f801529.patch b/a16ca0711001957ee98f2c124abce0fa1f801529.patch 
deleted file mode 100644 index 3a962638a9af22e1879f49be93c15a00242358a0..0000000000000000000000000000000000000000 --- a/a16ca0711001957ee98f2c124abce0fa1f801529.patch +++ /dev/null @@ -1,670 +0,0 @@ -commit a16ca0711001957ee98f2c124abce0fa1f801529 -Author: Chandu-babu Namburu -Date: Wed Jan 30 20:36:45 2019 +0530 - - rasdaemon: add support for AMD Scalable MCA - - Add logic here to decode errors from all known IP blocks for - AMD Scalable MCA supported processors - - Reviewed-by: Yazen Ghannam - Signed-off-by: Chandu-babu Namburu - ---- - mce-amd-smca.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ - mce-amd.c | 122 +++++++++++++++++ - ras-mce-handler.c | 24 +++ - ras-mce-handler.h | 15 ++ - 4 files changed, 530 insertions(+), 2 deletions(-) - ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/mce-amd-smca.c 2019-07-12 11:35:04.836470461 -0400 -@@ -0,0 +1,371 @@ -+/* -+ * Copyright (c) 2018, AMD, Inc. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 and -+ * only version 2 as published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ */ -+ -+#include -+#include -+ -+#include "ras-mce-handler.h" -+#include "bitfield.h" -+ -+/* MCA_STATUS REGISTER FOR FAMILY 17H -+ *********************** Higher 32-bits ***************************** -+ * 63: VALIDERROR, 62: OVERFLOW, 61: UC, 60: Err ENABLE, -+ * 59: Misc Valid, 58: Addr Valid, 57: PCC, 56: ErrCoreID Valid, -+ * 55: TCC, 54: RES, 53: Syndrom Valid, 52: Transparanet, -+ * 51: RES, 50: RES, 49: RES, 48: RES, -+ * 47: RES, 46: CECC, 45: UECC, 44: Deferred, -+ * 43: Poison, 42: RES, 41: RES, 40: RES, -+ * 39: RES, 38: RES, 37: ErrCoreID[5], 36: ErrCoreID[4], -+ * 35: ErrCoreID[3], 34: ErrCoreID[2] 33: ErrCoreID[1] 32: ErrCoreID[0] -+ *********************** Lower 32-bits ****************************** -+ * 31: RES, 30: RES, 29: RES, 28: RES, -+ * 27: RES, 26: RES, 25: RES, 24: RES -+ * 23: RES, 22: RES, 21: XEC[5], 20: XEC[4], -+ * 19: XEC[3], 18: XEC[2], 17: XEC[1], 16: XEC[0] -+ * 15: EC[15], 14: EC[14], 13: EC[13], 12: EC[12], -+ * 11: EC[11], 10: EC[10], 09: EC[9], 08: EC[8], -+ * 07: EC[7], 06: EC[6], 05: EC[5], 04: EC[4], -+ * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0] -+ */ -+ -+/* These may be used by multiple smca_hwid_mcatypes */ -+enum smca_bank_types { -+ SMCA_LS = 0, /* Load Store */ -+ SMCA_IF, /* Instruction Fetch */ -+ SMCA_L2_CACHE, /* L2 Cache */ -+ SMCA_DE, /* Decoder Unit */ -+ SMCA_RESERVED, /* Reserved */ -+ SMCA_EX, /* Execution Unit */ -+ SMCA_FP, /* Floating Point */ -+ SMCA_L3_CACHE, /* L3 Cache */ -+ SMCA_CS, /* Coherent Slave */ -+ SMCA_PIE, /* Power, Interrupts, etc. 
*/ -+ SMCA_UMC, /* Unified Memory Controller */ -+ SMCA_PB, /* Parameter Block */ -+ SMCA_PSP, /* Platform Security Processor */ -+ SMCA_SMU, /* System Management Unit */ -+ N_SMCA_BANK_TYPES -+}; -+ -+/* SMCA Extended error strings */ -+/* Load Store */ -+static const char * const smca_ls_mce_desc[] = { -+ "Load queue parity", -+ "Store queue parity", -+ "Miss address buffer payload parity", -+ "L1 TLB parity", -+ "Reserved", -+ "DC tag error type 6", -+ "DC tag error type 1", -+ "Internal error type 1", -+ "Internal error type 2", -+ "Sys Read data error thread 0", -+ "Sys read data error thread 1", -+ "DC tag error type 2", -+ "DC data error type 1 (poison consumption)", -+ "DC data error type 2", -+ "DC data error type 3", -+ "DC tag error type 4", -+ "L2 TLB parity", -+ "PDC parity error", -+ "DC tag error type 3", -+ "DC tag error type 5", -+ "L2 fill data error", -+}; -+/* Instruction Fetch */ -+static const char * const smca_if_mce_desc[] = { -+ "microtag probe port parity error", -+ "IC microtag or full tag multi-hit error", -+ "IC full tag parity", -+ "IC data array parity", -+ "Decoupling queue phys addr parity error", -+ "L0 ITLB parity error", -+ "L1 ITLB parity error", -+ "L2 ITLB parity error", -+ "BPQ snoop parity on Thread 0", -+ "BPQ snoop parity on Thread 1", -+ "L1 BTB multi-match error", -+ "L2 BTB multi-match error", -+ "L2 Cache Response Poison error", -+ "System Read Data error", -+}; -+/* L2 Cache */ -+static const char * const smca_l2_mce_desc[] = { -+ "L2M tag multi-way-hit error", -+ "L2M tag ECC error", -+ "L2M data ECC error", -+ "HW assert", -+}; -+/* Decoder Unit */ -+static const char * const smca_de_mce_desc[] = { -+ "uop cache tag parity error", -+ "uop cache data parity error", -+ "Insn buffer parity error", -+ "uop queue parity error", -+ "Insn dispatch queue parity error", -+ "Fetch address FIFO parity", -+ "Patch RAM data parity", -+ "Patch RAM sequencer parity", -+ "uop buffer parity" -+}; -+/* Execution Unit */ -+static 
const char * const smca_ex_mce_desc[] = { -+ "Watchdog timeout error", -+ "Phy register file parity", -+ "Flag register file parity", -+ "Immediate displacement register file parity", -+ "Address generator payload parity", -+ "EX payload parity", -+ "Checkpoint queue parity", -+ "Retire dispatch queue parity", -+ "Retire status queue parity error", -+ "Scheduling queue parity error", -+ "Branch buffer queue parity error", -+}; -+/* Floating Point Unit */ -+static const char * const smca_fp_mce_desc[] = { -+ "Physical register file parity", -+ "Freelist parity error", -+ "Schedule queue parity", -+ "NSQ parity error", -+ "Retire queue parity", -+ "Status register file parity", -+ "Hardware assertion", -+}; -+/* L3 Cache */ -+static const char * const smca_l3_mce_desc[] = { -+ "Shadow tag macro ECC error", -+ "Shadow tag macro multi-way-hit error", -+ "L3M tag ECC error", -+ "L3M tag multi-way-hit error", -+ "L3M data ECC error", -+ "XI parity, L3 fill done channel error", -+ "L3 victim queue parity", -+ "L3 HW assert", -+}; -+/* Coherent Slave Unit */ -+static const char * const smca_cs_mce_desc[] = { -+ "Illegal request from transport layer", -+ "Address violation", -+ "Security violation", -+ "Illegal response from transport layer", -+ "Unexpected response", -+ "Parity error on incoming request or probe response data", -+ "Parity error on incoming read response data", -+ "Atomic request parity", -+ "ECC error on probe filter access", -+}; -+/* Power, Interrupt, etc.. 
*/ -+static const char * const smca_pie_mce_desc[] = { -+ "HW assert", -+ "Internal PIE register security violation", -+ "Error on GMI link", -+ "Poison data written to internal PIE register", -+}; -+/* Unified Memory Controller */ -+static const char * const smca_umc_mce_desc[] = { -+ "DRAM ECC error", -+ "Data poison error on DRAM", -+ "SDP parity error", -+ "Advanced peripheral bus error", -+ "Command/address parity error", -+ "Write data CRC error", -+}; -+/* Parameter Block */ -+static const char * const smca_pb_mce_desc[] = { -+ "Parameter Block RAM ECC error", -+}; -+/* Platform Security Processor */ -+static const char * const smca_psp_mce_desc[] = { -+ "PSP RAM ECC or parity error", -+}; -+/* System Management Unit */ -+static const char * const smca_smu_mce_desc[] = { -+ "SMU RAM ECC or parity error", -+}; -+ -+struct smca_mce_desc { -+ const char * const *descs; -+ unsigned int num_descs; -+}; -+ -+static struct smca_mce_desc smca_mce_descs[] = { -+ [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, -+ [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, -+ [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, -+ [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, -+ [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, -+ [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, -+ [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, -+ [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, -+ [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, -+ [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, -+ [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, -+ [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, -+ [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, -+}; -+ -+struct smca_hwid { -+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/ -+ uint32_t mcatype_hwid; 
/* mcatype,hwid bit 63-32 in MCx_IPID Register*/ -+}; -+ -+static struct smca_hwid smca_hwid_mcatypes[] = { -+ /* { bank_type, mcatype_hwid } */ -+ -+ /* ZN Core (HWID=0xB0) MCA types */ -+ { SMCA_LS, 0x000000B0 }, -+ { SMCA_IF, 0x000100B0 }, -+ { SMCA_L2_CACHE, 0x000200B0 }, -+ { SMCA_DE, 0x000300B0 }, -+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */ -+ { SMCA_EX, 0x000500B0 }, -+ { SMCA_FP, 0x000600B0 }, -+ { SMCA_L3_CACHE, 0x000700B0 }, -+ -+ /* Data Fabric MCA types */ -+ { SMCA_CS, 0x0000002E }, -+ { SMCA_PIE, 0x0001002E }, -+ -+ /* Unified Memory Controller MCA type */ -+ { SMCA_UMC, 0x00000096 }, -+ -+ /* Parameter Block MCA type */ -+ { SMCA_PB, 0x00000005 }, -+ -+ /* Platform Security Processor MCA type */ -+ { SMCA_PSP, 0x000000FF }, -+ -+ /* System Management Unit MCA type */ -+ { SMCA_SMU, 0x00000001 }, -+}; -+ -+struct smca_bank_name { -+ const char *name; -+}; -+ -+static struct smca_bank_name smca_names[] = { -+ [SMCA_LS] = { "Load Store Unit" }, -+ [SMCA_IF] = { "Instruction Fetch Unit" }, -+ [SMCA_L2_CACHE] = { "L2 Cache" }, -+ [SMCA_DE] = { "Decode Unit" }, -+ [SMCA_RESERVED] = { "Reserved" }, -+ [SMCA_EX] = { "Execution Unit" }, -+ [SMCA_FP] = { "Floating Point Unit" }, -+ [SMCA_L3_CACHE] = { "L3 Cache" }, -+ [SMCA_CS] = { "Coherent Slave" }, -+ [SMCA_PIE] = { "Power, Interrupts, etc." }, -+ [SMCA_UMC] = { "Unified Memory Controller" }, -+ [SMCA_PB] = { "Parameter Block" }, -+ [SMCA_PSP] = { "Platform Security Processor" }, -+ [SMCA_SMU] = { "System Management Unit" }, -+}; -+ -+static void amd_decode_errcode(struct mce_event *e) -+{ -+ -+ decode_amd_errcode(e); -+ -+ if (e->status & MCI_STATUS_POISON) -+ mce_snprintf(e->mcistatus_msg, "Poison consumed"); -+ -+ if (e->status & MCI_STATUS_TCC) -+ mce_snprintf(e->mcistatus_msg, "Task_context_corrupt"); -+ -+} -+/* -+ * To find the UMC channel represented by this bank we need to match on its -+ * instance_id. The instance_id of a bank is held in the lower 32 bits of its -+ * IPID. 
-+ */ -+static int find_umc_channel(struct mce_event *e) -+{ -+ uint32_t umc_instance_id[] = {0x50f00, 0x150f00}; -+ uint32_t instance_id = EXTRACT(e->ipid, 0, 31); -+ int i, channel = -1; -+ -+ for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++) -+ if (umc_instance_id[i] == instance_id) -+ channel = i; -+ -+ return channel; -+} -+/* Decode extended errors according to Scalable MCA specification */ -+static void decode_smca_error(struct mce_event *e) -+{ -+ enum smca_bank_types bank_type; -+ const char *ip_name; -+ unsigned short xec = (e->status >> 16) & 0x3f; -+ const struct smca_hwid *s_hwid; -+ uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63); -+ unsigned int csrow = -1, channel = -1; -+ unsigned int i; -+ -+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { -+ s_hwid = &smca_hwid_mcatypes[i]; -+ if (mcatype_hwid == s_hwid->mcatype_hwid) { -+ bank_type = s_hwid->bank_type; -+ break; -+ } -+ } -+ -+ if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) { -+ strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID"); -+ return; -+ } -+ -+ if (bank_type >= N_SMCA_BANK_TYPES) { -+ strcpy(e->mcastatus_msg, "Don't know how to decode this bank"); -+ return; -+ } -+ -+ if (bank_type == SMCA_RESERVED) { -+ strcpy(e->mcastatus_msg, "Bank 4 is reserved.\n"); -+ return; -+ } -+ -+ ip_name = smca_names[bank_type].name; -+ -+ mce_snprintf(e->bank_name, "%s (bank=%d)", ip_name, e->bank); -+ -+ /* Only print the descriptor of valid extended error code */ -+ if (xec < smca_mce_descs[bank_type].num_descs) -+ mce_snprintf(e->mcastatus_msg, -+ " %s.\n", smca_mce_descs[bank_type].descs[xec]); -+ -+ if (bank_type == SMCA_UMC && xec == 0) { -+ channel = find_umc_channel(e); -+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */ -+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", -+ channel, csrow); -+ } -+} -+ -+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) -+{ -+ uint64_t mcgstatus = e->mcgstatus; -+ -+ mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld", -+ (long 
long)e->mcgstatus); -+ -+ if (mcgstatus & MCG_STATUS_RIPV) -+ mce_snprintf(e->mcgstatus_msg, "RIPV"); -+ if (mcgstatus & MCG_STATUS_EIPV) -+ mce_snprintf(e->mcgstatus_msg, "EIPV"); -+ if (mcgstatus & MCG_STATUS_MCIP) -+ mce_snprintf(e->mcgstatus_msg, "MCIP"); -+ -+ decode_smca_error(e); -+ amd_decode_errcode(e); -+ return 0; -+} ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/mce-amd.c 2019-07-12 11:35:04.836470461 -0400 -@@ -0,0 +1,122 @@ -+/* -+ * Copyright (c) 2018, The AMD, Inc. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 and -+ * only version 2 as published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#include -+#include -+ -+#include "ras-mce-handler.h" -+ -+/* Error Code Types */ -+#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) -+#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) -+#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) -+#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400) -+ -+/* Error code: transaction type (TT) */ -+static char *transaction[] = { -+ "instruction", "data", "generic", "reserved" -+}; -+/* Error codes: cache level (LL) */ -+static char *cachelevel[] = { -+ "reserved", "L1", "L2", "L3/generic" -+}; -+/* Error codes: memory transaction type (RRRR) */ -+static char *memtrans[] = { -+ "generic", "generic read", "generic write", "data read", -+ "data write", "instruction fetch", "prefetch", "evict", "snoop", -+ "?", "?", "?", "?", "?", "?", "?" 
-+}; -+/* Participation Processor */ -+static char *partproc[] = { -+ "local node origin", "local node response", -+ "local node observed", "generic participation" -+}; -+/* Timeout */ -+static char *timeout[] = { -+ "request didn't time out", -+ "request timed out" -+}; -+/* internal unclassified error code */ -+static char *internal[] = { "reserved", -+ "reserved", -+ "hardware assert", -+ "reserved" }; -+ -+#define TT(x) (((x) >> 2) & 0x3) /*bit 2, bit 3*/ -+#define TT_MSG(x) transaction[TT(x)] -+#define LL(x) ((x) & 0x3) /*bit 0, bit 1*/ -+#define LL_MSG(x) cachelevel[LL(x)] -+ -+#define R4(x) (((x) >> 4) & 0xF) /*bit 4, bit 5, bit 6, bit 7 */ -+#define R4_MSG(x) ((R4(x) < 9) ? memtrans[R4(x)] : "Wrong R4!") -+ -+#define TO(x) (((x) >> 8) & 0x1) /*bit 8*/ -+#define TO_MSG(x) timeout[TO(x)] -+#define PP(x) (((x) >> 9) & 0x3) /*bit 9, bit 10*/ -+#define PP_MSG(x) partproc[PP(x)] -+ -+#define UU(x) (((x) >> 8) & 0x3) /*bit 8, bit 9*/ -+#define UU_MSG(x) internal[UU(x)] -+ -+void decode_amd_errcode(struct mce_event *e) -+{ -+ uint16_t ec = e->status & 0xffff; -+ uint16_t ecc = (e->status >> 45) & 0x3; -+ -+ if (e->status & MCI_STATUS_UC) { -+ if (e->status & MCI_STATUS_PCC) -+ strcpy(e->error_msg, "System Fatal error."); -+ if (e->mcgstatus & MCG_STATUS_RIPV) -+ strcpy(e->error_msg, -+ "Uncorrected, software restartable error."); -+ strcpy(e->error_msg, -+ "Uncorrected, software containable error."); -+ } else if (e->status & MCI_STATUS_DEFERRED) -+ strcpy(e->error_msg, "Deferred error, no action required."); -+ else -+ strcpy(e->error_msg, "Corrected error, no action required."); -+ -+ if (!(e->status & MCI_STATUS_VAL)) -+ mce_snprintf(e->mcistatus_msg, "MCE_INVALID"); -+ -+ if (e->status & MCI_STATUS_OVER) -+ mce_snprintf(e->mcistatus_msg, "Error_overflow"); -+ -+ if (e->status & MCI_STATUS_PCC) -+ mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt"); -+ -+ if (ecc) -+ mce_snprintf(e->mcistatus_msg, -+ "%sECC", ((ecc == 2) ? 
"C" : "U")); -+ -+ if (INT_ERROR(ec)) { -+ mce_snprintf(e->mcastatus_msg, "Internal '%s'", UU_MSG(ec)); -+ return; -+ } -+ -+ if (TLB_ERROR(ec)) -+ mce_snprintf(e->mcastatus_msg, -+ "TLB Error 'tx: %s, level: %s'", -+ TT_MSG(ec), LL_MSG(ec)); -+ else if (MEM_ERROR(ec)) -+ mce_snprintf(e->mcastatus_msg, -+ "Memory Error 'mem-tx: %s, tx: %s, level: %s'", -+ R4_MSG(ec), TT_MSG(ec), LL_MSG(ec)); -+ else if (BUS_ERROR(ec)) -+ mce_snprintf(e->mcastatus_msg, -+ "Bus Error '%s, %s, mem-tx: %s, level: %s'", -+ PP_MSG(ec), TO_MSG(ec), -+ R4_MSG(ec), LL_MSG(ec)); -+ return; -+ -+} ---- rasdaemon-0.6.1.orig/ras-mce-handler.c 2019-07-12 11:35:01.585502811 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.c 2019-07-12 11:35:04.836470461 -0400 -@@ -55,6 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series - [CPU_KNIGHTS_LANDING] = "Knights Landing", - [CPU_KNIGHTS_MILL] = "Knights Mill", - [CPU_SKYLAKE_XEON] = "Skylake server", -+ [CPU_NAPLES] = "AMD Family 17h Zen1" - }; - - static enum cputype select_intel_cputype(struct ras_events *ras) -@@ -190,9 +191,12 @@ ret = 0; - if (!strcmp(mce->vendor, "AuthenticAMD")) { - if (mce->family == 15) - mce->cputype = CPU_K8; -- if (mce->family > 15) { -+ if (mce->family == 23) -+ mce->cputype = CPU_NAPLES; -+ if (mce->family > 23) { - log(ALL, LOG_INFO, -- "Can't parse MCE for this AMD CPU yet\n"); -+ "Can't parse MCE for this AMD CPU yet %d\n", -+ mce->family); - ret = EINVAL; - } - goto ret; -@@ -331,6 +335,12 @@ #if 0 - if (e->status & MCI_STATUS_ADDRV) - trace_seq_printf(s, ", addr= %llx", (long long)e->addr); - -+ if (e->status & MCI_STATUS_SYNDV) -+ trace_seq_printf(s, ", synd= %llx", (long long)e->synd); -+ -+ if (e->ipid) -+ trace_seq_printf(s, ", ipid= %llx", (long long)e->ipid); -+ - if (e->mcgstatus_msg) - trace_seq_printf(s, ", %s", e->mcgstatus_msg); - else -@@ -411,6 +421,13 @@ if (pevent_get_field_val(s, event, "bank - if (pevent_get_field_val(s, event, "cpuvendor", record, &val, 1) < 0) - return -1; - e.cpuvendor = val; -+ /* Get New 
entries */ -+ if (pevent_get_field_val(s, event, "synd", record, &val, 1) < 0) -+ return -1; -+ e.synd = val; -+ if (pevent_get_field_val(s, event, "ipid", record, &val, 1) < 0) -+ return -1; -+ e.ipid = val; - - switch (mce->cputype) { - case CPU_GENERIC: -@@ -418,6 +435,9 @@ if (pevent_get_field_val(s, event, "cpuv - case CPU_K8: - rc = parse_amd_k8_event(ras, &e); - break; -+ case CPU_NAPLES: -+ rc = parse_amd_smca_event(ras, &e); -+ break; - default: /* All other CPU types are Intel */ - rc = parse_intel_event(ras, &e); - } ---- rasdaemon-0.6.1.orig/ras-mce-handler.h 2019-07-12 11:35:01.585502811 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.h 2019-07-12 11:35:04.836470461 -0400 -@@ -50,6 +50,7 @@ enum cputype { - CPU_KNIGHTS_LANDING, - CPU_KNIGHTS_MILL, - CPU_SKYLAKE_XEON, -+ CPU_NAPLES, - }; - - struct mce_event { -@@ -69,6 +70,8 @@ struct mce_event { - uint8_t cs; - uint8_t bank; - uint8_t cpuvendor; -+ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ -+ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ - - /* Parsed data */ - char timestamp[64]; -@@ -129,6 +132,9 @@ void broadwell_de_decode_model(struct ra - void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e); - void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e); - -+/* AMD error code decode function */ -+void decode_amd_errcode(struct mce_event *e); -+ - /* Software defined banks */ - #define MCE_EXTENDED_BANK 128 - -@@ -144,6 +150,13 @@ #define MCI_STATUS_EN (1ULL<<60) /* - #define MCI_STATUS_S (1ULL<<56) /* signalled */ - #define MCI_STATUS_AR (1ULL<<55) /* action-required */ - -+/* AMD-specific bits */ -+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ -+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. 
valid */ -+/* uncorrected error,deferred exception */ -+#define MCI_STATUS_DEFERRED (1ULL<<44) -+#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ -+ - #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ - #define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */ - #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ -@@ -154,4 +167,6 @@ int parse_intel_event(struct ras_events - - int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e); - -+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e); -+ - #endif ---- rasdaemon-0.6.1.orig/Makefile.in 2018-04-25 06:29:05.000000000 -0400 -+++ rasdaemon-0.6.1/Makefile.in 2019-07-15 14:41:22.308278851 -0400 -@@ -100,7 +100,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT) - @WITH_MCE_TRUE@ mce-intel-dunnington.c mce-intel-tulsa.c \ - @WITH_MCE_TRUE@ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ - @WITH_MCE_TRUE@ mce-intel-knl.c mce-intel-broadwell-de.c \ --@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c -+@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c mce-amd.c mce-amd-smca.c - - @WITH_EXTLOG_TRUE@am__append_6 = ras-extlog-handler.c - @WITH_ABRT_REPORT_TRUE@am__append_7 = ras-report.c -@@ -132,7 +132,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c - mce-intel-ivb.c mce-intel-haswell.c mce-intel-knl.c \ - mce-intel-broadwell-de.c mce-intel-broadwell-epex.c \ - mce-intel-skylake-xeon.c ras-extlog-handler.c ras-report.c \ -- non-standard-hisi_hip07.c -+ non-standard-hisi_hip07.c mce-amd-smca.c mce-amd.c - @WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT) - @WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT) - @WITH_NON_STANDARD_TRUE@am__objects_3 = \ -@@ -149,7 +149,9 @@ non-standard-hisi_hip07.c - @WITH_MCE_TRUE@ mce-intel-knl.$(OBJEXT) \ - @WITH_MCE_TRUE@ mce-intel-broadwell-de.$(OBJEXT) \ - @WITH_MCE_TRUE@ mce-intel-broadwell-epex.$(OBJEXT) \ --@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) 
-+@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) \ -+@WITH_MCE_TRUE@ mce-amd-smca.$(OBJEXT) \ -+@WITH_MCE_TRUE@ mce-amd.$(OBJEXT) - @WITH_EXTLOG_TRUE@am__objects_6 = ras-extlog-handler.$(OBJEXT) - @WITH_ABRT_REPORT_TRUE@am__objects_7 = ras-report.$(OBJEXT) - @WITH_HISI_NS_DECODE_TRUE@am__objects_8 = \ -@@ -595,6 +597,8 @@ distclean-compile: - - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitfield.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-k8.Po@am__quote@ -+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd.Po@am__quote@ -+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-scma.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-de.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-epex.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-dunnington.Po@am__quote@ diff --git a/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch b/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch deleted file mode 100644 index 38657d4d78da230834b9ddc6b97fc052eeebfee6..0000000000000000000000000000000000000000 --- a/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch +++ /dev/null @@ -1,138 +0,0 @@ -commit a8c776ed94f68ae31d7b5f74e19545698898c13c -Author: Mauro Carvalho Chehab -Date: Tue Aug 14 13:06:27 2018 -0300 - - mce-intel-*: fix a warning when using FIELD(, NULL) - - Internally, FIELD() macro checks the size of an array, by - using ARRAY_SIZE. 
Well, this macro causes a division by zero - if NULL is used, as its type is void, as warned: - - mce-intel-dunnington.c:30:2: note: in expansion of macro ‘FIELD’ - FIELD(17, NULL), - ^~~~~ - ras-mce-handler.h:28:33: warning: division ‘sizeof (void *) / sizeof (void)’ does not compute the number of array elements [-Wsizeof-pointer-div] - #define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) - ^ - bitfield.h:37:51: note: in expansion of macro ‘ARRAY_SIZE’ - #define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) } - ^~~~~~~~~~ - - While this warning is harmless, it may prevent seeing more serios - warnings. So, add a FIELD_NULL() macro to avoid that. - - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/bitfield.h b/bitfield.h -index c7dfeb1..fccbb36 100644 ---- a/bitfield.h -+++ b/bitfield.h -@@ -35,6 +35,7 @@ struct numfield { - }; - - #define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) } -+#define FIELD_NULL(start_bit) { start_bit, NULL, 0 } - #define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 } - - #define NUMBER(start, end, name) { start, end, name, "%Lu", 0 } -diff --git a/mce-intel-dunnington.c b/mce-intel-dunnington.c -index 4b1c7e3..c695c62 100644 ---- a/mce-intel-dunnington.c -+++ b/mce-intel-dunnington.c -@@ -27,14 +27,14 @@ - - static struct field dunnington_bus_status[] = { - SBITFIELD(16, "Parity error detected during FSB request phase"), -- FIELD(17, NULL), -+ FIELD_NULL(17), - SBITFIELD(20, "Hard Failure response received for a local transaction"), - SBITFIELD(21, "Parity error on FSB response field detected"), - SBITFIELD(22, "Parity data error on inbound data detected"), -- FIELD(23, NULL), -- FIELD(25, NULL), -- FIELD(28, NULL), -- FIELD(31, NULL), -+ FIELD_NULL(23), -+ FIELD_NULL(25), -+ FIELD_NULL(28), -+ FIELD_NULL(31), - {} - }; - -diff --git a/mce-intel-p4-p6.c b/mce-intel-p4-p6.c -index 4615e1a..5c6c3ff 100644 ---- a/mce-intel-p4-p6.c -+++ b/mce-intel-p4-p6.c -@@ -60,7 +60,7 @@ static 
char *bus_queue_error_type[] = { - }; - - static struct field p6_shared_status[] = { -- FIELD(16, NULL), -+ FIELD_NULL(16), - FIELD(19, bus_queue_req_type), - FIELD(25, bus_queue_error_type), - FIELD(25, bus_queue_error_type), -@@ -68,7 +68,7 @@ static struct field p6_shared_status[] = { - SBITFIELD(36, "received parity error on response transaction"), - SBITFIELD(38, "timeout BINIT (ROB timeout)." - " No micro-instruction retired for some time"), -- FIELD(39, NULL), -+ FIELD_NULL(39), - SBITFIELD(42, "bus transaction received hard error response"), - SBITFIELD(43, "failure that caused IERR"), - /* The following are reserved for Core in the SDM. Let's keep them here anyways*/ -@@ -76,15 +76,15 @@ static struct field p6_shared_status[] = { - SBITFIELD(45, "uncorrectable ECC error"), - SBITFIELD(46, "correctable ECC error"), - /* [47..54]: ECC syndrome */ -- FIELD(55, NULL), -+ FIELD_NULL(55), - {}, - }; - - static struct field p6old_status[] = { - SBITFIELD(28, "FRC error"), - SBITFIELD(29, "BERR on this CPU"), -- FIELD(31, NULL), -- FIELD(32, NULL), -+ FIELD_NULL(31), -+ FIELD_NULL(32), - SBITFIELD(35, "BINIT received from external bus"), - SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"), - {} -@@ -94,9 +94,9 @@ static struct field core2_status[] = { - SBITFIELD(28, "MCE driven"), - SBITFIELD(29, "MCE is observed"), - SBITFIELD(31, "BINIT observed"), -- FIELD(32, NULL), -+ FIELD_NULL(32), - SBITFIELD(34, "PIC or FSB data parity error"), -- FIELD(35, NULL), -+ FIELD_NULL(35), - SBITFIELD(37, "FSB address parity error detected"), - {} - }; -diff --git a/mce-intel-tulsa.c b/mce-intel-tulsa.c -index 6cea421..e59bf06 100644 ---- a/mce-intel-tulsa.c -+++ b/mce-intel-tulsa.c -@@ -39,7 +39,7 @@ static struct field tls_bus_status[] = { - SBITFIELD(16, "Parity error detected during FSB request phase"), - SBITFIELD(17, "Partity error detected on Core 0 request's address field"), - SBITFIELD(18, "Partity error detected on Core 1 request's address 
field"), -- FIELD(19, NULL), -+ FIELD_NULL(19), - SBITFIELD(20, "Parity error on FSB response field detected"), - SBITFIELD(21, "FSB data parity error on inbound date detected"), - SBITFIELD(22, "Data parity error on data received from Core 0 detected"), -@@ -48,8 +48,8 @@ static struct field tls_bus_status[] = { - SBITFIELD(25, "Data ECC event to error on inbound data correctable or uncorrectable"), - SBITFIELD(26, "Pad logic detected a data strobe glitch or sequencing error"), - SBITFIELD(27, "Pad logic detected a request strobe glitch or sequencing error"), -- FIELD(28, NULL), -- FIELD(31, NULL), -+ FIELD_NULL(28), -+ FIELD_NULL(31), - {} - }; - diff --git a/b22be68453b2497e86cbd273b9cd56fadc5859e3.patch b/b22be68453b2497e86cbd273b9cd56fadc5859e3.patch deleted file mode 100644 index 4b3b8aeae27d1dc1db2ca0b74b68b37acbca828d..0000000000000000000000000000000000000000 --- a/b22be68453b2497e86cbd273b9cd56fadc5859e3.patch +++ /dev/null @@ -1,37 +0,0 @@ -commit b22be68453b2497e86cbd273b9cd56fadc5859e3 -Author: Ying Lv -Date: Wed May 15 11:15:42 2019 +0800 - - fix rasdaemon high CPU usage when part of CPUs offline - - When we set part of CPU core offline, such as by setting the kernel cmdline - maxcpus = N(N is less than the total number of system CPU cores). - And then, we will observe that the CPU usage of some rasdaemon threads - is very close to 100. - - This is because when part of CPU offline, poll in read_ras_event_all_cpus func - will fallback to pthread way. - Offlined CPU thread will return negative value when read trace_pipe_raw, - negative return value will covert to positive value because of 'unsigned size'. - So code will always go into 'size > 0' branch, and the CPU usage is too high. - - Here, variable size uses int type will go to the right branch. 
- - Fiexs: eff7c9e0("ras-events: Only use pthreads for collect if poll() not available") - Reported-by: Zhipeng Xie - Signed-off-by: Ying Lv - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/ras-events.c b/ras-events.c -index 4e7b815..38ebe1e 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -426,7 +426,7 @@ static int read_ras_event(int fd, - struct kbuffer *kbuf, - void *page) - { -- unsigned size; -+ int size; - unsigned long long time_stamp; - void *data; - diff --git a/b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch b/b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch new file mode 100644 index 0000000000000000000000000000000000000000..30cc19e2098d9c1f95148ace0d27203b33c3c0f1 --- /dev/null +++ b/b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch @@ -0,0 +1,30 @@ +commit b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4 +Author: DmNosachev +Date: Tue Jun 29 13:48:55 2021 +0300 + + labels/supermicro: added Supermicro X10DRI(-T) + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 47ea05f..86e4617 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -81,4 +81,14 @@ Vendor: Supermicro + P2-DIMMC1: 2.2.0; + P2-DIMMD1: 3.0.0; P2-DIMMD2: 3.0.1; + P2-DIMME1: 3.1.0; +- P2-DIMMF1: 3.2.0; +\ No newline at end of file ++ P2-DIMMF1: 3.2.0; ++ ++ Model: X10DRI, X10DRI-T ++ P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1; ++ P1-DIMMB1: 0.1.0; P1-DIMMB2: 0.1.1; ++ P1-DIMMC1: 0.2.0; P1-DIMMC2: 0.2.1; ++ P1-DIMMD1: 0.3.0; P1-DIMMD2: 0.3.1; ++ P2-DIMME1: 1.0.0; P2-DIMME2: 1.0.1; ++ P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1; ++ P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1; ++ P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1; +\ No newline at end of file diff --git a/b497a3d6a39d402c41065e9284d49114b97e3bfe.patch b/b497a3d6a39d402c41065e9284d49114b97e3bfe.patch deleted file mode 100644 index cbecbdc94ecad9a27c20e230d20796f844027b95..0000000000000000000000000000000000000000 --- a/b497a3d6a39d402c41065e9284d49114b97e3bfe.patch +++ /dev/null @@ -1,148 +0,0 @@ -commit 
b497a3d6a39d402c41065e9284d49114b97e3bfe -Author: Shiju Jose -Date: Mon Mar 8 16:57:28 2021 +0000 - - rasdaemon: ras-mc-ctl: Add memory failure events - - Add supporting memory failure errors (memory_failure_event) - to the ras-mc-ctl tool. - - Sample Log, - ras-mc-ctl --summary - ... - Memory failure events summary: - Delayed errors: 4 - Failed errors: 1 - ... - - ras-mc-ctl --errors - ... - Memory failure events: - 1 2020-10-28 23:20:41 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed - 2 2020-10-28 23:31:38 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed - 3 2020-10-28 23:54:54 -0800 error: pfn=0x205000000, page_type=free buddy page, action_result=Delayed - 4 2020-10-29 00:12:25 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed - 5 2020-10-29 00:26:36 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Failed - - Signed-off-by: Shiju Jose - Signed-off-by: Mauro Carvalho Chehab - ---- - configure.ac | 11 +++++++++++ - util/ras-mc-ctl.in | 46 +++++++++++++++++++++++++++++++++++++++++++--- - 2 files changed, 54 insertions(+), 3 deletions(-) - ---- a/util/ras-mc-ctl.in 2021-10-13 13:51:00.887292563 -0400 -+++ b/util/ras-mc-ctl.in 2021-10-13 13:51:27.536061894 -0400 -@@ -44,11 +44,13 @@ my $modprobe = find_prog ("modprobe") - my $has_aer = 0; - my $has_arm = 0; - my $has_extlog = 0; -+my $has_mem_failure = 0; - my $has_mce = 0; - - @WITH_AER_TRUE@$has_aer = 1; - @WITH_ARM_TRUE@$has_arm = 1; - @WITH_EXTLOG_TRUE@$has_extlog = 1; -+@WITH_MEMORY_FAILURE_TRUE@$has_mem_failure = 1; - @WITH_MCE_TRUE@$has_mce = 1; - - my %conf = (); -@@ -1132,7 +1134,7 @@ sub summary - { - require DBI; - my ($query, $query_handle, $out); -- my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg); -+ my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result); - my ($etype, $severity, $etype_string, $severity_string); - my ($affinity, $mpidr); - -@@ -1203,9 +1205,27 
@@ sub summary - $out .= "\t$count $etype_string $severity_string errors\n"; - } - if ($out ne "") { -- print "Extlog records summary:\n$out"; -+ print "Extlog records summary:\n$out\n"; - } else { -- print "No Extlog errors.\n"; -+ print "No Extlog errors.\n\n"; -+ } -+ $query_handle->finish; -+ } -+ -+ # Memory failure errors -+ if ($has_mem_failure == 1) { -+ $query = "select action_result, count(*) from memory_failure_event group by action_result"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($action_result, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$action_result errors: $count\n"; -+ } -+ if ($out ne "") { -+ print "Memory failure events summary:\n$out\n"; -+ } else { -+ print "No Memory failure errors.\n\n"; - } - $query_handle->finish; - } -@@ -1238,6 +1258,7 @@ sub errors - my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location); - my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); - my ($error_count, $affinity, $mpidr, $r_state, $psci_state); -+ my ($pfn, $page_type, $action_result); - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -@@ -1329,6 +1350,25 @@ $out .= sprintf "address=0x%08x, ", $add - } - $query_handle->finish; - } -+ -+ # Memory failure errors -+ if ($has_mem_failure == 1) { -+ $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "pfn=$pfn, page_type=$page_type, action_result=$action_result\n"; -+ } -+ if ($out ne "") { -+ print "Memory failure 
events:\n$out\n"; -+ } else { -+ print "No Memory failure errors.\n\n"; -+ } -+ $query_handle->finish; -+ } - - # MCE mce_record errors - if ($has_mce == 1) { ---- a/configure.ac 2018-04-25 06:28:51.000000000 -0400 -+++ b/configure.ac 2021-10-13 13:51:00.916292312 -0400 -@@ -80,6 +80,16 @@ AS_IF([test "x$enable_extlog" = "xyes"], - ]) - AM_CONDITIONAL([WITH_EXTLOG], [test x$enable_extlog = xyes]) - -+AC_ARG_ENABLE([memory_failure], -+ AS_HELP_STRING([--enable-memory-failure], [enable memory failure events (currently experimental)])) -+ -+AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes"], [ -+ AC_DEFINE(HAVE_MEMORY_FAILURE,1,"have memory failure events collect") -+ AC_SUBST([WITH_MEMORY_FAILURE]) -+]) -+AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all == xyes]) -+AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"]) -+ - AC_ARG_ENABLE([abrt_report], - AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)])) - -@@ -127,4 +137,5 @@ compile time options summary - ABRT report : $enable_abrt_report - HIP07 SAS HW errors : $enable_hisi_ns_decode - ARM events : $enable_arm -+ Memory Failure : $USE_MEMORY_FAILURE - EOF diff --git a/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch b/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch deleted file mode 100644 index 36c019db24dabc1d9d3cb707298591c0fceea787..0000000000000000000000000000000000000000 --- a/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch +++ /dev/null @@ -1,94 +0,0 @@ -commit cc2ce5c65ed5a42eaa97aa3659854add6d808da5 -Author: Muralidhara M K -Date: Mon Jan 13 19:12:06 2020 +0530 - - rasdaemon: Add error decoding for new SMCA Load Store bank type - - Future Scalable Machine Check Architecture (SMCA) systems will have a - new Load Store bank type. - - Add the new type's (HWID, McaType) ID and error decoding. - - Signed-off-by: Muralidhara M K - [ Adjust commit message. 
] - Signed-off-by: Yazen Ghannam - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 114e786..d0b6cb6 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -38,9 +38,16 @@ - * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0] - */ - -+/* MCA_STATUS REGISTER FOR FAMILY 19H -+ * The bits 24 ~ 29 contains AddressLsb -+ * 29: ADDRLS[5], 28: ADDRLS[4], 27: ADDRLS[3], -+ * 26: ADDRLS[2], 25: ADDRLS[1], 24: ADDRLS[0] -+ */ -+ - /* These may be used by multiple smca_hwid_mcatypes */ - enum smca_bank_types { - SMCA_LS = 0, /* Load Store */ -+ SMCA_LS_V2, /* Load Store */ - SMCA_IF, /* Instruction Fetch */ - SMCA_L2_CACHE, /* L2 Cache */ - SMCA_DE, /* Decoder Unit */ -@@ -88,6 +95,32 @@ static const char * const smca_ls_mce_desc[] = { - "DC tag error type 5", - "L2 fill data error", - }; -+static const char * const smca_ls2_mce_desc[] = { -+ "An ECC error was detected on a data cache read by a probe or victimization", -+ "An ECC error or L2 poison was detected on a data cache read by a load", -+ "An ECC error was detected on a data cache read-modify-write by a store", -+ "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization", -+ "An ECC error or poison bit mismatch was detected on a tag read by a load", -+ "An ECC error or poison bit mismatch was detected on a tag read by a store", -+ "An ECC error was detected on an EMEM read by a load", -+ "An ECC error was detected on an EMEM read-modify-write by a store", -+ "A parity error was detected in an L1 TLB entry by any access", -+ "A parity error was detected in an L2 TLB entry by any access", -+ "A parity error was detected in a PWC entry by any access", -+ "A parity error was detected in an STQ entry by any access", -+ "A parity error was detected in an LDQ entry by any access", -+ "A parity error was detected in a MAB entry by any access", -+ "A parity error was detected in an SCB entry state field by any access", -+ "A parity error was detected in an 
SCB entry address field by any access", -+ "A parity error was detected in an SCB entry data field by any access", -+ "A parity error was detected in a WCB entry by any access", -+ "A poisoned line was detected in an SCB entry by any access", -+ "A SystemReadDataError error was reported on read data returned from L2 for a load", -+ "A SystemReadDataError error was reported on read data returned from L2 for an SCB store", -+ "A SystemReadDataError error was reported on read data returned from L2 for a WCB store", -+ "A hardware assertion error was reported", -+ "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access", -+}; - /* Instruction Fetch */ - static const char * const smca_if_mce_desc[] = { - "microtag probe port parity error", -@@ -289,6 +322,7 @@ struct smca_mce_desc { - - static struct smca_mce_desc smca_mce_descs[] = { - [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, -+ [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) }, - [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, - [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, - [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, -@@ -319,6 +353,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, 0x000000B0 }, -+ { SMCA_LS_V2, 0x001000B0 }, - { SMCA_IF, 0x000100B0 }, - { SMCA_L2_CACHE, 0x000200B0 }, - { SMCA_DE, 0x000300B0 }, -@@ -362,6 +397,7 @@ struct smca_bank_name { - - static struct smca_bank_name smca_names[] = { - [SMCA_LS] = { "Load Store Unit" }, -+ [SMCA_LS_V2] = { "Load Store Unit" }, - [SMCA_IF] = { "Instruction Fetch Unit" }, - [SMCA_L2_CACHE] = { "L2 Cache" }, - [SMCA_DE] = { "Decode Unit" }, diff --git a/ce6e7864f11f709c4f803828fbc8e507d115d03b.patch b/ce6e7864f11f709c4f803828fbc8e507d115d03b.patch deleted file mode 100644 index e10c156b988f64cb0f4cff1c2bff8c5cd873909e..0000000000000000000000000000000000000000 --- 
a/ce6e7864f11f709c4f803828fbc8e507d115d03b.patch +++ /dev/null @@ -1,611 +0,0 @@ -commit ce6e7864f11f709c4f803828fbc8e507d115d03b -Author: Greg Edwards -Date: Thu Apr 8 15:03:30 2021 -0600 - - rasdaemon: Add Ice Lake and Sapphire Rapids MSCOD values - - Based on mcelog commits: - - ee90ff20ce6a ("mcelog: Add support for Icelake server, Icelake-D, and Snow Ridge") - 391abaac9bdf ("mcelog: Add decode for MCi_MISC from 10nm memory controller") - 59cb7ad4bc72 ("mcelog: i10nm: Fix mapping from bank number to functional unit") - c0acd0e6a639 ("mcelog: Add support for Sapphirerapids server.") - - Signed-off-by: Greg Edwards - Signed-off-by: Mauro Carvalho Chehab - ---- - Makefile.am | 3 - mce-intel-i10nm.c | 509 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ - mce-intel.c | 5 - ras-mce-handler.c | 12 + - ras-mce-handler.h | 5 - 5 files changed, 533 insertions(+), 1 deletion(-) - ---- rasdaemon-0.6.1.orig/Makefile.am 2021-09-17 15:29:45.977790658 -0400 -+++ rasdaemon-0.6.1/Makefile.am 2021-09-17 15:29:57.439698580 -0400 -@@ -36,7 +36,8 @@ if WITH_MCE - mce-intel-dunnington.c mce-intel-tulsa.c \ - mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ - mce-intel-knl.c mce-intel-broadwell-de.c \ -- mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c -+ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c \ -+ mce-amd.c mce-amd-smca.c mce-intel-i10nm.c - endif - if WITH_EXTLOG - rasdaemon_SOURCES += ras-extlog-handler.c ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/mce-intel-i10nm.c 2021-09-17 15:29:45.977790658 -0400 -@@ -0,0 +1,509 @@ -+/* -+ * The code below came from Tony Luck's mcelog code, -+ * released under GNU Public General License, v.2 -+ * -+ * Copyright (C) 2019 Intel Corporation -+ * Decode Intel 10nm specific machine check errors. 
-+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+*/ -+ -+#include -+#include -+#include -+ -+#include "ras-mce-handler.h" -+#include "bitfield.h" -+ -+static char *pcu_1[] = { -+ [0x0D] = "MCA_LLC_BIST_ACTIVE_TIMEOUT", -+ [0x0E] = "MCA_DMI_TRAINING_TIMEOUT", -+ [0x0F] = "MCA_DMI_STRAP_SET_ARRIVAL_TIMEOUT", -+ [0x10] = "MCA_DMI_CPU_RESET_ACK_TIMEOUT", -+ [0x11] = "MCA_MORE_THAN_ONE_LT_AGENT", -+ [0x14] = "MCA_INCOMPATIBLE_PCH_TYPE", -+ [0x1E] = "MCA_BIOS_RST_CPL_INVALID_SEQ", -+ [0x1F] = "MCA_BIOS_INVALID_PKG_STATE_CONFIG", -+ [0x2D] = "MCA_PCU_PMAX_CALIB_ERROR", -+ [0x2E] = "MCA_TSC100_SYNC_TIMEOUT", -+ [0x3A] = "MCA_GPSB_TIMEOUT", -+ [0x3B] = "MCA_PMSB_TIMEOUT", -+ [0x3E] = "MCA_IOSFSB_PMREQ_CMP_TIMEOUT", -+ [0x40] = "MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE", -+ [0x42] = "MCA_SVID_VCCIN_VR_VOUT_FAILURE", -+ [0x43] = "MCA_SVID_CPU_VR_CAPABILITY_ERROR", -+ [0x44] = "MCA_SVID_CRITICAL_VR_FAILED", -+ [0x45] = "MCA_SVID_SA_ITD_ERROR", -+ [0x46] = "MCA_SVID_READ_REG_FAILED", -+ [0x47] = "MCA_SVID_WRITE_REG_FAILED", -+ [0x4A] = "MCA_SVID_PKGC_REQUEST_FAILED", -+ [0x4B] = "MCA_SVID_IMON_REQUEST_FAILED", -+ [0x4C] = "MCA_SVID_ALERT_REQUEST_FAILED", -+ [0x4D] = "MCA_SVID_MCP_VR_RAMP_ERROR", -+ [0x56] = "MCA_FIVR_PD_HARDERR", -+ [0x58] = "MCA_WATCHDOG_TIMEOUT_PKGC_SLAVE", -+ [0x59] = 
"MCA_WATCHDOG_TIMEOUT_PKGC_MASTER", -+ [0x5A] = "MCA_WATCHDOG_TIMEOUT_PKGS_MASTER", -+ [0x5B] = "MCA_WATCHDOG_TIMEOUT_MSG_CH_FSM", -+ [0x5C] = "MCA_WATCHDOG_TIMEOUT_BULK_CR_FSM", -+ [0x5D] = "MCA_WATCHDOG_TIMEOUT_IOSFSB_FSM", -+ [0x60] = "MCA_PKGS_SAFE_WP_TIMEOUT", -+ [0x61] = "MCA_PKGS_CPD_UNCPD_TIMEOUT", -+ [0x62] = "MCA_PKGS_INVALID_REQ_PCH", -+ [0x63] = "MCA_PKGS_INVALID_REQ_INTERNAL", -+ [0x64] = "MCA_PKGS_INVALID_RSP_INTERNAL", -+ [0x65 ... 0x7A] = "MCA_PKGS_RESET_PREP_TIMEOUT", -+ [0x7B] = "MCA_PKGS_SMBUS_VPP_PAUSE_TIMEOUT", -+ [0x7C] = "MCA_PKGS_SMBUS_MCP_PAUSE_TIMEOUT", -+ [0x7D] = "MCA_PKGS_SMBUS_SPD_PAUSE_TIMEOUT", -+ [0x80] = "MCA_PKGC_DISP_BUSY_TIMEOUT", -+ [0x81] = "MCA_PKGC_INVALID_RSP_PCH", -+ [0x83] = "MCA_PKGC_WATCHDOG_HANG_CBZ_DOWN", -+ [0x84] = "MCA_PKGC_WATCHDOG_HANG_CBZ_UP", -+ [0x87] = "MCA_PKGC_WATCHDOG_HANG_C2_BLKMASTER", -+ [0x88] = "MCA_PKGC_WATCHDOG_HANG_C2_PSLIMIT", -+ [0x89] = "MCA_PKGC_WATCHDOG_HANG_SETDISP", -+ [0x8B] = "MCA_PKGC_ALLOW_L1_ERROR", -+ [0x90] = "MCA_RECOVERABLE_DIE_THERMAL_TOO_HOT", -+ [0xA0] = "MCA_ADR_SIGNAL_TIMEOUT", -+ [0xA1] = "MCA_BCLK_FREQ_OC_ABOVE_THRESHOLD", -+ [0xB0] = "MCA_DISPATCHER_RUN_BUSY_TIMEOUT", -+}; -+ -+static char *pcu_2[] = { -+ [0x04] = "Clock/power IP response timeout", -+ [0x05] = "SMBus controller raised SMI", -+ [0x09] = "PM controller received invalid transaction", -+}; -+ -+static char *pcu_3[] = { -+ [0x01] = "Instruction address out of valid space", -+ [0x02] = "Double bit RAM error on Instruction Fetch", -+ [0x03] = "Invalid OpCode seen", -+ [0x04] = "Stack Underflow", -+ [0x05] = "Stack Overflow", -+ [0x06] = "Data address out of valid space", -+ [0x07] = "Double bit RAM error on Data Fetch", -+}; -+ -+static struct field pcu1[] = { -+ FIELD(0, pcu_1), -+ {} -+}; -+ -+static struct field pcu2[] = { -+ FIELD(0, pcu_2), -+ {} -+}; -+ -+static struct field pcu3[] = { -+ FIELD(0, pcu_3), -+ {} -+}; -+ -+static struct field upi1[] = { -+ SBITFIELD(22, "Phy Control Error"), -+ SBITFIELD(23, 
"Unexpected Retry.Ack flit"), -+ SBITFIELD(24, "Unexpected Retry.Req flit"), -+ SBITFIELD(25, "RF parity error"), -+ SBITFIELD(26, "Routeback Table error"), -+ SBITFIELD(27, "Unexpected Tx Protocol flit (EOP, Header or Data)"), -+ SBITFIELD(28, "Rx Header-or-Credit BGF credit overflow/underflow"), -+ SBITFIELD(29, "Link Layer Reset still in progress when Phy enters L0"), -+ SBITFIELD(30, "Link Layer reset initiated while protocol traffic not idle"), -+ SBITFIELD(31, "Link Layer Tx Parity Error"), -+ {} -+}; -+ -+static char *upi_2[] = { -+ [0x00] = "Phy Initialization Failure (NumInit)", -+ [0x01] = "Phy Detected Drift Buffer Alarm", -+ [0x02] = "Phy Detected Latency Buffer Rollover", -+ [0x10] = "LL Rx detected CRC error: unsuccessful LLR (entered Abort state)", -+ [0x11] = "LL Rx Unsupported/Undefined packet", -+ [0x12] = "LL or Phy Control Error", -+ [0x13] = "LL Rx Parameter Exception", -+ [0x1F] = "LL Detected Control Error", -+ [0x20] = "Phy Initialization Abort", -+ [0x21] = "Phy Inband Reset", -+ [0x22] = "Phy Lane failure, recovery in x8 width", -+ [0x23] = "Phy L0c error corrected without Phy reset", -+ [0x24] = "Phy L0c error triggering Phy reset", -+ [0x25] = "Phy L0p exit error corrected with reset", -+ [0x30] = "LL Rx detected CRC error: successful LLR without Phy Reinit", -+ [0x31] = "LL Rx detected CRC error: successful LLR with Phy Reinit", -+ [0x32] = "Tx received LLR", -+}; -+ -+static struct field upi2[] = { -+ FIELD(0, upi_2), -+ {} -+}; -+ -+static struct field m2m[] = { -+ SBITFIELD(16, "MC read data error"), -+ SBITFIELD(17, "Reserved"), -+ SBITFIELD(18, "MC partial write data error"), -+ SBITFIELD(19, "Full write data error"), -+ SBITFIELD(20, "M2M clock-domain-crossing buffer (BGF) error"), -+ SBITFIELD(21, "M2M time out"), -+ SBITFIELD(22, "M2M tracker parity error"), -+ SBITFIELD(23, "fatal Bucket1 error"), -+ {} -+}; -+ -+static char *imc_0[] = { -+ [0x01] = "Address parity error", -+ [0x02] = "Data parity error", -+ [0x03] = "Data ECC 
error", -+ [0x04] = "Data byte enable parity error", -+ [0x07] = "Transaction ID parity error", -+ [0x08] = "Corrected patrol scrub error", -+ [0x10] = "Uncorrected patrol scrub error", -+ [0x20] = "Corrected spare error", -+ [0x40] = "Uncorrected spare error", -+ [0x80] = "Corrected read error", -+ [0xA0] = "Uncorrected read error", -+ [0xC0] = "Uncorrected metadata", -+}; -+ -+static char *imc_1[] = { -+ [0x00] = "WDB read parity error", -+ [0x03] = "RPA parity error", -+ [0x06] = "DDR_T_DPPP data BE error", -+ [0x07] = "DDR_T_DPPP data error", -+ [0x08] = "DDR link failure", -+ [0x11] = "PCLS CAM error", -+ [0x12] = "PCLS data error", -+}; -+ -+static char *imc_2[] = { -+ [0x00] = "DDR4 command / address parity error", -+ [0x20] = "HBM command / address parity error", -+ [0x21] = "HBM data parity error", -+}; -+ -+static char *imc_4[] = { -+ [0x00] = "RPQ parity (primary) error", -+}; -+ -+static char *imc_8[] = { -+ [0x00] = "DDR-T bad request", -+ [0x01] = "DDR Data response to an invalid entry", -+ [0x02] = "DDR data response to an entry not expecting data", -+ [0x03] = "DDR4 completion to an invalid entry", -+ [0x04] = "DDR-T completion to an invalid entry", -+ [0x05] = "DDR data/completion FIFO overflow", -+ [0x06] = "DDR-T ERID correctable parity error", -+ [0x07] = "DDR-T ERID uncorrectable error", -+ [0x08] = "DDR-T interrupt received while outstanding interrupt was not ACKed", -+ [0x09] = "ERID FI FO overflow", -+ [0x0A] = "DDR-T error on FNV write credits", -+ [0x0B] = "DDR-T error on FNV read credits", -+ [0x0C] = "DDR-T scheduler error", -+ [0x0D] = "DDR-T FNV error event", -+ [0x0E] = "DDR-T FNV thermal event", -+ [0x0F] = "CMI packet while idle", -+ [0x10] = "DDR_T_RPQ_REQ_PARITY_ERR", -+ [0x11] = "DDR_T_WPQ_REQ_PARITY_ERR", -+ [0x12] = "2LM_NMFILLWR_CAM_ERR", -+ [0x13] = "CMI_CREDIT_OVERSUB_ERR", -+ [0x14] = "CMI_CREDIT_TOTAL_ERR", -+ [0x15] = "CMI_CREDIT_RSVD_POOL_ERR", -+ [0x16] = "DDR_T_RD_ERROR", -+ [0x17] = "WDB_FIFO_ERR", -+ [0x18] = 
"CMI_REQ_FIFO_OVERFLOW", -+ [0x19] = "CMI_REQ_FIFO_UNDERFLOW", -+ [0x1A] = "CMI_RSP_FIFO_OVERFLOW", -+ [0x1B] = "CMI_RSP_FIFO_UNDERFLOW", -+ [0x1C] = "CMI _MISC_MC_CRDT_ERRORS", -+ [0x1D] = "CMI_MISC_MC_ARB_ERRORS", -+ [0x1E] = "DDR_T_WR_CMPL_FI FO_OVERFLOW", -+ [0x1F] = "DDR_T_WR_CMPL_FI FO_UNDERFLOW", -+ [0x20] = "CMI_RD_CPL_FIFO_OVERFLOW", -+ [0x21] = "CMI_RD_CPL_FIFO_UNDERFLOW", -+ [0x22] = "TME_KEY_PAR_ERR", -+ [0x23] = "TME_CMI_MISC_ERR", -+ [0x24] = "TME_CMI_OVFL_ERR", -+ [0x25] = "TME_CMI_UFL_ERR", -+ [0x26] = "TME_TEM_SECURE_ERR", -+ [0x27] = "TME_UFILL_PAR_ERR", -+ [0x29] = "INTERNAL_ERR", -+ [0x2A] = "TME_INTEGRITY_ERR", -+ [0x2B] = "TME_TDX_ERR", -+ [0x2C] = "TME_UFILL_TEM_SECURE_ERR", -+ [0x2D] = "TME_KEY_POISON_ERR", -+ [0x2E] = "TME_SECURITY_ENGINE_ERR", -+}; -+ -+static char *imc_10[] = { -+ [0x08] = "CORR_PATSCRUB_MIRR2ND_ERR", -+ [0x10] = "UC_PATSCRUB_MIRR2ND_ERR", -+ [0x20] = "COR_SPARE_MIRR2ND_ERR", -+ [0x40] = "UC_SPARE_MIRR2ND_ERR", -+ [0x80] = "HA_RD_MIRR2ND_ERR", -+ [0xA0] = "HA_UNCORR_RD_MIRR2ND_ERR", -+}; -+ -+static struct field imc0[] = { -+ FIELD(0, imc_0), -+ {} -+}; -+ -+static struct field imc1[] = { -+ FIELD(0, imc_1), -+ {} -+}; -+ -+static struct field imc2[] = { -+ FIELD(0, imc_2), -+ {} -+}; -+ -+static struct field imc4[] = { -+ FIELD(0, imc_4), -+ {} -+}; -+ -+static struct field imc8[] = { -+ FIELD(0, imc_8), -+ {} -+}; -+ -+static struct field imc10[] = { -+ FIELD(0, imc_10), -+ {} -+}; -+ -+static void i10nm_imc_misc(struct mce_event *e) -+{ -+ uint32_t column = EXTRACT(e->misc, 9, 18) << 2; -+ uint32_t row = EXTRACT(e->misc, 19, 39); -+ uint32_t bank = EXTRACT(e->misc, 42, 43); -+ uint32_t bankgroup = EXTRACT(e->misc, 40, 41) | (EXTRACT(e->misc, 44, 44) << 2); -+ uint32_t fdevice = EXTRACT(e->misc, 46, 51); -+ uint32_t subrank = EXTRACT(e->misc, 52, 55); -+ uint32_t rank = EXTRACT(e->misc, 56, 58); -+ uint32_t eccmode = EXTRACT(e->misc, 59, 62); -+ uint32_t transient = EXTRACT(e->misc, 63, 63); -+ -+ 
mce_snprintf(e->error_msg, "bank: 0x%x bankgroup: 0x%x row: 0x%x column: 0x%x", bank, bankgroup, row, column); -+ if (!transient && !EXTRACT(e->status, 61, 61)) -+ mce_snprintf(e->error_msg, "failed device: 0x%x", fdevice); -+ mce_snprintf(e->error_msg, "rank: 0x%x subrank: 0x%x", rank, subrank); -+ mce_snprintf(e->error_msg, "ecc mode: "); -+ switch (eccmode) { -+ case 0: mce_snprintf(e->error_msg, "SDDC memory mode"); break; -+ case 1: mce_snprintf(e->error_msg, "SDDC"); break; -+ case 4: mce_snprintf(e->error_msg, "ADDDC memory mode"); break; -+ case 5: mce_snprintf(e->error_msg, "ADDDC"); break; -+ case 8: mce_snprintf(e->error_msg, "DDRT read"); break; -+ default: mce_snprintf(e->error_msg, "unknown"); break; -+ } -+ if (transient) -+ mce_snprintf(e->error_msg, "transient"); -+} -+ -+enum banktype { -+ BT_UNKNOWN, -+ BT_PCU, -+ BT_UPI, -+ BT_M2M, -+ BT_IMC, -+}; -+ -+static enum banktype icelake[32] = { -+ [4] = BT_PCU, -+ [5] = BT_UPI, -+ [7 ... 8] = BT_UPI, -+ [12] = BT_M2M, -+ [16] = BT_M2M, -+ [20] = BT_M2M, -+ [24] = BT_M2M, -+ [13 ... 15] = BT_IMC, -+ [17 ... 19] = BT_IMC, -+ [21 ... 23] = BT_IMC, -+ [25 ... 27] = BT_IMC, -+}; -+ -+static enum banktype icelake_de[32] = { -+ [4] = BT_PCU, -+ [12] = BT_M2M, -+ [16] = BT_M2M, -+ [13 ... 15] = BT_IMC, -+ [17 ... 19] = BT_IMC, -+}; -+ -+static enum banktype tremont[32] = { -+ [4] = BT_PCU, -+ [12] = BT_M2M, -+ [13 ... 15] = BT_IMC, -+}; -+ -+static enum banktype sapphire[32] = { -+ [4] = BT_PCU, -+ [5] = BT_UPI, -+ [12] = BT_M2M, -+ [13 ... 
20] = BT_IMC, -+}; -+ -+void i10nm_memerr_misc(struct mce_event *e, int *channel); -+ -+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, -+ struct mce_event *e) -+{ -+ enum banktype banktype; -+ uint64_t f, status = e->status; -+ uint32_t mca = status & 0xffff; -+ int channel = -1; -+ -+ switch (cputype) { -+ case CPU_ICELAKE_XEON: -+ banktype = icelake[e->bank]; -+ break; -+ case CPU_ICELAKE_DE: -+ banktype = icelake_de[e->bank]; -+ break; -+ case CPU_TREMONT_D: -+ banktype = tremont[e->bank]; -+ break; -+ case CPU_SAPPHIRERAPIDS: -+ banktype = sapphire[e->bank]; -+ break; -+ default: -+ return; -+ } -+ -+ switch (banktype) { -+ case BT_UNKNOWN: -+ break; -+ -+ case BT_PCU: -+ mce_snprintf(e->error_msg, "PCU: "); -+ f = EXTRACT(status, 24, 31); -+ if (f) -+ decode_bitfield(e, f, pcu1); -+ f = EXTRACT(status, 20, 23); -+ if (f) -+ decode_bitfield(e, f, pcu2); -+ f = EXTRACT(status, 16, 19); -+ if (f) -+ decode_bitfield(e, f, pcu3); -+ break; -+ -+ case BT_UPI: -+ mce_snprintf(e->error_msg, "UPI: "); -+ f = EXTRACT(status, 22, 31); -+ if (f) -+ decode_bitfield(e, status, upi1); -+ f = EXTRACT(status, 16, 21); -+ decode_bitfield(e, f, upi2); -+ break; -+ -+ case BT_M2M: -+ mce_snprintf(e->error_msg, "M2M: "); -+ f = EXTRACT(status, 24, 25); -+ mce_snprintf(e->error_msg, "MscodDDRType=0x%" PRIx64, f); -+ f = EXTRACT(status, 26, 31); -+ mce_snprintf(e->error_msg, "MscodMiscErrs=0x%" PRIx64, f); -+ decode_bitfield(e, status, m2m); -+ break; -+ -+ case BT_IMC: -+ mce_snprintf(e->error_msg, "MemCtrl: "); -+ f = EXTRACT(status, 16, 23); -+ switch (EXTRACT(status, 24, 31)) { -+ case 0: decode_bitfield(e, f, imc0); break; -+ case 1: decode_bitfield(e, f, imc1); break; -+ case 2: decode_bitfield(e, f, imc2); break; -+ case 4: decode_bitfield(e, f, imc4); break; -+ case 8: decode_bitfield(e, f, imc8); break; -+ case 0x10: decode_bitfield(e, f, imc10); break; -+ } -+ i10nm_imc_misc(e); -+ break; -+ } -+ -+ /* -+ * Memory error specific code. 
Returns if the error is not a MC one -+ */ -+ -+ /* Check if the error is at the memory controller */ -+ if ((mca >> 7) != 1) -+ return; -+ -+ /* Ignore unless this is an corrected extended error from an iMC bank */ -+ if (banktype != BT_IMC || (status & MCI_STATUS_UC)) -+ return; -+ -+ /* -+ * Parse the reported channel -+ */ -+ -+ i10nm_memerr_misc(e, &channel); -+ if (channel == -1) -+ return; -+ mce_snprintf(e->mc_location, "memory_channel=%d", channel); -+} -+ -+/* -+ * There isn't enough information to identify the DIMM. But -+ * we can derive the channel from the bank number. -+ * There can be four memory controllers with two channels each. -+ */ -+void i10nm_memerr_misc(struct mce_event *e, int *channel) -+{ -+ uint64_t status = e->status; -+ unsigned int chan, imc; -+ -+ /* Check this is a memory error */ -+ if (!test_prefix(7, status & 0xefff)) -+ return; -+ -+ chan = EXTRACT(status, 0, 3); -+ if (chan == 0xf) -+ return; -+ -+ switch (e->bank) { -+ case 12: /* M2M 0 */ -+ case 13: /* IMC 0, Channel 0 */ -+ case 14: /* IMC 0, Channel 1 */ -+ case 15: /* IMC 0, Channel 2 */ -+ imc = 0; -+ break; -+ case 16: /* M2M 1 */ -+ case 17: /* IMC 1, Channel 0 */ -+ case 18: /* IMC 1, Channel 1 */ -+ case 19: /* IMC 1, Channel 2 */ -+ imc = 1; -+ break; -+ case 20: /* M2M 2 */ -+ case 21: /* IMC 2, Channel 0 */ -+ case 22: /* IMC 2, Channel 1 */ -+ case 23: /* IMC 2, Channel 2 */ -+ imc = 2; -+ break; -+ case 24: /* M2M 3 */ -+ case 25: /* IMC 3, Channel 0 */ -+ case 26: /* IMC 3, Channel 1 */ -+ case 27: /* IMC 3, Channel 2 */ -+ imc = 3; -+ break; -+ default: -+ return; -+ } -+ -+ channel[0] = imc * 3 + chan; -+} ---- rasdaemon-0.6.1.orig/mce-intel.c 2021-09-17 15:29:39.189845188 -0400 -+++ rasdaemon-0.6.1/mce-intel.c 2021-09-17 15:29:45.977790658 -0400 -@@ -411,6 +411,11 @@ if (test_prefix(11, (e->status & 0xffffL - case CPU_SKYLAKE_XEON: - skylake_s_decode_model(ras, e); - break; -+ case CPU_ICELAKE_XEON: -+ case CPU_ICELAKE_DE: -+ case CPU_TREMONT_D: -+ case 
CPU_SAPPHIRERAPIDS: -+ i10nm_decode_model(mce->cputype, ras, e); - default: - break; - } ---- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-09-17 15:29:39.189845188 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-09-17 15:29:45.977790658 -0400 -@@ -56,6 +56,10 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series - [CPU_KNIGHTS_MILL] = "Knights Mill", - [CPU_SKYLAKE_XEON] = "Skylake server", - [CPU_AMD_SMCA] = "AMD Scalable MCA", -+ [CPU_ICELAKE_XEON] = "Icelake server", -+ [CPU_ICELAKE_DE] = "Icelake server D Family", -+ [CPU_TREMONT_D] = "Tremont microserver", -+ [CPU_SAPPHIRERAPIDS] = "Sapphirerapids server", - }; - - static enum cputype select_intel_cputype(struct ras_events *ras) -@@ -107,6 +111,14 @@ else if (mce->model == 0x85) - return CPU_KNIGHTS_MILL; - else if (mce->model == 0x55) - return CPU_SKYLAKE_XEON; -+ else if (mce->model == 0x6a) -+ return CPU_ICELAKE_XEON; -+ else if (mce->model == 0x6c) -+ return CPU_ICELAKE_DE; -+ else if (mce->model == 0x86) -+ return CPU_TREMONT_D; -+ else if (mce->model == 0x8f) -+ return CPU_SAPPHIRERAPIDS; - - if (mce->model > 0x1a) { - log(ALL, LOG_INFO, ---- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-09-17 15:29:39.189845188 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-09-17 15:29:45.977790658 -0400 -@@ -51,6 +51,10 @@ enum cputype { - CPU_KNIGHTS_MILL, - CPU_SKYLAKE_XEON, - CPU_AMD_SMCA, -+ CPU_ICELAKE_XEON, -+ CPU_ICELAKE_DE, -+ CPU_TREMONT_D, -+ CPU_SAPPHIRERAPIDS, - }; - - struct mce_event { -@@ -131,6 +135,7 @@ void tulsa_decode_model(struct mce_event - void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e); - void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e); - void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e); -+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, struct mce_event *e); - - /* AMD error code decode function */ - void decode_amd_errcode(struct mce_event *e); diff --git 
a/d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch b/d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch new file mode 100644 index 0000000000000000000000000000000000000000..d28ce9c7db5e6861044aa68d839615a6c1d3dfff --- /dev/null +++ b/d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch @@ -0,0 +1,42 @@ +commit d0e0bb3d73c4bc5060da20270a089857bba2a64c +Author: Justin Vreeland +Date: Tue Nov 2 19:51:50 2021 -0700 + + Update ras-mc-ctl manpage to match current options + + Signed-off-by: Justin Vreeland + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/man/ras-mc-ctl.8.in b/man/ras-mc-ctl.8.in +index 26230e0..a605122 100644 +--- a/man/ras-mc-ctl.8.in ++++ b/man/ras-mc-ctl.8.in +@@ -79,9 +79,27 @@ Specify an alternate location for the labels database. + Specify a delay of \fBtime\fR seconds before registering DIMM labels. + Only meaninful if used together with --register-labels. + .TP +-.BI "--layout ++.BI "--layout" + Prints the memory layout as detected by the EDAC driver. Useful to check + if the EDAC driver is properly detecting the memory controller architecture. ++.TP ++.BI "--summary" ++Presents a summary of the logged errors. ++.TP ++.BI "--errors" ++Shows the errors stored at the error database. ++.TP ++.BI "--error-count" ++Shows the corrected and uncorrected error counts using sysfs. ++.TP ++.BI "--vendor-errors-summary="platform-id ++Pressents a summary of the vendor-specific logged errors. ++.TP ++.BI "--vendor-errors="platform-id ++Shows the vendor-specific errors stored in the error database. ++.TP ++.BI "--vendor-platforms" ++Shows the supported platforms with platform-ids for the vendor-specific errors. 
+ + .SH MAINBOARD CONFIGURATION + .PP diff --git a/dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch b/dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch new file mode 100644 index 0000000000000000000000000000000000000000..b9eec5adf863ace7ee3cb60030592da4985b70ce --- /dev/null +++ b/dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch @@ -0,0 +1,27 @@ +commit dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b +Author: Mauro Carvalho Chehab +Date: Wed May 26 12:55:54 2021 +0200 + + Add support for multi-arch builds + + Allow building rasdaemon on several architectures: + - x86_64 + - arm 64 + - ppc 64 LE + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml +index 747a844..898687c 100644 +--- a/.github/workflows/ci.yml ++++ b/.github/workflows/ci.yml +@@ -9,6 +9,9 @@ jobs: + Ubuntu: + name: Ubuntu + runs-on: ubuntu-latest ++ strategy: ++ matrix: ++ arch: [x64_64, aarch64, ppc64le] + steps: + - uses: actions/checkout@v2 + - name: prepare diff --git a/dist b/dist new file mode 100644 index 0000000000000000000000000000000000000000..89c1faffc18349bb12eee2371e9dc43bf419b95c --- /dev/null +++ b/dist @@ -0,0 +1 @@ +an9 diff --git a/download b/download index 6feb85428f65bf5e56a970c0f7f85e19e33ab686..9d0b9dfe63f420a80777a1b21191449fb557022c 100644 --- a/download +++ b/download @@ -1 +1 @@ -dc388ad15889efe295184277ad7c2860 rasdaemon-0.6.1.tar.bz2 +8404c50ab6ba72f41e9c948b8ac3c2cb rasdaemon-0.6.7.tar.bz2 diff --git a/ec443ec0add059fa897f844349e1a2345d81713c.patch b/ec443ec0add059fa897f844349e1a2345d81713c.patch new file mode 100644 index 0000000000000000000000000000000000000000..cf778c1ea3119adbcd3a0f2db418f8e82ed108a2 --- /dev/null +++ b/ec443ec0add059fa897f844349e1a2345d81713c.patch @@ -0,0 +1,31 @@ +commit ec443ec0add059fa897f844349e1a2345d81713c +Author: DmNosachev +Date: Tue Jun 29 11:33:10 2021 +0300 + + labels/supermicro: added x11dph-i labels + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro 
+index 3fd6fee..bfaed93 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -68,3 +68,17 @@ Vendor: Supermicro + P1_DIMM4B: 1.1.1; + P2_DIMM4B: 2.0.1; + P2_DIMM4B: 2.1.1; ++ ++ Model: X11DPH-i ++ P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1; ++ P1-DIMMB1: 0.1.0; ++ P1-DIMMC1: 0.2.0; ++ P1-DIMMD1: 1.0.0; P1-DIMMD2: 1.0.1; ++ P1-DIMME1: 1.1.0; ++ P1-DIMMF1: 1.2.0; ++ P2-DIMMA1: 2.0.0; P2-DIMMA2: 2.0.1; ++ P2-DIMMB1: 2.1.0; ++ P2-DIMMC1: 2.2.0; ++ P2-DIMMD1: 3.0.0; P2-DIMMD2: 3.0.1; ++ P2-DIMME1: 3.1.0; ++ P2-DIMMF1: 3.2.0; +\ No newline at end of file diff --git a/f7cdd720297cd17e405a7170c04df89d1d9536f8.patch b/f7cdd720297cd17e405a7170c04df89d1d9536f8.patch new file mode 100644 index 0000000000000000000000000000000000000000..c2732e883c934590ea9d16d8b5479e0d2a17227c --- /dev/null +++ b/f7cdd720297cd17e405a7170c04df89d1d9536f8.patch @@ -0,0 +1,48 @@ +commit f7cdd720297cd17e405a7170c04df89d1d9536f8 +Author: Mauro Carvalho Chehab +Date: Wed May 26 12:35:55 2021 +0200 + + Add a github workflow for CI automation + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml +new file mode 100644 +index 0000000..5b3e757 +--- /dev/null ++++ b/.github/workflows/ci.yml +@@ -0,0 +1,34 @@ ++name: CI ++ ++# Should run only on branches and PR, as "on_tag.yml" will handle tags ++on: ++ push: ++ branches: master test ++ pull_request: ++ branches: master ++ ++jobs: ++ ++# ++# Linux ++# ++ Ubuntu: ++ name: Ubuntu ++ runs-on: ubuntu-20.04 ++ strategy: ++ matrix: ++ arch: [x64_64, aarch64, armv7, ppc64le] ++ steps: ++ - uses: actions/checkout@v2 ++ with: ++ arch: ${{ matrix.arch }} ++ - name: prepare ++ run: | ++ sudo apt-get update ++ sudo apt-get install -y build-essential sqlite3 ++ - name: build ++ run: | ++ autoreconf -vfi ++ ./configure --enable-all ++ make ++ sudo make install diff --git a/fc1dd37d422fc907416afd028514fff59b63ae12.patch b/fc1dd37d422fc907416afd028514fff59b63ae12.patch new file mode 100644 index 
0000000000000000000000000000000000000000..460d2c1f974db84fe927d2b11ef1add691349712 --- /dev/null +++ b/fc1dd37d422fc907416afd028514fff59b63ae12.patch @@ -0,0 +1,30 @@ +commit fc1dd37d422fc907416afd028514fff59b63ae12 +Author: DmNosachev +Date: Wed Jun 30 16:49:18 2021 +0300 + + labels/supermicro: added Supermicro B1DRi + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 373de07..b924a32 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -105,4 +105,14 @@ Vendor: Supermicro + P2-DIMMC1: 2.2.0; + P2-DIMMD1: 3.0.0; + P2-DIMME1: 3.1.0; +- P2-DIMMF1: 3.2.0; +\ No newline at end of file ++ P2-DIMMF1: 3.2.0; ++ ++ Model: B1DRi ++ P1_DIMMA1: 0.0.0; ++ P1_DIMMB1: 0.1.0; ++ P1_DIMMC1: 0.2.0; ++ P1_DIMMD1: 0.3.0; ++ P2_DIMME1: 1.0.0; ++ P2_DIMMF1: 1.1.0; ++ P2_DIMMG1: 1.2.0; ++ P2_DIMMH1: 1.3.0; +\ No newline at end of file diff --git a/fcdffdcb28ece67ed78e3575a3dce45d9dd4f015.patch b/fcdffdcb28ece67ed78e3575a3dce45d9dd4f015.patch new file mode 100644 index 0000000000000000000000000000000000000000..a549df75ef10d4e6a2c6d468a58ae4497a92df5d --- /dev/null +++ b/fcdffdcb28ece67ed78e3575a3dce45d9dd4f015.patch @@ -0,0 +1,28 @@ +commit fcdffdcb28ece67ed78e3575a3dce45d9dd4f015 +Author: Mauro Carvalho Chehab +Date: Wed May 26 10:37:52 2021 +0200 + + rasdaemon.spec.in: Fix the description on this example file + + While this is used just to test if building it is OK, better + to keep the logs nice ;-) + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 6ef223f..afa4359 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -61,10 +61,10 @@ rm INSTALL %{buildroot}/usr/include/*.h + %changelog + + * Wed May 26 2021 Mauro Carvalho Chehab 0.6.7-1 +-- Bump to version 0.6.5 with several fixes and additions ++- Bump to version 0.6.7 with several fixes and additions + + * Tue Jul 21 2020 Mauro Carvalho Chehab 0.6.6-1 +-- Bump to version 0.6.5 with several fixes, 
new hip08 events and memory prediction analysis ++- Bump to version 0.6.6 with several fixes, new hip08 events and memory prediction analysis + + * Wed Nov 20 2019 Mauro Carvalho Chehab 0.6.5-1 + - Bump to version 0.6.5 with several fixes and improves PCIe events record diff --git a/add_upstream_labels.patch b/labels.patch similarity index 72% rename from add_upstream_labels.patch rename to labels.patch index 70a04dfa0b4855afd86dcfe89f02289275decb26..3eb072ecea477a31a1c3f6bcd52baac7f8b140c2 100644 --- a/add_upstream_labels.patch +++ b/labels.patch @@ -1,9 +1,40 @@ +Add labels directory from upstream + +Labels directory doesn't get exported by tarball releases. + +Signed-off-by: Aristeu Rozanski + --- - labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 152 insertions(+) + labels/asus | 20 +++++++ + labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + labels/supermicro | 70 ++++++++++++++++++++++++ + 3 files changed, 242 insertions(+) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/labels/dell 2020-02-20 11:53:39.574579258 -0500 ++++ rasdaemon-0.6.7/labels/asus 2022-02-08 15:44:53.563362010 -0500 +@@ -0,0 +1,20 @@ ++# RASDAEMON Motherboard DIMM labels Database file. ++# ++# Vendor-name and model-name are found from the program 'dmidecode' ++# labels are found from the silk screen on the motherboard. ++# ++#Vendor: ++# Product: ++# Model: ++#