From f5f4d03a84c3dcfbf6a1972d3b643dc47ff9b8a1 Mon Sep 17 00:00:00 2001 From: Zhao Hang Date: Wed, 26 Jul 2023 17:51:35 +0800 Subject: [PATCH] update to rasdaemon-0.6.7-8.el9 Signed-off-by: Zhao Hang --- ...n-add-rbtree-support-for-page-record.patch | 584 --------------- ...pport-for-memory-Corrected-Error-pre.patch | 646 ----------------- ...tification-support-when-page-goes-of.patch | 259 ------- ...a096c3a1d0f993703ab3299f1ddfadf53d7f.patch | 85 --- ...rasdaemon-avoid-multiple-definitions.patch | 24 - ...29b024c31d54a7f8a72eab094376c7be27f5.patch | 32 - ...f3d2a0fcd48add9462567c30fe0e14585fb4.patch | 32 + ...d65b97311dd5736838f1e285355f7f357046.patch | 538 -------------- ...217660351c08eb2f8bccebf939abba2f7e69.patch | 66 -- ...a26dcec389723f75d69d3da9c2f15f6c317d.patch | 63 ++ ...54b0d31e02e657171fd27f4e31d996756bc6.patch | 44 ++ ...5005b10fe909c66f1c90f2feb95712427c7d.patch | 43 ++ ...f713f667437fb6e283cc3dc090679eb47d08.patch | 372 ---------- ...1e4da4f2daf2b10143fc148a8043312b61e5.patch | 149 ---- ...3db1b6b3d73805179c21d1dd5521e8dc0f74.patch | 37 + ...afafdcb2e8b0ced32fff31b13754d571090b.patch | 610 ++++++++++++++++ ...12f5ae26a055926d175d908c7930293438c4.patch | 26 + ...64ba44aee9bc5646f6537fc744b0b54aff37.patch | 38 - ...a85d8dc3483423ec2934fee8132f85f8fdb6.patch | 207 ------ ...b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch | 51 ++ ...aed97b21af31064d9995ffcfaac0e9d7983e.patch | 40 ++ ...a0711001957ee98f2c124abce0fa1f801529.patch | 670 ------------------ ...76ed94f68ae31d7b5f74e19545698898c13c.patch | 138 ---- ...e68453b2497e86cbd273b9cd56fadc5859e3.patch | 37 - ...2d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch | 30 + ...a3d6a39d402c41065e9284d49114b97e3bfe.patch | 148 ---- ...e5c65ed5a42eaa97aa3659854add6d808da5.patch | 94 --- ...7864f11f709c4f803828fbc8e507d115d03b.patch | 611 ---------------- ...bb3d73c4bc5060da20270a089857bba2a64c.patch | 42 ++ ...d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch | 27 + dist | 1 + download | 2 +- ...3ec0add059fa897f844349e1a2345d81713c.patch | 31 + ...d720297cd17e405a7170c04df89d1d9536f8.patch | 48 ++ ...d37d422fc907416afd028514fff59b63ae12.patch | 30 + ...fdcb28ece67ed78e3575a3dce45d9dd4f015.patch | 28 + add_upstream_labels.patch => labels.patch | 110 ++- ...c-ctl-Fix-script-to-parse-dimm-sizes.patch | 47 -- rasdaemon.spec | 166 ++--- 39 files changed, 1378 insertions(+), 4828 deletions(-) delete mode 100644 0021-rasdaemon-add-rbtree-support-for-page-record.patch delete mode 100644 0022-rasdaemon-add-support-for-memory-Corrected-Error-pre.patch delete mode 100644 0023-rasdaemon-Add-notification-support-when-page-goes-of.patch delete mode 100644 0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch delete mode 100644 1001-rasdaemon-avoid-multiple-definitions.patch delete mode 100644 16d929b024c31d54a7f8a72eab094376c7be27f5.patch create mode 100644 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4.patch delete mode 100644 2290d65b97311dd5736838f1e285355f7f357046.patch delete mode 100644 2a1d217660351c08eb2f8bccebf939abba2f7e69.patch create mode 100644 2b37a26dcec389723f75d69d3da9c2f15f6c317d.patch create mode 100644 2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch create mode 100644 50565005b10fe909c66f1c90f2feb95712427c7d.patch delete mode 100644 546cf713f667437fb6e283cc3dc090679eb47d08.patch delete mode 100644 60a91e4da4f2daf2b10143fc148a8043312b61e5.patch create mode 100644 6bc43db1b6b3d73805179c21d1dd5521e8dc0f74.patch create mode 100644 738bafafdcb2e8b0ced32fff31b13754d571090b.patch create mode 100644 7ccf12f5ae26a055926d175d908c7930293438c4.patch delete mode 100644 854364ba44aee9bc5646f6537fc744b0b54aff37.patch delete mode 100644 8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch create mode 100644 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch create mode 100644 9a5baed97b21af31064d9995ffcfaac0e9d7983e.patch delete mode 100644 a16ca0711001957ee98f2c124abce0fa1f801529.patch delete mode 100644 a8c776ed94f68ae31d7b5f74e19545698898c13c.patch delete mode 100644 b22be68453b2497e86cbd273b9cd56fadc5859e3.patch create mode 100644 b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch delete mode 100644 b497a3d6a39d402c41065e9284d49114b97e3bfe.patch delete mode 100644 cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch delete mode 100644 ce6e7864f11f709c4f803828fbc8e507d115d03b.patch create mode 100644 d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch create mode 100644 dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch create mode 100644 dist create mode 100644 ec443ec0add059fa897f844349e1a2345d81713c.patch create mode 100644 f7cdd720297cd17e405a7170c04df89d1d9536f8.patch create mode 100644 fc1dd37d422fc907416afd028514fff59b63ae12.patch create mode 100644 fcdffdcb28ece67ed78e3575a3dce45d9dd4f015.patch rename add_upstream_labels.patch => labels.patch (72%) delete mode 100644 rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch diff --git a/0021-rasdaemon-add-rbtree-support-for-page-record.patch b/0021-rasdaemon-add-rbtree-support-for-page-record.patch deleted file mode 100644 index 4a76a89..0000000 --- a/0021-rasdaemon-add-rbtree-support-for-page-record.patch +++ /dev/null @@ -1,584 +0,0 @@ -From 27794f4a5ff1453490bbcd805ad8e5b54516f015 Mon Sep 17 00:00:00 2001 -From: wuyun -Date: Sat, 20 Jun 2020 20:26:21 +0800 -Subject: [PATCH] rasdaemon: add rbtree support for page record - -commit 5fd96f457262052f7d06435af8a49689ffb6ffcf upstream - -The rbtree is very efficient for recording and querying fault page info. - -Signed-off-by: wuyun -Signed-off-by: lvying6 -Signed-off-by: Mauro Carvalho Chehab -Signed-off-by: Bixuan Cui ---- - rbtree.c | 384 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - rbtree.h | 165 +++++++++++++++++++++++++++ - 2 files changed, 549 insertions(+) - create mode 100644 rbtree.c - create mode 100644 rbtree.h - -diff --git a/rbtree.c b/rbtree.c -new file mode 100644 -index 0000000..d9b1bd4 ---- /dev/null -+++ b/rbtree.c -@@ -0,0 +1,384 @@ -+/* -+ Red Black Trees -+ (C) 1999 Andrea Arcangeli -+ (C) 2002 David Woodhouse -+ Taken from the Linux 2.6.30 source with some minor modificatons. -+ -+ This program is free software; you can redistribute it and/or modify -+ it under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 2 of the License, or -+ (at your option) any later version. -+ -+ This program is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with this program; if not, write to the Free Software -+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ -+ linux/lib/rbtree.c -+*/ -+ -+#include "rbtree.h" -+ -+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) -+{ -+ struct rb_node *right = node->rb_right; -+ struct rb_node *parent = rb_parent(node); -+ -+ if ((node->rb_right = right->rb_left)) -+ rb_set_parent(right->rb_left, node); -+ right->rb_left = node; -+ -+ rb_set_parent(right, parent); -+ -+ if (parent) -+ { -+ if (node == parent->rb_left) -+ parent->rb_left = right; -+ else -+ parent->rb_right = right; -+ } -+ else -+ root->rb_node = right; -+ rb_set_parent(node, right); -+} -+ -+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) -+{ -+ struct rb_node *left = node->rb_left; -+ struct rb_node *parent = rb_parent(node); -+ -+ if ((node->rb_left = left->rb_right)) -+ rb_set_parent(left->rb_right, node); -+ left->rb_right = node; -+ -+ rb_set_parent(left, parent); -+ -+ if (parent) -+ { -+ if (node == parent->rb_right) -+ parent->rb_right = left; -+ else -+ parent->rb_left = left; -+ } -+ else -+ root->rb_node = left; -+ rb_set_parent(node, left); -+} -+ -+void rb_insert_color(struct rb_node *node, struct rb_root *root) -+{ -+ struct rb_node *parent, *gparent; -+ -+ while ((parent = rb_parent(node)) && rb_is_red(parent)) -+ { -+ gparent = rb_parent(parent); -+ -+ if (parent == gparent->rb_left) -+ { -+ { -+ register struct rb_node *uncle = gparent->rb_right; -+ if (uncle && rb_is_red(uncle)) -+ { -+ rb_set_black(uncle); -+ rb_set_black(parent); -+ rb_set_red(gparent); -+ node = gparent; -+ continue; -+ } -+ } -+ -+ if (parent->rb_right == node) -+ { -+ struct rb_node *tmp; -+ __rb_rotate_left(parent, root); -+ tmp = parent; -+ parent = node; -+ node = tmp; -+ } -+ -+ rb_set_black(parent); -+ rb_set_red(gparent); -+ __rb_rotate_right(gparent, root); -+ } else { -+ { -+ struct rb_node *uncle = gparent->rb_left; -+ if (uncle && rb_is_red(uncle)) -+ { -+ rb_set_black(uncle); -+ rb_set_black(parent); -+ rb_set_red(gparent); -+ node = gparent; -+ continue; -+ } -+ } -+ -+ if (parent->rb_left == node) -+ { -+ struct rb_node *tmp; -+ __rb_rotate_right(parent, root); -+ tmp = parent; -+ parent = node; -+ node = tmp; -+ } -+ -+ rb_set_black(parent); -+ rb_set_red(gparent); -+ __rb_rotate_left(gparent, root); -+ } -+ } -+ -+ rb_set_black(root->rb_node); -+} -+ -+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, -+ struct rb_root *root) -+{ -+ struct rb_node *other; -+ -+ while ((!node || rb_is_black(node)) && node != root->rb_node) -+ { -+ if (parent->rb_left == node) -+ { -+ other = parent->rb_right; -+ if (rb_is_red(other)) -+ { -+ rb_set_black(other); -+ rb_set_red(parent); -+ __rb_rotate_left(parent, root); -+ other = parent->rb_right; -+ } -+ if ((!other->rb_left || rb_is_black(other->rb_left)) && -+ (!other->rb_right || rb_is_black(other->rb_right))) -+ { -+ rb_set_red(other); -+ node = parent; -+ parent = rb_parent(node); -+ } -+ else -+ { -+ if (!other->rb_right || rb_is_black(other->rb_right)) -+ { -+ rb_set_black(other->rb_left); -+ rb_set_red(other); -+ __rb_rotate_right(other, root); -+ other = parent->rb_right; -+ } -+ rb_set_color(other, rb_color(parent)); -+ rb_set_black(parent); -+ rb_set_black(other->rb_right); -+ __rb_rotate_left(parent, root); -+ node = root->rb_node; -+ break; -+ } -+ } -+ else -+ { -+ other = parent->rb_left; -+ if (rb_is_red(other)) -+ { -+ rb_set_black(other); -+ rb_set_red(parent); -+ __rb_rotate_right(parent, root); -+ other = parent->rb_left; -+ } -+ if ((!other->rb_left || rb_is_black(other->rb_left)) && -+ (!other->rb_right || rb_is_black(other->rb_right))) -+ { -+ rb_set_red(other); -+ node = parent; -+ parent = rb_parent(node); -+ } -+ else -+ { -+ if (!other->rb_left || rb_is_black(other->rb_left)) -+ { -+ rb_set_black(other->rb_right); -+ rb_set_red(other); -+ __rb_rotate_left(other, root); -+ other = parent->rb_left; -+ } -+ rb_set_color(other, rb_color(parent)); -+ rb_set_black(parent); -+ rb_set_black(other->rb_left); -+ __rb_rotate_right(parent, root); -+ node = root->rb_node; -+ break; -+ } -+ } -+ } -+ if (node) -+ rb_set_black(node); -+} -+ -+void rb_erase(struct rb_node *node, struct rb_root *root) -+{ -+ struct rb_node *child, *parent; -+ int color; -+ -+ if (!node->rb_left) -+ child = node->rb_right; -+ else if (!node->rb_right) -+ child = node->rb_left; -+ else -+ { -+ struct rb_node *old = node, *left; -+ -+ node = node->rb_right; -+ while ((left = node->rb_left) != NULL) -+ node = left; -+ child = node->rb_right; -+ parent = rb_parent(node); -+ color = rb_color(node); -+ -+ if (child) -+ rb_set_parent(child, parent); -+ if (parent == old) { -+ parent->rb_right = child; -+ parent = node; -+ } else -+ parent->rb_left = child; -+ -+ node->rb_parent_color = old->rb_parent_color; -+ node->rb_right = old->rb_right; -+ node->rb_left = old->rb_left; -+ -+ if (rb_parent(old)) -+ { -+ if (rb_parent(old)->rb_left == old) -+ rb_parent(old)->rb_left = node; -+ else -+ rb_parent(old)->rb_right = node; -+ } else -+ root->rb_node = node; -+ -+ rb_set_parent(old->rb_left, node); -+ if (old->rb_right) -+ rb_set_parent(old->rb_right, node); -+ goto color; -+ } -+ -+ parent = rb_parent(node); -+ color = rb_color(node); -+ -+ if (child) -+ rb_set_parent(child, parent); -+ if (parent) -+ { -+ if (parent->rb_left == node) -+ parent->rb_left = child; -+ else -+ parent->rb_right = child; -+ } -+ else -+ root->rb_node = child; -+ -+ color: -+ if (color == RB_BLACK) -+ __rb_erase_color(child, parent, root); -+} -+ -+/* -+ * This function returns the first node (in sort order) of the tree. -+ */ -+struct rb_node *rb_first(const struct rb_root *root) -+{ -+ struct rb_node *n; -+ -+ n = root->rb_node; -+ if (!n) -+ return NULL; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+} -+ -+struct rb_node *rb_last(const struct rb_root *root) -+{ -+ struct rb_node *n; -+ -+ n = root->rb_node; -+ if (!n) -+ return NULL; -+ while (n->rb_right) -+ n = n->rb_right; -+ return n; -+} -+ -+struct rb_node *rb_next(const struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+ if (rb_parent(node) == node) -+ return NULL; -+ -+ /* If we have a right-hand child, go down and then left as far -+ as we can. */ -+ if (node->rb_right) { -+ node = node->rb_right; -+ while (node->rb_left) -+ node=node->rb_left; -+ return (struct rb_node *)node; -+ } -+ -+ /* No right-hand children. Everything down and left is -+ smaller than us, so any 'next' node must be in the general -+ direction of our parent. Go up the tree; any time the -+ ancestor is a right-hand child of its parent, keep going -+ up. First time it's a left-hand child of its parent, said -+ parent is our 'next' node. */ -+ while ((parent = rb_parent(node)) && node == parent->rb_right) -+ node = parent; -+ -+ return parent; -+} -+ -+struct rb_node *rb_prev(const struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+ if (rb_parent(node) == node) -+ return NULL; -+ -+ /* If we have a left-hand child, go down and then right as far -+ as we can. */ -+ if (node->rb_left) { -+ node = node->rb_left; -+ while (node->rb_right) -+ node=node->rb_right; -+ return (struct rb_node *)node; -+ } -+ -+ /* No left-hand children. Go up till we find an ancestor which -+ is a right-hand child of its parent */ -+ while ((parent = rb_parent(node)) && node == parent->rb_left) -+ node = parent; -+ -+ return parent; -+} -+ -+void rb_replace_node(struct rb_node *victim, struct rb_node *new, -+ struct rb_root *root) -+{ -+ struct rb_node *parent = rb_parent(victim); -+ -+ /* Set the surrounding nodes to point to the replacement */ -+ if (parent) { -+ if (victim == parent->rb_left) -+ parent->rb_left = new; -+ else -+ parent->rb_right = new; -+ } else { -+ root->rb_node = new; -+ } -+ if (victim->rb_left) -+ rb_set_parent(victim->rb_left, new); -+ if (victim->rb_right) -+ rb_set_parent(victim->rb_right, new); -+ -+ /* Copy the pointers/colour from the victim to the replacement */ -+ *new = *victim; -+} -diff --git a/rbtree.h b/rbtree.h -new file mode 100644 -index 0000000..a8a0459 ---- /dev/null -+++ b/rbtree.h -@@ -0,0 +1,165 @@ -+/* -+ Red Black Trees -+ (C) 1999 Andrea Arcangeli -+ Taken from the Linux 2.6.30 source. -+ -+ This program is free software; you can redistribute it and/or modify -+ it under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 2 of the License, or -+ (at your option) any later version. -+ -+ This program is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with this program; if not, write to the Free Software -+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ -+ linux/include/linux/rbtree.h -+ -+ To use rbtrees you'll have to implement your own insert and search cores. -+ This will avoid us to use callbacks and to drop drammatically performances. -+ I know it's not the cleaner way, but in C (not in C++) to get -+ performances and genericity... -+ -+ Some example of insert and search follows here. The search is a plain -+ normal search over an ordered tree. The insert instead must be implemented -+ int two steps: as first thing the code must insert the element in -+ order as a red leaf in the tree, then the support library function -+ rb_insert_color() must be called. Such function will do the -+ not trivial work to rebalance the rbtree if necessary. -+ -+----------------------------------------------------------------------- -+static inline struct page * rb_search_page_cache(struct inode * inode, -+ unsigned long offset) -+{ -+ struct rb_node * n = inode->i_rb_page_cache.rb_node; -+ struct page * page; -+ -+ while (n) -+ { -+ page = rb_entry(n, struct page, rb_page_cache); -+ -+ if (offset < page->offset) -+ n = n->rb_left; -+ else if (offset > page->offset) -+ n = n->rb_right; -+ else -+ return page; -+ } -+ return NULL; -+} -+ -+static inline struct page * __rb_insert_page_cache(struct inode * inode, -+ unsigned long offset, -+ struct rb_node * node) -+{ -+ struct rb_node ** p = &inode->i_rb_page_cache.rb_node; -+ struct rb_node * parent = NULL; -+ struct page * page; -+ -+ while (*p) -+ { -+ parent = *p; -+ page = rb_entry(parent, struct page, rb_page_cache); -+ -+ if (offset < page->offset) -+ p = &(*p)->rb_left; -+ else if (offset > page->offset) -+ p = &(*p)->rb_right; -+ else -+ return page; -+ } -+ -+ rb_link_node(node, parent, p); -+ -+ return NULL; -+} -+ -+static inline struct page * rb_insert_page_cache(struct inode * inode, -+ unsigned long offset, -+ struct rb_node * node) -+{ -+ struct page * ret; -+ if ((ret = __rb_insert_page_cache(inode, offset, node))) -+ goto out; -+ rb_insert_color(node, &inode->i_rb_page_cache); -+ out: -+ return ret; -+} -+----------------------------------------------------------------------- -+*/ -+ -+#ifndef _LINUX_RBTREE_H -+#define _LINUX_RBTREE_H -+ -+#include -+ -+#define container_of(ptr, type, member) ({ \ -+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ -+ (type *)( (char *)__mptr - offsetof(type,member) );}) -+ -+struct rb_node -+{ -+ unsigned long rb_parent_color; -+#define RB_RED 0 -+#define RB_BLACK 1 -+ struct rb_node *rb_right; -+ struct rb_node *rb_left; -+} __attribute__((aligned(sizeof(long)))); -+ /* The alignment might seem pointless, but allegedly CRIS needs it */ -+ -+struct rb_root -+{ -+ struct rb_node *rb_node; -+}; -+ -+ -+#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) -+#define rb_color(r) ((r)->rb_parent_color & 1) -+#define rb_is_red(r) (!rb_color(r)) -+#define rb_is_black(r) rb_color(r) -+#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) -+#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) -+ -+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -+{ -+ rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; -+} -+static inline void rb_set_color(struct rb_node *rb, int color) -+{ -+ rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; -+} -+ -+#define RB_ROOT (struct rb_root) { NULL, } -+#define rb_entry(ptr, type, member) container_of(ptr, type, member) -+ -+#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -+#define RB_EMPTY_NODE(node) (rb_parent(node) == node) -+#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) -+ -+extern void rb_insert_color(struct rb_node *, struct rb_root *); -+extern void rb_erase(struct rb_node *, struct rb_root *); -+ -+/* Find logical next and previous nodes in a tree */ -+extern struct rb_node *rb_next(const struct rb_node *); -+extern struct rb_node *rb_prev(const struct rb_node *); -+extern struct rb_node *rb_first(const struct rb_root *); -+extern struct rb_node *rb_last(const struct rb_root *); -+ -+/* Fast replacement of a single node without remove/rebalance/add/rebalance */ -+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, -+ struct rb_root *root); -+ -+static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, -+ struct rb_node ** rb_link) -+{ -+ node->rb_parent_color = (unsigned long )parent; -+ node->rb_left = node->rb_right = NULL; -+ -+ *rb_link = node; -+} -+ -+#endif /* _LINUX_RBTREE_H */ --- -1.8.3.1 - diff --git a/0022-rasdaemon-add-support-for-memory-Corrected-Error-pre.patch b/0022-rasdaemon-add-support-for-memory-Corrected-Error-pre.patch deleted file mode 100644 index 531865f..0000000 --- a/0022-rasdaemon-add-support-for-memory-Corrected-Error-pre.patch +++ /dev/null @@ -1,646 +0,0 @@ -From c62d0466b0e69ac8c724c9d917000f18aa147aae Mon Sep 17 00:00:00 2001 -From: wuyun -Date: Sat, 20 Jun 2020 20:26:22 +0800 -Subject: [PATCH] rasdaemon: add support for memory Corrected Error predictive failure analysis - -commit 9ae6b70effb8adc9572debc800b8e16173f74bb8 upstream - -Memory Corrected Error was corrected by hardware. These errors do not -require immediate software actions, but are still reported for -accounting and predictive failure analysis. - -Based on statistical results, some actions can be taken to prevent -Corrected Error from evoluting to Uncorrected Error. - -Signed-off-by: wuyun -Signed-off-by: lvying6 -Signed-off-by: Mauro Carvalho Chehab -Signed-off-by: Bixuan Cui ---- - Makefile.am | 7 +- - configure.ac | 12 ++ - man/rasdaemon.1.in | 7 + - misc/rasdaemon.env | 29 ++++ - misc/rasdaemon.service.in | 1 + - misc/rasdaemon.spec.in | 4 +- - ras-events.c | 6 + - ras-mc-handler.c | 7 + - ras-page-isolation.c | 332 ++++++++++++++++++++++++++++++++++++++ - ras-page-isolation.h | 66 ++++++++ - 10 files changed, 468 insertions(+), 3 deletions(-) - create mode 100644 misc/rasdaemon.env - create mode 100644 ras-page-isolation.c - create mode 100644 ras-page-isolation.h - -diff --git a/Makefile.am b/Makefile.am -index fccdeba..dc30ae7 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -1,6 +1,6 @@ - ACLOCAL_AMFLAGS=-I m4 - SUBDIRS = libtrace util man --SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in -+SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in misc/rasdaemon.env - SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) - EXTRA_DIST = $(SYSTEMD_SERVICES_IN) - -@@ -51,13 +51,16 @@ endif - if WITH_HISI_NS_DECODE - rasdaemon_SOURCES += non-standard-hisi_hip07.c - endif -+if WITH_MEMORY_CE_PFA -+ rasdaemon_SOURCES += rbtree.c ras-page-isolation.c -+endif - - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ -- ras-memory-failure-handler.h -+ ras-memory-failure-handler.h rbtree.h ras-page-isolation.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that -diff --git a/configure.ac b/configure.ac -index 8be33d9..1f95459 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -108,6 +108,17 @@ AS_IF([test "x$enable_hisi_ns_decode" = "xyes"], [ - ]) - AM_CONDITIONAL([WITH_HISI_NS_DECODE], [test x$enable_hisi_ns_decode = xyes]) - -+AC_ARG_ENABLE([memory_ce_pfa], -+ AS_HELP_STRING([--enable-memory-ce-pfa], [enable memory Corrected Error predictive failure analysis])) -+ -+AS_IF([test "x$enable_memory_ce_pfa" = "xyes"], [ -+ AC_DEFINE(HAVE_MEMORY_CE_PFA,1,"have memory corrected error predictive failure analysis") -+ AC_SUBST([WITH_MEMORY_CE_PFA]) -+]) -+AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes]) -+AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"]) -+ -+ - test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc - - CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" -@@ -138,4 +149,5 @@ compile time options summary - HIP07 SAS HW errors : $enable_hisi_ns_decode - ARM events : $enable_arm - Memory Failure : $USE_MEMORY_FAILURE -+ Memory CE PFA : $enable_memory_ce_pfa - EOF -diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in -index 834df16..833c8e1 100644 ---- a/man/rasdaemon.1.in -+++ b/man/rasdaemon.1.in -@@ -62,6 +62,13 @@ feature. - .BI "--version" - Print the program version and exit. - -+.SH CONFIG FILE -+ -+The \fBrasdaemon\fR program supports a config file to set rasdaemon systemd service -+environment variables. By default the config file is read from /etc/sysconfig/rasdaemon. -+ -+The general format is environmentname=value. -+ - .SH SEE ALSO - \fBras-mc-ctl\fR(8) - -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -new file mode 100644 -index 0000000..12fd766 ---- /dev/null -+++ b/misc/rasdaemon.env -@@ -0,0 +1,29 @@ -+# Page Isolation -+# Note: Run-time configuration is unsupported, service restart needed. -+# Note: this file should be installed at /etc/sysconfig/rasdaemon -+ -+# Specify the threshold of isolating buggy pages. -+# -+# Format: -+# [0-9]+[unit] -+# Notice: please make sure match this format, rasdaemon will use default value for exception input cases. -+# -+# Supported units: -+# PAGE_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default is in hour -+# PAGE_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none -+# -+# The two configs will only take no effect when PAGE_CE_ACTION is "off". -+PAGE_CE_REFRESH_CYCLE="24h" -+PAGE_CE_THRESHOLD="50" -+ -+# Specify the internal action in rasdaemon to exceeding a page error threshold. -+# -+# off no action -+# account only account errors -+# soft try to soft-offline page without killing any processes -+# This requires an uptodate kernel. Might not be successfull. -+# hard try to hard-offline page by killing processes -+# Requires an uptodate kernel. Might not be successfull. -+# soft-then-hard First try to soft offline, then try hard offlining. -+# Note: default offline choice is "soft". -+PAGE_CE_ACTION="soft" -diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in -index be9ad5a..e73a08a 100644 ---- a/misc/rasdaemon.service.in -+++ b/misc/rasdaemon.service.in -@@ -3,6 +3,7 @@ Description=RAS daemon to log the RAS events - After=syslog.target - - [Service] -+EnvironmentFile=/etc/sysconfig/rasdaemon - ExecStart=@sbindir@/rasdaemon -f -r - ExecStartPost=@sbindir@/rasdaemon --enable - ExecStop=@sbindir@/rasdaemon --disable -diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in -index 82fae30..f5faffe 100644 ---- a/misc/rasdaemon.spec.in -+++ b/misc/rasdaemon.spec.in -@@ -36,12 +36,13 @@ an utility for reporting current error counts from the EDAC sysfs files. - %setup -q - - %build --%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm -+%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-ce-pfa - - make %{?_smp_mflags} - - %install - make install DESTDIR=%{buildroot} -+install -D -p -m 0644 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} - install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service - install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service - rm INSTALL %{buildroot}/usr/include/*.h -@@ -54,6 +55,7 @@ rm INSTALL %{buildroot}/usr/include/*.h - %{_unitdir}/*.service - %{_sharedstatedir}/rasdaemon - %{_sysconfdir}/ras/dimm_labels.d -+%config(noreplace) %{_sysconfdir}/sysconfig/%{name} - - %changelog - -diff --git a/ras-events.c b/ras-events.c -index 27ac1ab..5113c32 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -36,6 +36,7 @@ - #include "ras-memory-failure-handler.h" - #include "ras-record.h" - #include "ras-logger.h" -+#include "ras-page-isolation.h" - - /* - * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -673,6 +674,11 @@ int handle_ras_events(int record_events) - ras->page_size = page_size; - ras->record_events = record_events; - -+#ifdef HAVE_MEMORY_CE_PFA -+ /* FIXME: enable memory isolation unconditionally */ -+ ras_page_account_init(); -+#endif -+ - rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event", - ras_mc_event_handler); - if (!rc) -diff --git a/ras-mc-handler.c b/ras-mc-handler.c -index deb7e05..42b05cd 100644 ---- a/ras-mc-handler.c -+++ b/ras-mc-handler.c -@@ -23,6 +23,7 @@ - #include "ras-mc-handler.h" - #include "ras-record.h" - #include "ras-logger.h" -+#include "ras-page-isolation.h" - #include "ras-report.h" - - int ras_mc_event_handler(struct trace_seq *s, -@@ -183,6 +184,12 @@ int ras_mc_event_handler(struct trace_seq *s, - - ras_store_mc_event(ras, &ev); - -+#ifdef HAVE_MEMORY_CE_PFA -+ /* Account page corrected errors */ -+ if (!strcmp(ev.error_type, "Corrected")) -+ ras_record_page_error(ev.address, ev.error_count, now); -+#endif -+ - #ifdef HAVE_ABRT_REPORT - /* Report event to ABRT */ - ras_report_mc_event(ras, &ev); -diff --git a/ras-page-isolation.c b/ras-page-isolation.c -new file mode 100644 -index 0000000..50e4406 ---- /dev/null -+++ b/ras-page-isolation.c -@@ -0,0 +1,332 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include "ras-logger.h" -+#include "ras-page-isolation.h" -+ -+#define PARSED_ENV_LEN 50 -+static const struct config threshold_units[] = { -+ { "m", 1000 }, -+ { "k", 1000 }, -+ { "", 1 }, -+ {} -+}; -+ -+static const struct config cycle_units[] = { -+ { "d", 24 }, -+ { "h", 60 }, -+ { "m", 60 }, -+ { "s", 1 }, -+ {} -+}; -+ -+static struct isolation threshold = { -+ .name = "PAGE_CE_THRESHOLD", -+ .units = threshold_units, -+ .env = "50", -+ .unit = "", -+}; -+ -+static struct isolation cycle = { -+ .name = "PAGE_CE_REFRESH_CYCLE", -+ .units = cycle_units, -+ .env = "24h", -+ .unit = "h", -+}; -+ -+static const char *kernel_offline[] = { -+ [OFFLINE_SOFT] = "/sys/devices/system/memory/soft_offline_page", -+ [OFFLINE_HARD] = "/sys/devices/system/memory/hard_offline_page", -+ [OFFLINE_SOFT_THEN_HARD] = "/sys/devices/system/memory/soft_offline_page", -+}; -+ -+static const struct config offline_choice[] = { -+ { "off", OFFLINE_OFF }, -+ { "account", OFFLINE_ACCOUNT }, -+ { "soft", OFFLINE_SOFT }, -+ { "hard", OFFLINE_HARD }, -+ { "soft-then-hard", OFFLINE_SOFT_THEN_HARD }, -+ {} -+}; -+ -+static const char *page_state[] = { -+ [PAGE_ONLINE] = "online", -+ [PAGE_OFFLINE] = "offlined", -+ [PAGE_OFFLINE_FAILED] = "offline-failed", -+}; -+ -+static enum otype offline = OFFLINE_SOFT; -+static struct rb_root page_records; -+ -+static void page_offline_init(void) -+{ -+ const char *env = "PAGE_CE_ACTION"; -+ char *choice = getenv(env); -+ const struct config *c = NULL; -+ int matched = 0; -+ -+ if (choice) { -+ for (c = offline_choice; c->name; c++) { -+ if (!strcasecmp(choice, c->name)) { -+ offline = c->val; -+ matched = 1; -+ break; -+ } -+ } -+ } -+ -+ if (!matched) -+ log(TERM, LOG_INFO, "Improper %s, set to default soft\n", env); -+ -+ if (offline > OFFLINE_ACCOUNT && access(kernel_offline[offline], W_OK)) { -+ log(TERM, LOG_INFO, "Kernel does not support page offline interface\n"); -+ offline = OFFLINE_ACCOUNT; -+ } -+ -+ log(TERM, LOG_INFO, "Page offline choice on Corrected Errors is %s\n", -+ offline_choice[offline].name); -+} -+ -+static void parse_isolation_env(struct isolation *config) -+{ -+ char *env = getenv(config->name); -+ char *unit = NULL; -+ const struct config *units = NULL; -+ int i, no_unit; -+ int valid = 0; -+ int unit_matched = 0; -+ unsigned long value, tmp; -+ -+ /* check if env is vaild */ -+ if (env && strlen(env)) { -+ /* All the character before unit must be digit */ -+ for (i = 0; i < strlen(env) - 1; i++) { -+ if (!isdigit(env[i])) -+ goto parse; -+ } -+ if (sscanf(env, "%lu", &value) < 1 || !value) -+ goto parse; -+ /* check if the unit is vaild */ -+ unit = env + strlen(env) - 1; -+ /* no unit, all the character are value character */ -+ if (isdigit(*unit)) { -+ valid = 1; -+ no_unit = 1; -+ goto parse; -+ } -+ for (units = config->units; units->name; units++) { -+ /* value character and unit character are both valid */ -+ if (!strcasecmp(unit, units->name)) { -+ valid = 1; -+ no_unit = 0; -+ break; -+ } -+ } -+ } -+ -+parse: -+ /* if invalid, use default env */ -+ if (valid) { -+ config->env = env; -+ if (!no_unit) -+ config->unit = unit; -+ } else { -+ log(TERM, LOG_INFO, "Improper %s, set to default %s.\n", -+ config->name, config->env); -+ } -+ -+ /* if env value string is greater than ulong_max, truncate the last digit */ -+ sscanf(config->env, "%lu", &value); -+ for (units = config->units; units->name; units++) { -+ if (!strcasecmp(config->unit, units->name)) -+ unit_matched = 1; -+ if (unit_matched) { -+ tmp = value; -+ value *= units->val; -+ if (tmp != 0 && value / tmp != units->val) -+ config->overflow = true; -+ } -+ } -+ config->val = value; -+ /* In order to output value and unit perfectly */ -+ config->unit = no_unit ? config->unit : ""; -+} -+ -+static void parse_env_string(struct isolation *config, char *str) -+{ -+ int i; -+ -+ if (config->overflow) { -+ /* when overflow, use basic unit */ -+ for (i = 0; config->units[i].name; i++) ; -+ sprintf(str, "%lu%s", config->val, config->units[i-1].name); -+ log(TERM, LOG_INFO, "%s is set overflow(%s), truncate it\n", -+ config->name, config->env); -+ } else { -+ sprintf(str, "%s%s", config->env, config->unit); -+ } -+} -+ -+static void page_isolation_init(void) -+{ -+ char threshold_string[PARSED_ENV_LEN]; -+ char cycle_string[PARSED_ENV_LEN]; -+ /** -+ * It's unnecessary to parse threshold configuration when offline -+ * choice is off. -+ */ -+ if (offline == OFFLINE_OFF) -+ return; -+ -+ parse_isolation_env(&threshold); -+ parse_isolation_env(&cycle); -+ parse_env_string(&threshold, threshold_string); -+ parse_env_string(&cycle, cycle_string); -+ log(TERM, LOG_INFO, "Threshold of memory Corrected Errors is %s / %s\n", -+ threshold_string, cycle_string); -+} -+ -+void ras_page_account_init(void) -+{ -+ page_offline_init(); -+ page_isolation_init(); -+} -+ -+static int do_page_offline(unsigned long long addr, enum otype type) -+{ -+ FILE *offline_file; -+ int err; -+ -+ offline_file = fopen(kernel_offline[type], "w"); -+ if (!offline_file) -+ return -1; -+ -+ fprintf(offline_file, "%#llx", addr); -+ err = ferror(offline_file) ? -1 : 0; -+ fclose(offline_file); -+ -+ return err; -+} -+ -+static void page_offline(struct page_record *pr) -+{ -+ unsigned long long addr = pr->addr; -+ int ret; -+ -+ /* Offlining page is not required */ -+ if (offline <= OFFLINE_ACCOUNT) -+ return; -+ -+ /* Ignore offlined pages */ -+ if (pr->offlined != PAGE_ONLINE) -+ return; -+ -+ /* Time to silence this noisy page */ -+ if (offline == OFFLINE_SOFT_THEN_HARD) { -+ ret = do_page_offline(addr, OFFLINE_SOFT); -+ if (ret < 0) -+ ret = do_page_offline(addr, OFFLINE_HARD); -+ } else { -+ ret = do_page_offline(addr, offline); -+ } -+ -+ pr->offlined = ret < 0 ? PAGE_OFFLINE_FAILED : PAGE_OFFLINE; -+ -+ log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n", -+ addr, page_state[pr->offlined]); -+} -+ -+static void page_record(struct page_record *pr, unsigned count, time_t time) -+{ -+ unsigned long period = time - pr->start; -+ unsigned long tolerate; -+ -+ if (period >= cycle.val) { -+ /** -+ * Since we don't refresh automatically, it is possible that the period -+ * between two occurences will be longer than the pre-configured refresh cycle. -+ * In this case, we tolerate the frequency of the whole period up to -+ * the pre-configured threshold. -+ */ -+ tolerate = (period / (double)cycle.val) * threshold.val; -+ pr->count -= (tolerate > pr->count) ? pr->count : tolerate; -+ pr->start = time; -+ pr->excess = 0; -+ } -+ -+ pr->count += count; -+ if (pr->count >= threshold.val) { -+ log(TERM, LOG_INFO, "Corrected Errors at %#llx exceeded threshold\n", pr->addr); -+ -+ /** -+ * Backup ce count of current cycle to enable next round, which actually -+ * should never happen if we can disable overflow completely in the same -+ * time unit (but sadly we can't). -+ */ -+ pr->excess += pr->count; -+ pr->count = 0; -+ page_offline(pr); -+ } -+} -+ -+static struct page_record *page_lookup_insert(unsigned long long addr) -+{ -+ struct rb_node **entry = &page_records.rb_node; -+ struct rb_node *parent = NULL; -+ struct page_record *pr = NULL, *find = NULL; -+ -+ while (*entry) { -+ parent = *entry; -+ pr = rb_entry(parent, struct page_record, entry); -+ if (addr == pr->addr) { -+ return pr; -+ } else if (addr < pr->addr) { -+ entry = &(*entry)->rb_left; -+ } else { -+ entry = &(*entry)->rb_right; -+ } -+ } -+ -+ find = calloc(1, sizeof(struct page_record)); -+ if (!find) { -+ log(TERM, LOG_ERR, "No memory for page records\n"); -+ return NULL; -+ } -+ -+ find->addr = addr; -+ rb_link_node(&find->entry, parent, entry); -+ rb_insert_color(&find->entry, &page_records); -+ -+ return find; -+} -+ -+void ras_record_page_error(unsigned long long addr, unsigned count, time_t time) -+{ -+ struct page_record *pr = NULL; -+ -+ if (offline == OFFLINE_OFF) -+ return; -+ -+ pr = page_lookup_insert(addr & PAGE_MASK); -+ if (pr) { -+ if (!pr->start) -+ pr->start = time; -+ page_record(pr, count, time); -+ } -+} -diff --git a/ras-page-isolation.h b/ras-page-isolation.h -new file mode 100644 -index 0000000..3d03cef ---- /dev/null -+++ b/ras-page-isolation.h -@@ -0,0 +1,66 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+*/ -+ -+#ifndef __RAS_PAGE_ISOLATION_H -+#define __RAS_PAGE_ISOLATION_H -+ -+#include -+#include -+#include "rbtree.h" -+ -+#define PAGE_SHIFT 12 -+#define PAGE_SIZE (1 << PAGE_SHIFT) -+#define PAGE_MASK (~(PAGE_SIZE-1)) -+ -+struct config { -+ char *name; -+ unsigned long val; -+}; -+ -+enum otype { -+ OFFLINE_OFF, -+ OFFLINE_ACCOUNT, -+ OFFLINE_SOFT, -+ OFFLINE_HARD, -+ OFFLINE_SOFT_THEN_HARD, -+}; -+ -+enum pstate { -+ PAGE_ONLINE, -+ PAGE_OFFLINE, -+ PAGE_OFFLINE_FAILED, -+}; -+ -+struct page_record { -+ struct rb_node entry; -+ unsigned long long addr; -+ time_t start; -+ enum pstate offlined; -+ unsigned long count; -+ unsigned long excess; -+}; -+ -+struct isolation { -+ char *name; -+ char *env; -+ const struct config *units; -+ unsigned long val; -+ bool overflow; -+ char *unit; -+}; -+ -+void ras_page_account_init(void); -+void ras_record_page_error(unsigned long long addr, unsigned count, time_t time); -+ -+#endif --- -2.27.0 - diff --git a/0023-rasdaemon-Add-notification-support-when-page-goes-of.patch b/0023-rasdaemon-Add-notification-support-when-page-goes-of.patch deleted file mode 100644 index 9132d67..0000000 --- a/0023-rasdaemon-Add-notification-support-when-page-goes-of.patch +++ /dev/null @@ -1,259 +0,0 @@ -From 07c3c72d18e5c7da2109b5afa918966733039f13 Mon Sep 17 00:00:00 2001 -From: Bixuan Cui -Date: Sun, 5 Jun 2022 02:10:24 +0800 -Subject: [PATCH] rasdaemon: Add notification support when page goes offline for Memory Corrected Error - -When the page goes offline, it may affect the user's processes. -The user needs to do some special actions (such as restarting the -process) before or after going offline. - -So add page-ce-offline-pre-notice and page-ce-offline-post-notice -to env file of rasdaemon for notifying the user when doing page -offline. - -Signed-off-by: Bixuan Cui ---- - Makefile.am | 2 +- - misc/notices/page-ce-offline-post-notice | 17 +++++ - misc/notices/page-ce-offline-pre-notice | 17 +++++ - misc/rasdaemon.env | 4 ++ - misc/rasdaemon.spec.in | 3 + - ras-page-isolation.c | 90 ++++++++++++++++++++++++ - 6 files changed, 132 insertions(+), 1 deletion(-) - create mode 100755 misc/notices/page-ce-offline-post-notice - create mode 100755 misc/notices/page-ce-offline-pre-notice - -diff --git a/Makefile.am b/Makefile.am -index de76301..701b120 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -1,6 +1,6 @@ - ACLOCAL_AMFLAGS=-I m4 - SUBDIRS = libtrace util man --SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in misc/rasdaemon.env -+SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in misc/rasdaemon.env misc/notices - SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) - EXTRA_DIST = $(SYSTEMD_SERVICES_IN) - -diff --git a/misc/notices/page-ce-offline-post-notice b/misc/notices/page-ce-offline-post-notice -new file mode 100755 -index 0000000..d78b1b0 ---- /dev/null -+++ b/misc/notices/page-ce-offline-post-notice -@@ -0,0 +1,17 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon after a page goes offline. -+ -+cd `dirname $0` -+ -+[ -x ./page-ce-offline-post-notice.local ] && . ./page-ce-offline-post-notice.local $1 -+ -+if [ -d page-ce-offline-post-notice.extern ] -+then -+ ls page-ce-offline-post-notice.extern | -+ while read item -+ do -+ [ -x ./page-ce-offline-post-notice.extern/$item ] && . ./page-ce-offline-post-notice.extern/$item $1 -+ done -+fi -+ -+exit 0 -diff --git a/misc/notices/page-ce-offline-pre-notice b/misc/notices/page-ce-offline-pre-notice -new file mode 100755 -index 0000000..d1038a3 ---- /dev/null -+++ b/misc/notices/page-ce-offline-pre-notice -@@ -0,0 +1,17 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon before a page goes offline. -+ -+cd `dirname $0` -+ -+[ -x ./page-ce-offline-pre-notice.local ] && . ./page-ce-offline-pre-notice.local $1 -+ -+if [ -d page-ce-offline-pre-notice.extern ] -+then -+ ls page-ce-offline-pre-notice.extern | -+ while read item -+ do -+ [ -x ./page-ce-offline-pre-notice.extern/$item ] && . ./page-ce-offline-pre-notice.extern/$item $1 -+ done -+fi -+ -+exit 0 -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..713875a 100644 ---- a/misc/rasdaemon.env -+++ b/misc/rasdaemon.env -@@ -27,3 +27,7 @@ PAGE_CE_THRESHOLD="50" - # soft-then-hard First try to soft offline, then try hard offlining. - # Note: default offline choice is "soft". - PAGE_CE_ACTION="soft" -+ -+# Notices script when doing memory offline -+PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" -+PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" -diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in -index eff9794..f690575 100644 ---- a/misc/rasdaemon.spec.in -+++ b/misc/rasdaemon.spec.in -@@ -45,6 +45,8 @@ make install DESTDIR=%{buildroot} - install -D -p -m 0644 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} - install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service - install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service -+install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ -+install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ - rm INSTALL %{buildroot}/usr/include/*.h - - %files -@@ -56,6 +58,7 @@ rm INSTALL %{buildroot}/usr/include/*.h - %{_sharedstatedir}/rasdaemon - %{_sysconfdir}/ras/dimm_labels.d - %config(noreplace) %{_sysconfdir}/sysconfig/%{name} -+%config(noreplace) %{_sysconfdir}/rasdaemon_notices/* - - %changelog - -diff --git a/ras-page-isolation.c b/ras-page-isolation.c -index 50e4406..f4f3bc1 100644 ---- a/ras-page-isolation.c -+++ b/ras-page-isolation.c -@@ -17,9 +17,13 @@ - #include - #include - #include -+#include -+#include -+#include - #include "ras-logger.h" - #include "ras-page-isolation.h" - -+#define MAX_PATH_LEN 64 - #define PARSED_ENV_LEN 50 - static const struct config threshold_units[] = { - { "m", 1000 }, -@@ -73,6 +77,8 @@ static const char *page_state[] = { - - static enum otype offline = OFFLINE_SOFT; - static struct rb_root page_records; -+static char pre_notice[MAX_PATH_LEN]; -+static char post_notice[MAX_PATH_LEN]; - - static void page_offline_init(void) - { -@@ -202,16 +208,94 @@ static void page_isolation_init(void) - threshold_string, cycle_string); - } - -+static void page_notice_init(void) -+{ -+ char *notice_root = "/etc/rasdaemon_notices"; -+ char *pre_re = getenv("PAGE_CE_OFFLINE_PRE_NOTICE"); -+ char *post_re = getenv("PAGE_CE_OFFLINE_POST_NOTICE"); -+ -+ if (offline <= OFFLINE_ACCOUNT) -+ return; -+ -+ snprintf(pre_notice, sizeof(pre_notice), "%s/%s", notice_root, pre_re); -+ if (access(pre_notice, R_OK|X_OK) < 0) -+ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", pre_notice); -+ -+ snprintf(post_notice, sizeof(post_notice), "%s/%s", notice_root, post_re); -+ if (access(post_notice, R_OK|X_OK) < 0) -+ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", post_notice); -+} -+ - void ras_page_account_init(void) - { - page_offline_init(); - page_isolation_init(); -+ page_notice_init(); -+} -+ -+static void finish_child(pid_t child, int status) -+{ -+ if (WIFEXITED(status) && WEXITSTATUS(status)) { -+ log(TERM, LOG_INFO, "notice exited with status %d\n", WEXITSTATUS(status)); -+ } else if (WIFSIGNALED(status)) { -+ log(TERM, LOG_INFO,"notice died with signal %s\n", strsignal(WTERMSIG(status))); -+ } -+ -+ return; -+} -+ -+static void __run_notice(char *argv[], char **env) -+{ -+ pid_t child; -+ int status; -+ -+ child = fork(); -+ if (child < 0) { -+ log(TERM, LOG_ERR, "Cannot create process for offline notice"); -+ return; -+ } -+ if (child == 0) { -+ execve(argv[0], argv, env); -+ _exit(127); -+ } -+ else { -+ waitpid(child, &status, 0); -+ finish_child(child, status); -+ } -+} -+ -+static void run_notice(char *argv[]) -+{ -+ int MAX_ENV = 20; -+ char *env[MAX_ENV]; -+ int ei = 0; -+ int i; -+ -+ asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin"); -+ env[ei] = NULL; -+ assert(ei < MAX_ENV); -+ -+ __run_notice(argv, env); -+ -+ for (i = 0; i < ei; i++) -+ free(env[i]); - } - - static int do_page_offline(unsigned long long addr, enum otype type) - { - FILE *offline_file; - int err; -+ char *args; -+ char *argv[] = { -+ NULL, -+ NULL, -+ NULL, -+ }; -+ -+ asprintf(&args, "%llu", addr); -+ argv[0] = (char*)&pre_notice; -+ argv[1] = args; -+ run_notice(argv); - - offline_file = fopen(kernel_offline[type], "w"); - if (!offline_file) -@@ -221,6 +305,11 @@ static int do_page_offline(unsigned long long addr, enum otype type) - err = ferror(offline_file) ? -1 : 0; - fclose(offline_file); - -+ argv[0] = (char*)&post_notice; -+ run_notice(argv); -+ -+ free(args); -+ - return err; - } - -@@ -329,4 +418,5 @@ void ras_record_page_error(unsigned long long addr, unsigned count, time_t time) - pr->start = time; - page_record(pr, count, time); - } -+ - } --- -2.27.0 - diff --git a/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch b/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch deleted file mode 100644 index 852eb4f..0000000 --- a/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch +++ /dev/null @@ -1,85 +0,0 @@ -commit 0862a096c3a1d0f993703ab3299f1ddfadf53d7f -Author: Shiju Jose -Date: Tue Aug 11 13:31:46 2020 +0100 - - rasdaemon: ras-mc-ctl: Add ARM processor error information - - Add supporting ARM processor error in the ras-mc-ctl tool. - - Signed-off-by: Shiju Jose - Signed-off-by: Mauro Carvalho Chehab - ---- - util/ras-mc-ctl.in | 40 ++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 40 insertions(+) - ---- rasdaemon-0.6.1.orig/util/ras-mc-ctl.in 2021-10-06 14:14:25.000440090 -0400 -+++ rasdaemon-0.6.1/util/ras-mc-ctl.in 2021-10-06 14:15:59.995598590 -0400 -@@ -1124,6 +1124,7 @@ sub summary - my ($query, $query_handle, $out); - my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg); - my ($etype, $severity, $etype_string, $severity_string); -+ my ($affinity, $mpidr); - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -@@ -1159,6 +1160,22 @@ sub summary - } - $query_handle->finish; - -+ # ARM processor arm_event errors -+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($affinity, $mpidr, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count errors\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events summary:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; -+ - # extlog errors - $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; - $query_handle = $dbh->prepare($query); -@@ -1202,6 +1219,7 @@ sub errors - my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); - my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location); - my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); -+ my ($error_count, $affinity, $mpidr, $r_state, $psci_state); - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -@@ -1241,6 +1259,28 @@ sub errors - } - $query_handle->finish; - -+ # ARM processor arm_event errors -+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "error_count=$error_count, " if ($error_count); -+ $out .= "affinity_level=$affinity, "; -+ $out .= sprintf "mpidr=0x%x, ", $mpidr; -+ $out .= sprintf "running_state=0x%x, ", $r_state; -+ $out .= sprintf "psci_state=0x%x", $psci_state; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; -+ - # Extlog errors - $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; - $query_handle = $dbh->prepare($query); diff --git a/1001-rasdaemon-avoid-multiple-definitions.patch b/1001-rasdaemon-avoid-multiple-definitions.patch deleted file mode 100644 index 109587b..0000000 --- a/1001-rasdaemon-avoid-multiple-definitions.patch +++ /dev/null @@ -1,24 +0,0 @@ -commit fd982af0a307edc5d3e56011d2e045015b1efd4b -Author: Mauro Carvalho Chehab -Date: Mon Mar 30 01:22:24 2020 +0200 - - ras-record.h: define an external var as such - - Otherwise, newer versions of gcc will produce multiple symbols, - causing link breakages. - - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/ras-record.h b/ras-record.h -index 5311c67caf44..0d2a481c23dd 100644 ---- a/ras-record.h -+++ b/ras-record.h -@@ -25,7 +25,7 @@ - - extern long user_hz; - --struct ras_events *ras; -+extern struct ras_events *ras; - - struct ras_mc_event { - char timestamp[64]; diff --git a/16d929b024c31d54a7f8a72eab094376c7be27f5.patch b/16d929b024c31d54a7f8a72eab094376c7be27f5.patch deleted file mode 100644 index ab66f52..0000000 --- a/16d929b024c31d54a7f8a72eab094376c7be27f5.patch +++ /dev/null @@ -1,32 +0,0 @@ -commit 16d929b024c31d54a7f8a72eab094376c7be27f5 -Author: Mauro Carvalho Chehab -Date: Wed May 26 10:20:39 2021 +0200 - - Makefile.am: fix build header rules - - non-standard-hisilicon.h was added twice; - ras-memory-failure-handler.h is missing. - - Due to that, the tarball becomes incomplete, causing build - errors. - - While here, also adjust .travis.yml to use --enable-all. - - Signed-off-by: Mauro Carvalho Chehab - ---- - Makefile.am | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - ---- a/Makefile.am 2021-10-13 13:27:53.402685179 -0400 -+++ b/Makefile.am 2021-10-13 13:28:11.664525173 -0400 -@@ -54,7 +54,8 @@ rasdaemon_LDADD = -lpthread $(SQLITE3_LI - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ -- ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h -+ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ -+ ras-memory-failure-handler.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that diff --git a/1ff5f3d2a0fcd48add9462567c30fe0e14585fb4.patch b/1ff5f3d2a0fcd48add9462567c30fe0e14585fb4.patch new file mode 100644 index 0000000..99a9ba6 --- /dev/null +++ b/1ff5f3d2a0fcd48add9462567c30fe0e14585fb4.patch @@ -0,0 +1,32 @@ +commit 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4 +Author: Matt Whitlock +Date: Wed Jun 9 10:25:18 2021 -0400 + + configure.ac: fix SYSCONFDEFDIR default value + + configure.ac was using AC_ARG_WITH incorrectly, yielding a generated configure script like: + + # Check whether --with-sysconfdefdir was given. + if test "${with_sysconfdefdir+set}" = set; then : + withval=$with_sysconfdefdir; SYSCONFDEFDIR=$withval + else + "/etc/sysconfig" + fi + + This commit fixes the default case so that the SYSCONFDEFDIR variable is assigned the value "/etc/sysconfig" rather than trying to execute "/etc/sysconfig" as a command. + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/configure.ac b/configure.ac +index f7d1947..33b81fe 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -172,7 +172,7 @@ AC_SUBST([RASSTATEDIR]) + AC_ARG_WITH(sysconfdefdir, + AC_HELP_STRING([--with-sysconfdefdir=DIR], [rasdaemon environment file dir]), + [SYSCONFDEFDIR=$withval], +- ["/etc/sysconfig"]) ++ [SYSCONFDEFDIR=/etc/sysconfig]) + AC_SUBST([SYSCONFDEFDIR]) + + AC_DEFINE([RAS_DB_FNAME], ["ras-mc_event.db"], [ras events database]) diff --git a/2290d65b97311dd5736838f1e285355f7f357046.patch b/2290d65b97311dd5736838f1e285355f7f357046.patch deleted file mode 100644 index 0710974..0000000 --- a/2290d65b97311dd5736838f1e285355f7f357046.patch +++ /dev/null @@ -1,538 +0,0 @@ -commit 2290d65b97311dd5736838f1e285355f7f357046 -Author: Shiju Jose -Date: Mon Mar 8 16:57:26 2021 +0000 - - rasdaemon: add support for memory_failure events - - Add support to log the memory_failure kernel trace - events. - - Example rasdaemon log and SQLite DB output for the - memory_failure event, - ================================================= - rasdaemon: memory_failure_event store: 0x126ce8f8 - rasdaemon: register inserted at db - <...>-785 [000] 0.000024: memory_failure_event: 2020-10-02 13:27:13 -0400 pfn=0x204000000 page_type=free buddy page action_result=Delayed - - CREATE TABLE memory_failure_event (id INTEGER PRIMARY KEY, timestamp TEXT, pfn TEXT, page_type TEXT, action_result TEXT); - INSERT INTO memory_failure_event VALUES(1,'2020-10-02 13:27:13 -0400','0x204000000','free buddy page','Delayed'); - ================================================== - - Signed-off-by: Shiju Jose - Signed-off-by: Mauro Carvalho Chehab - ---- - Makefile.am | 4 - ras-events.c | 15 +++ - ras-memory-failure-handler.c | 179 +++++++++++++++++++++++++++++++++++++++++++ - ras-memory-failure-handler.h | 25 ++++++ - ras-record.c | 56 +++++++++++++ - ras-record.h | 13 +++ - ras-report.c | 68 ++++++++++++++++ - ras-report.h | 5 - - 8 files changed, 364 insertions(+), 1 deletion(-) - ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ b/ras-memory-failure-handler.c 2021-10-14 16:31:36.840657728 -0400 -@@ -0,0 +1,179 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#include -+#include -+#include -+#include "libtrace/kbuffer.h" -+#include "ras-memory-failure-handler.h" -+#include "ras-record.h" -+#include "ras-logger.h" -+#include "ras-report.h" -+ -+/* Memory failure - various types of pages */ -+enum mf_action_page_type { -+ MF_MSG_KERNEL, -+ MF_MSG_KERNEL_HIGH_ORDER, -+ MF_MSG_SLAB, -+ MF_MSG_DIFFERENT_COMPOUND, -+ MF_MSG_POISONED_HUGE, -+ MF_MSG_HUGE, -+ MF_MSG_FREE_HUGE, -+ MF_MSG_NON_PMD_HUGE, -+ MF_MSG_UNMAP_FAILED, -+ MF_MSG_DIRTY_SWAPCACHE, -+ MF_MSG_CLEAN_SWAPCACHE, -+ MF_MSG_DIRTY_MLOCKED_LRU, -+ MF_MSG_CLEAN_MLOCKED_LRU, -+ MF_MSG_DIRTY_UNEVICTABLE_LRU, -+ MF_MSG_CLEAN_UNEVICTABLE_LRU, -+ MF_MSG_DIRTY_LRU, -+ MF_MSG_CLEAN_LRU, -+ MF_MSG_TRUNCATED_LRU, -+ MF_MSG_BUDDY, -+ MF_MSG_BUDDY_2ND, -+ MF_MSG_DAX, -+ MF_MSG_UNSPLIT_THP, -+ MF_MSG_UNKNOWN, -+}; -+ -+/* Action results for various types of pages */ -+enum mf_action_result { -+ MF_IGNORED, /* Error: cannot be handled */ -+ MF_FAILED, /* Error: handling failed */ -+ MF_DELAYED, /* Will be handled later */ -+ MF_RECOVERED, /* Successfully recovered */ -+}; -+ -+/* memory failure page types */ -+static const struct { -+ int type; -+ const char *page_type; -+} mf_page_type[] = { -+ { MF_MSG_KERNEL, "reserved kernel page" }, -+ { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"}, -+ { MF_MSG_SLAB, "kernel slab page"}, -+ { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"}, -+ { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"}, -+ { MF_MSG_HUGE, "huge page"}, -+ { MF_MSG_FREE_HUGE, "free huge page"}, -+ { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"}, -+ { MF_MSG_UNMAP_FAILED, "unmapping failed page"}, -+ { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"}, -+ { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"}, -+ { MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page"}, -+ { MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page"}, -+ { MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page"}, -+ { MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page"}, -+ { MF_MSG_DIRTY_LRU, "dirty LRU page"}, -+ { MF_MSG_CLEAN_LRU, "clean LRU page"}, -+ { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"}, -+ { MF_MSG_BUDDY, "free buddy page"}, -+ { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"}, -+ { MF_MSG_DAX, "dax page"}, -+ { MF_MSG_UNSPLIT_THP, "unsplit thp"}, -+ { MF_MSG_UNKNOWN, "unknown page"}, -+}; -+ -+/* memory failure action results */ -+static const struct { -+ int result; -+ const char *action_result; -+} mf_action_result[] = { -+ { MF_IGNORED, "Ignored" }, -+ { MF_FAILED, "Failed" }, -+ { MF_DELAYED, "Delayed" }, -+ { MF_RECOVERED, "Recovered" }, -+}; -+ -+static const char *get_page_type(int page_type) -+{ -+ int i; -+ -+ for (i = 0; i < ARRAY_SIZE(mf_page_type); i++) -+ if (mf_page_type[i].type == page_type) -+ return mf_page_type[i].page_type; -+ -+ return "unknown page"; -+} -+ -+static const char *get_action_result(int result) -+{ -+ int i; -+ -+ for (i = 0; i < ARRAY_SIZE(mf_action_result); i++) -+ if (mf_action_result[i].result == result) -+ return mf_action_result[i].action_result; -+ -+ return "unknown"; -+} -+ -+ -+int ras_memory_failure_event_handler(struct trace_seq *s, -+ struct pevent_record *record, -+ struct event_format *event, void *context) -+{ -+ unsigned long long val; -+ struct ras_events *ras = context; -+ time_t now; -+ struct tm *tm; -+ struct ras_mf_event ev; -+ -+ /* -+ * Newer kernels (3.10-rc1 or upper) provide an uptime clock. -+ * On previous kernels, the way to properly generate an event would -+ * be to inject a fake one, measure its timestamp and diff it against -+ * gettimeofday. We won't do it here. Instead, let's use uptime, -+ * falling-back to the event report's time, if "uptime" clock is -+ * not available (legacy kernels). -+ */ -+ -+ if (ras->use_uptime) -+ now = record->ts/user_hz + ras->uptime_diff; -+ else -+ now = time(NULL); -+ -+ tm = localtime(&now); -+ if (tm) -+ strftime(ev.timestamp, sizeof(ev.timestamp), -+ "%Y-%m-%d %H:%M:%S %z", tm); -+ trace_seq_printf(s, "%s ", ev.timestamp); -+ -+ if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0) -+ return -1; -+ sprintf(ev.pfn, "0x%llx", val); -+ trace_seq_printf(s, "pfn=0x%llx ", val); -+ -+ if (pevent_get_field_val(s, event, "type", record, &val, 1) < 0) -+ return -1; -+ ev.page_type = get_page_type(val); -+ trace_seq_printf(s, "page_type=%s ", ev.page_type); -+ -+ if (pevent_get_field_val(s, event, "result", record, &val, 1) < 0) -+ return -1; -+ ev.action_result = get_action_result(val); -+ trace_seq_printf(s, "action_result=%s ", ev.action_result); -+ -+ /* Store data into the SQLite DB */ -+#ifdef HAVE_SQLITE3 -+ ras_store_mf_event(ras, &ev); -+#endif -+ -+#ifdef HAVE_ABRT_REPORT -+ /* Report event to ABRT */ -+ ras_report_mf_event(ras, &ev); -+#endif -+ -+ return 0; -+} ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ b/ras-memory-failure-handler.h 2021-10-14 16:31:36.840657728 -0400 -@@ -0,0 +1,25 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+*/ -+ -+#ifndef __RAS_MEMORY_FAILURE_HANDLER_H -+#define __RAS_MEMORY_FAILURE_HANDLER_H -+ -+#include "ras-events.h" -+#include "libtrace/event-parse.h" -+ -+int ras_memory_failure_event_handler(struct trace_seq *s, -+ struct pevent_record *record, -+ struct event_format *event, void *context); -+ -+#endif ---- a/ras-record.c 2018-04-25 06:19:03.000000000 -0400 -+++ b/ras-record.c 2021-10-14 16:31:36.840657728 -0400 -@@ -404,6 +404,55 @@ sqlite3_bind_text(priv->stmt_mce_record, - } - #endif - -+/* -+ * Table and functions to handle ras:memory_failure -+ */ -+ -+#ifdef HAVE_MEMORY_FAILURE -+static const struct db_fields mf_event_fields[] = { -+ { .name="id", .type="INTEGER PRIMARY KEY" }, -+ { .name="timestamp", .type="TEXT" }, -+ { .name="pfn", .type="TEXT" }, -+ { .name="page_type", .type="TEXT" }, -+ { .name="action_result", .type="TEXT" }, -+}; -+ -+static const struct db_table_descriptor mf_event_tab = { -+ .name = "memory_failure_event", -+ .fields = mf_event_fields, -+ .num_fields = ARRAY_SIZE(mf_event_fields), -+}; -+ -+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) -+{ -+ int rc; -+ struct sqlite3_priv *priv = ras->db_priv; -+ -+ if (!priv || !priv->stmt_mf_event) -+ return 0; -+ log(TERM, LOG_INFO, "memory_failure_event store: %p\n", priv->stmt_mf_event); -+ -+ sqlite3_bind_text(priv->stmt_mf_event, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mf_event, 2, ev->pfn, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mf_event, 3, ev->page_type, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mf_event, 4, ev->action_result, -1, NULL); -+ -+ rc = sqlite3_step(priv->stmt_mf_event); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to do memory_failure_event step on sqlite: error = %d\n", rc); -+ -+ rc = sqlite3_reset(priv->stmt_mf_event); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed reset memory_failure_event on sqlite: error = %d\n", -+ rc); -+ -+ log(TERM, LOG_INFO, "register inserted at db\n"); -+ -+ return rc; -+} -+#endif - - /* - * Generic code -@@ -567,6 +616,13 @@ usleep(10000); - rc = ras_mc_prepare_stmt(priv, &priv->stmt_arm_record, - &arm_event_tab); - #endif -+#ifdef HAVE_MEMORY_FAILURE -+ rc = ras_mc_create_table(priv, &mf_event_tab); -+ if (rc == SQLITE_OK) { -+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mf_event, -+ &mf_event_tab); -+ } -+#endif - - ras->db_priv = priv; - return 0; ---- a/ras-record.h 2018-04-25 06:19:03.000000000 -0400 -+++ b/ras-record.h 2021-10-14 16:31:36.840657728 -0400 -@@ -75,12 +75,20 @@ struct ras_arm_event { - int32_t psci_state; - }; - -+struct ras_mf_event { -+ char timestamp[64]; -+ char pfn[30]; -+ const char *page_type; -+ const char *action_result; -+}; -+ - struct ras_mc_event; - struct ras_aer_event; - struct ras_extlog_event; - struct ras_non_standard_event; - struct ras_arm_event; - struct mce_event; -+struct ras_mf_event; - - #ifdef HAVE_SQLITE3 - -@@ -104,6 +112,9 @@ struct sqlite3_priv { - #ifdef HAVE_ARM - sqlite3_stmt *stmt_arm_record; - #endif -+#ifdef HAVE_MEMORY_FAILURE -+ sqlite3_stmt *stmt_mf_event; -+#endif - }; - - int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras); -@@ -113,6 +124,7 @@ int ras_store_mce_record(struct ras_even - int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev); - int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev); - int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev); -+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); - - #else - static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; -@@ -122,6 +134,7 @@ static inline int ras_store_mce_record(s - static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; }; - static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; - static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; }; -+static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; - - #endif - ---- a/ras-report.c 2017-10-14 05:11:34.000000000 -0400 -+++ b/ras-report.c 2021-10-14 16:31:36.840657728 -0400 -@@ -255,6 +255,28 @@ "midr=0x%lx\n" \ - return 0; - } - -+static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev) -+{ -+ char bt_buf[MAX_BACKTRACE_SIZE]; -+ -+ if (!buf || !ev) -+ return -1; -+ -+ sprintf(bt_buf, "BACKTRACE=" \ -+ "timestamp=%s\n" \ -+ "pfn=%s\n" \ -+ "page_type=%s\n" \ -+ "action_result=%s\n", \ -+ ev->timestamp, \ -+ ev->pfn, \ -+ ev->page_type, \ -+ ev->action_result); -+ -+ strcat(buf, bt_buf); -+ -+ return 0; -+} -+ - static int commit_report_backtrace(int sockfd, int type, void *ev){ - char buf[MAX_BACKTRACE_SIZE]; - char *pbuf = buf; -@@ -283,6 +305,9 @@ memset(buf, 0, MAX_BACKTRACE_SIZE); - case ARM_EVENT: - rc = set_arm_event_backtrace(buf, (struct ras_arm_event *)ev); - break; -+ case MF_EVENT: -+ rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev); -+ break; - default: - return -1; - } -@@ -549,3 +574,46 @@ return 0; - return -1; - } - } -+ -+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) -+{ -+ char buf[MAX_MESSAGE_SIZE]; -+ int sockfd = 0; -+ int done = 0; -+ int rc = -1; -+ -+ memset(buf, 0, sizeof(buf)); -+ -+ sockfd = setup_report_socket(); -+ if (sockfd < 0) -+ return -1; -+ -+ rc = commit_report_basic(sockfd); -+ if (rc < 0) -+ goto mf_fail; -+ -+ rc = commit_report_backtrace(sockfd, MF_EVENT, ev); -+ if (rc < 0) -+ goto mf_fail; -+ -+ sprintf(buf, "ANALYZER=%s", "rasdaemon-memory_failure"); -+ rc = write(sockfd, buf, strlen(buf) + 1); -+ if (rc < strlen(buf) + 1) -+ goto mf_fail; -+ -+ sprintf(buf, "REASON=%s", "memory failure problem"); -+ rc = write(sockfd, buf, strlen(buf) + 1); -+ if (rc < strlen(buf) + 1) -+ goto mf_fail; -+ -+ done = 1; -+ -+mf_fail: -+ if (sockfd > 0) -+ close(sockfd); -+ -+ if (done) -+ return 0; -+ else -+ return -1; -+} ---- a/ras-report.h 2017-10-14 05:11:34.000000000 -0400 -+++ b/ras-report.h 2021-10-14 16:31:36.840657728 -0400 -@@ -34,7 +34,8 @@ enum { - MCE_EVENT, - AER_EVENT, - NON_STANDARD_EVENT, -- ARM_EVENT -+ ARM_EVENT, -+ MF_EVENT, - }; - - #ifdef HAVE_ABRT_REPORT -@@ -44,6 +45,7 @@ int ras_report_aer_event(struct ras_even - int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev); - int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev); - int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev); -+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); - - #else - -@@ -52,6 +54,7 @@ static inline int ras_report_aer_event(s - static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; }; - static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; - static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; }; -+static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; - - #endif - ---- a/Makefile.am 2018-04-25 06:21:56.000000000 -0400 -+++ b/Makefile.am 2021-10-14 16:37:42.423639762 -0400 -@@ -41,12 +41,16 @@ endif - if WITH_EXTLOG - rasdaemon_SOURCES += ras-extlog-handler.c - endif -+if WITH_MEMORY_FAILURE -+ rasdaemon_SOURCES += ras-memory-failure-handler.c -+endif - if WITH_ABRT_REPORT - rasdaemon_SOURCES += ras-report.c - endif - if WITH_HISI_NS_DECODE - rasdaemon_SOURCES += non-standard-hisi_hip07.c - endif -+ - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ---- a/ras-events.c 2021-10-14 16:31:36.730658636 -0400 -+++ b/ras-events.c 2021-10-14 16:37:11.043898809 -0400 -@@ -33,6 +33,7 @@ * Foundation, Inc., 51 Franklin Street, - #include "ras-arm-handler.h" - #include "ras-mce-handler.h" - #include "ras-extlog-handler.h" -+#include "ras-memory-failure-handler.h" - #include "ras-record.h" - #include "ras-logger.h" - -@@ -218,6 +219,10 @@ if (rc < 0) { - rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable); - #endif - -+#ifdef HAVE_MEMORY_FAILURE -+ rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable); -+#endif -+ - free_ras: - free(ras); - return rc; -@@ -736,6 +741,16 @@ (void)open("/sys/kernel/debug/ras/daemon - "ras", "aer_event"); - #endif - -+#ifdef HAVE_MEMORY_FAILURE -+ rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event", -+ ras_memory_failure_event_handler); -+ if (!rc) -+ num_events++; -+ else -+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", -+ "ras", "memory_failure_event"); -+#endif -+ - if (!num_events) { - log(ALL, LOG_INFO, - "Failed to trace all supported RAS events. Aborting.\n"); diff --git a/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch b/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch deleted file mode 100644 index 1b5844d..0000000 --- a/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch +++ /dev/null @@ -1,66 +0,0 @@ -commit 2a1d217660351c08eb2f8bccebf939abba2f7e69 -Author: Brian WoodsGhannam, Yazen -Date: Fri Nov 1 15:48:13 2019 +0100 - - rasdaemon: rename CPU_NAPLES cputype - - Change CPU_NAPLES to CPU_AMD_SMCA to reflect that it isn't just NAPLES - that is supported, but AMD's Scalable Machine Check Architecture (SMCA). - - [ Yazen: change family check to feature check, and change CPU name. ] - - CC: "mchehab+samsung@kernel.org" , "Namburu, Chandu-babu" # Thread-Topic: [PATCH 1/2] rasdaemon: rename CPU_NAPLES cputype - Signed-off-by: Brian Woods - Signed-off-by: Yazen Ghannam - Cc: Chandu-babu Namburu - Signed-off-by: Mauro Carvalho Chehab - ---- - ras-mce-handler.c | 10 ++++++---- - ras-mce-handler.h | 2 +- - 2 files changed, 7 insertions(+), 5 deletions(-) - ---- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-05-26 15:16:24.699096556 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-05-26 15:18:06.543162745 -0400 -@@ -55,7 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series - [CPU_KNIGHTS_LANDING] = "Knights Landing", - [CPU_KNIGHTS_MILL] = "Knights Mill", - [CPU_SKYLAKE_XEON] = "Skylake server", -- [CPU_NAPLES] = "AMD Family 17h Zen1" -+ [CPU_AMD_SMCA] = "AMD Scalable MCA", - }; - - static enum cputype select_intel_cputype(struct ras_events *ras) -@@ -191,8 +191,10 @@ ret = 0; - if (!strcmp(mce->vendor, "AuthenticAMD")) { - if (mce->family == 15) - mce->cputype = CPU_K8; -- if (mce->family == 23) -- mce->cputype = CPU_NAPLES; -+ if (strstr(mce->processor_flags, "smca")) { -+ mce->cputype = CPU_AMD_SMCA; -+ goto ret; -+ } - if (mce->family > 23) { - log(ALL, LOG_INFO, - "Can't parse MCE for this AMD CPU yet %d\n", -@@ -435,7 +437,7 @@ if (pevent_get_field_val(s, event, "ipid - case CPU_K8: - rc = parse_amd_k8_event(ras, &e); - break; -- case CPU_NAPLES: -+ case CPU_AMD_SMCA: - rc = parse_amd_smca_event(ras, &e); - break; - default: /* All other CPU types are Intel */ ---- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-05-26 15:17:15.409631590 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-05-26 15:18:20.102038424 -0400 -@@ -50,7 +50,7 @@ enum cputype { - CPU_KNIGHTS_LANDING, - CPU_KNIGHTS_MILL, - CPU_SKYLAKE_XEON, -- CPU_NAPLES, -+ CPU_AMD_SMCA, - }; - - struct mce_event { diff --git a/2b37a26dcec389723f75d69d3da9c2f15f6c317d.patch b/2b37a26dcec389723f75d69d3da9c2f15f6c317d.patch new file mode 100644 index 0000000..eb45db0 --- /dev/null +++ b/2b37a26dcec389723f75d69d3da9c2f15f6c317d.patch @@ -0,0 +1,63 @@ +commit 2b37a26dcec389723f75d69d3da9c2f15f6c317d +Author: Mauro Carvalho Chehab +Date: Wed May 26 12:41:27 2021 +0200 + + ci.yml: Fix the job for it to run on a single arch + + There were some issues on the previous content. Fix them, in + order to allow it to build on a single architecture. + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml +index 5b3e757..747a844 100644 +--- a/.github/workflows/ci.yml ++++ b/.github/workflows/ci.yml +@@ -1,34 +1,23 @@ + name: CI + +-# Should run only on branches and PR, as "on_tag.yml" will handle tags + on: ++ workflow_dispatch: + push: +- branches: master test + pull_request: +- branches: master + + jobs: +- +-# +-# Linux +-# + Ubuntu: + name: Ubuntu +- runs-on: ubuntu-20.04 +- strategy: +- matrix: +- arch: [x64_64, aarch64, armv7, ppc64le] ++ runs-on: ubuntu-latest + steps: +- - uses: actions/checkout@v2 +- with: +- arch: ${{ matrix.arch }} +- - name: prepare +- run: | +- sudo apt-get update +- sudo apt-get install -y build-essential sqlite3 +- - name: build +- run: | +- autoreconf -vfi +- ./configure --enable-all +- make +- sudo make install ++ - uses: actions/checkout@v2 ++ - name: prepare ++ run: | ++ sudo apt-get update ++ sudo apt-get install -y build-essential sqlite3 ++ - name: build ++ run: | ++ autoreconf -vfi ++ ./configure --enable-all ++ make ++ sudo make install diff --git a/2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch b/2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch new file mode 100644 index 0000000..c2a9376 --- /dev/null +++ b/2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch @@ -0,0 +1,44 @@ +commit 2b6a54b0d31e02e657171fd27f4e31d996756bc6 +Author: DmNosachev +Date: Thu Jul 22 10:25:38 2021 +0300 + + labels/supermicro: added Supermicro X10DRL, X11SPM + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 1e7761f..990fc9e 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -88,6 +88,16 @@ Vendor: Supermicro + P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1; + P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1; + P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1; ++ ++ Model: X10DRL-i ++ P1-DIMMA1: 0.0.0; ++ P1-DIMMB1: 0.1.0; ++ P1-DIMMC1: 0.2.0; ++ P1-DIMMD1: 0.3.0; ++ P2-DIMME1: 1.0.0; ++ P2-DIMMF1: 1.1.0; ++ P2-DIMMG1: 1.2.0; ++ P2-DIMMH1: 1.3.0; + + Model: X11DDW-NT, X11DDW-L + P1-DIMMA1: 0.0.0; +@@ -102,6 +112,14 @@ Vendor: Supermicro + P2-DIMMD1: 3.0.0; + P2-DIMME1: 3.1.0; + P2-DIMMF1: 3.2.0; ++ ++ Model: X11SPM-F, X11SPM-TF, X11SPM-TPF ++ DIMMA1: 0.0.0; ++ DIMMB1: 0.1.0; ++ DIMMC1: 0.2.0; ++ DIMMD1: 1.0.0; ++ DIMME1: 1.1.0; ++ DIMMF1: 1.2.0; + + Model: B1DRi + P1_DIMMA1: 0.0.0; diff --git a/50565005b10fe909c66f1c90f2feb95712427c7d.patch b/50565005b10fe909c66f1c90f2feb95712427c7d.patch new file mode 100644 index 0000000..dba0116 --- /dev/null +++ b/50565005b10fe909c66f1c90f2feb95712427c7d.patch @@ -0,0 +1,43 @@ +commit 50565005b10fe909c66f1c90f2feb95712427c7d +Author: DmNosachev +Date: Tue Jun 29 14:07:54 2021 +0300 + + labels/supermicro: added Supermicro X11DDW-NT(-L) + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 86e4617..373de07 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -69,7 +69,7 @@ Vendor: Supermicro + P2_DIMM4B: 2.0.1; + P2_DIMM4B: 2.1.1; + +- Model: X11DPH-i ++ Model: X11DPH-i, X11DPH-T, X11DPH-TQ + P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1; + P1-DIMMB1: 0.1.0; + P1-DIMMC1: 0.2.0; +@@ -91,4 +91,18 @@ Vendor: Supermicro + P2-DIMME1: 1.0.0; P2-DIMME2: 1.0.1; + P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1; + P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1; +- P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1; +\ No newline at end of file ++ P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1; ++ ++ Model: X11DDW-NT, X11DDW-L ++ P1-DIMMA1: 0.0.0; ++ P1-DIMMB1: 0.1.0; ++ P1-DIMMC1: 0.2.0; ++ P1-DIMMD1: 1.0.0; ++ P1-DIMME1: 1.1.0; ++ P1-DIMMF1: 1.2.0; ++ P2-DIMMA1: 2.0.0; ++ P2-DIMMB1: 2.1.0; ++ P2-DIMMC1: 2.2.0; ++ P2-DIMMD1: 3.0.0; ++ P2-DIMME1: 3.1.0; ++ P2-DIMMF1: 3.2.0; +\ No newline at end of file diff --git a/546cf713f667437fb6e283cc3dc090679eb47d08.patch b/546cf713f667437fb6e283cc3dc090679eb47d08.patch deleted file mode 100644 index 448b1f6..0000000 --- a/546cf713f667437fb6e283cc3dc090679eb47d08.patch +++ /dev/null @@ -1,372 +0,0 @@ -commit 546cf713f667437fb6e283cc3dc090679eb47d08 -Author: Subhendu Saha -Date: Tue Jan 12 03:29:55 2021 -0500 - - Fix ras-mc-ctl script. - - When rasdaemon is compiled without enabling aer, mce, devlink, - etc., those tables are not created in the database file. Then - ras-mc-ctl script breaks trying to query data from non-existent - tables. - - Signed-off-by: Subhendu Saha subhends@akamai.com - Signed-off-by: Mauro Carvalho Chehab - ---- - util/ras-mc-ctl.in | 310 ++++++++++++++++++++++++++++------------------------- - 1 file changed, 168 insertions(+), 142 deletions(-) - ---- a/util/ras-mc-ctl.in 2021-10-12 13:45:43.260646935 -0400 -+++ b/util/ras-mc-ctl.in 2021-10-12 13:46:38.610158949 -0400 -@@ -41,6 +41,16 @@ my $sysconfdir = "@sysconfdir@"; - my $dmidecode = find_prog ("dmidecode"); - my $modprobe = find_prog ("modprobe") or exit (1); - -+my $has_aer = 0; -+my $has_arm = 0; -+my $has_extlog = 0; -+my $has_mce = 0; -+ -+@WITH_AER_TRUE@$has_aer = 1; -+@WITH_ARM_TRUE@$has_arm = 1; -+@WITH_EXTLOG_TRUE@$has_extlog = 1; -+@WITH_MCE_TRUE@$has_mce = 1; -+ - my %conf = (); - my %bus = (); - my %dimm_size = (); -@@ -1145,70 +1155,78 @@ sub summary - $query_handle->finish; - - # PCIe AER aer_event errors -- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($err_type, $msg, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count $err_type errors: $msg\n"; -- } -- if ($out ne "") { -- print "PCIe AER events summary:\n$out\n"; -- } else { -- print "No PCIe AER errors.\n\n"; -+ if ($has_aer == 1) { -+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($err_type, $msg, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count $err_type errors: $msg\n"; -+ } -+ if ($out ne "") { -+ print "PCIe AER events summary:\n$out\n"; -+ } else { -+ print "No PCIe AER errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # ARM processor arm_event errors -- $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($affinity, $mpidr, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count errors\n"; -- } -- if ($out ne "") { -- print "ARM processor events summary:\n$out\n"; -- } else { -- print "No ARM processor errors.\n\n"; -+ if ($has_arm == 1) { -+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($affinity, $mpidr, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count errors\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events summary:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # extlog errors -- $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($etype, $severity, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $etype_string = get_extlog_type($etype); -- $severity_string = get_extlog_severity($severity); -- $out .= "\t$count $etype_string $severity_string errors\n"; -- } -- if ($out ne "") { -- print "Extlog records summary:\n$out"; -- } else { -- print "No Extlog errors.\n"; -+ if ($has_extlog == 1) { -+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($etype, $severity, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $etype_string = get_extlog_type($etype); -+ $severity_string = get_extlog_severity($severity); -+ $out .= "\t$count $etype_string $severity_string errors\n"; -+ } -+ if ($out ne "") { -+ print "Extlog records summary:\n$out"; -+ } else { -+ print "No Extlog errors.\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # MCE mce_record errors -- $query = "select error_msg, count(*) from mce_record group by error_msg"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($msg, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count $msg errors\n"; -- } -- if ($out ne "") { -- print "MCE records summary:\n$out"; -- } else { -- print "No MCE errors.\n"; -+ if ($has_mce == 1) { -+ $query = "select error_msg, count(*) from mce_record group by error_msg"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($msg, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count $msg errors\n"; -+ } -+ if ($out ne "") { -+ print "MCE records summary:\n$out"; -+ } else { -+ print "No MCE errors.\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - undef($dbh); - } -@@ -1244,105 +1262,113 @@ sub errors - $query_handle->finish; - - # PCIe AER aer_event errors -- $query = "select id, timestamp, err_type, err_msg from aer_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $type, $msg)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $time $type error: $msg\n"; -- } -- if ($out ne "") { -- print "PCIe AER events:\n$out\n"; -- } else { -- print "No PCIe AER errors.\n\n"; -+ if ($has_aer == 1) { -+ $query = "select id, timestamp, err_type, err_msg from aer_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $time, $type, $msg)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $time $type error: $msg\n"; -+ } -+ if ($out ne "") { -+ print "PCIe AER events:\n$out\n"; -+ } else { -+ print "No PCIe AER errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # ARM processor arm_event errors -- $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $timestamp error: "; -- $out .= "error_count=$error_count, " if ($error_count); -- $out .= "affinity_level=$affinity, "; -- $out .= sprintf "mpidr=0x%x, ", $mpidr; -- $out .= sprintf "running_state=0x%x, ", $r_state; -- $out .= sprintf "psci_state=0x%x", $psci_state; -- $out .= "\n"; -- } -- if ($out ne "") { -- print "ARM processor events:\n$out\n"; -- } else { -- print "No ARM processor errors.\n\n"; -+ if ($has_arm == 1) { -+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "error_count=$error_count, " if ($error_count); -+ $out .= "affinity_level=$affinity, "; -+ $out .= sprintf "mpidr=0x%x, ", $mpidr; -+ $out .= sprintf "running_state=0x%x, ", $r_state; -+ $out .= sprintf "psci_state=0x%x", $psci_state; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # Extlog errors -- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data)); -- $out = ""; -- while($query_handle->fetch()) { -- $etype_string = get_extlog_type($etype); -- $severity_string = get_extlog_severity($severity); -- $out .= "$id $timestamp error: "; -- $out .= "type=$etype_string, "; -- $out .= "severity=$severity_string, "; -- $out .= sprintf "address=0x%08x, ", $addr; -- $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id); -- $out .= "fru_text='$fru_text', "; -- $out .= get_cper_data_text($cper_data) if ($cper_data); -- $out .= "\n"; -- } -- if ($out ne "") { -- print "Extlog events:\n$out\n"; -- } else { -- print "No Extlog errors.\n\n"; -+ if ($has_extlog) { -+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $etype_string = get_extlog_type($etype); -+ $severity_string = get_extlog_severity($severity); -+ $out .= "$id $timestamp error: "; -+ $out .= "type=$etype_string, "; -+ $out .= "severity=$severity_string, "; -+ $out .= sprintf "address=0x%08x, ", $addr; -+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id); -+ $out .= "fru_text='$fru_text', "; -+ $out .= get_cper_data_text($cper_data) if ($cper_data); -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "Extlog events:\n$out\n"; -+ } else { -+ print "No Extlog errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # MCE mce_record errors -- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $time error: $msg"; -- $out .= ", CPU $cpuvendor" if ($cpuvendor); -- $out .= ", bank $bank_name" if ($bank_name); -- $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); -- $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); -- $out .= ", $mc_location" if ($mc_location); -- $out .= ", $user_action" if ($user_action); -- $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); -- $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus); -- $out .= sprintf ", status=0x%08x", $status if ($status); -- $out .= sprintf ", addr=0x%08x", $addr if ($addr); -- $out .= sprintf ", misc=0x%08x", $misc if ($misc); -- $out .= sprintf ", ip=0x%08x", $ip if ($ip); -- $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc); -- $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime); -- $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu); -- $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid); -- $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid); -- $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid); -- $out .= sprintf ", cs=0x%08x", $cs if ($cs); -- $out .= sprintf ", bank=0x%08x", $bank if ($bank); -+ if ($has_mce == 1) { -+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $time error: $msg"; -+ $out .= ", CPU $cpuvendor" if ($cpuvendor); -+ $out .= ", bank $bank_name" if ($bank_name); -+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); -+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); -+ $out .= ", $mc_location" if ($mc_location); -+ $out .= ", $user_action" if ($user_action); -+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); -+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus); -+ $out .= sprintf ", status=0x%08x", $status if ($status); -+ $out .= sprintf ", addr=0x%08x", $addr if ($addr); -+ $out .= sprintf ", misc=0x%08x", $misc if ($misc); -+ $out .= sprintf ", ip=0x%08x", $ip if ($ip); -+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc); -+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime); -+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu); -+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid); -+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid); -+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid); -+ $out .= sprintf ", cs=0x%08x", $cs if ($cs); -+ $out .= sprintf ", bank=0x%08x", $bank if ($bank); - -- $out .= "\n"; -- } -- if ($out ne "") { -- print "MCE events:\n$out\n"; -- } else { -- print "No MCE errors.\n\n"; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "MCE events:\n$out\n"; -+ } else { -+ print "No MCE errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - undef($dbh); - } diff --git a/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch b/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch deleted file mode 100644 index 57a4e46..0000000 --- a/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch +++ /dev/null @@ -1,149 +0,0 @@ -commit 60a91e4da4f2daf2b10143fc148a8043312b61e5 -Author: Aristeu Rozanski -Date: Wed Aug 1 16:29:58 2018 -0400 - - rasdaemon: ras-mc-ctl: add option to show error counts - - In some scenarios it might not be desirable to have a daemon running - to parse and store the errors provided by EDAC and only having the - number of CEs and UEs is enough. This patch implements this feature - as an ras-mc-ctl option. - - Signed-off-by: Aristeu Rozanski - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 38b7824..aee431a 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -50,6 +50,8 @@ my %dimm_location = (); - my %csrow_size = (); - my %rank_size = (); - my %csrow_ranks = (); -+my %dimm_ce_count = (); -+my %dimm_ue_count = (); - - my @layers; - my @max_pos; -@@ -76,6 +78,7 @@ Usage: $prog [OPTIONS...] - --layout Display the memory layout. - --summary Presents a summary of the logged errors. - --errors Shows the errors stored at the error database. -+ --error-count Shows the corrected and uncorrected error counts using sysfs. - --help This help message. - EOF - -@@ -83,7 +86,7 @@ parse_cmdline(); - - if ( $conf{opt}{mainboard} || $conf{opt}{print_labels} - || $conf{opt}{register_labels} || $conf{opt}{display_memory_layout} -- || $conf{opt}{guess_dimm_label}) { -+ || $conf{opt}{guess_dimm_label} || $conf{opt}{error_count}) { - - get_mainboard_info(); - -@@ -105,6 +108,9 @@ if ( $conf{opt}{mainboard} || $conf{opt}{print_labels} - if ($conf{opt}{guess_dimm_label}) { - guess_dimm_label (); - } -+ if ($conf{opt}{error_count}) { -+ display_error_count (); -+ } - } - - if ($conf{opt}{status}) { -@@ -134,6 +140,7 @@ sub parse_cmdline - $conf{opt}{guess_dimm_label} = 0; - $conf{opt}{summary} = 0; - $conf{opt}{errors} = 0; -+ $conf{opt}{error_count} = 0; - - my $rref = \$conf{opt}{report}; - my $mref = \$conf{opt}{mainboard}; -@@ -150,7 +157,8 @@ sub parse_cmdline - "status" => \$conf{opt}{status}, - "layout" => \$conf{opt}{display_memory_layout}, - "summary" => \$conf{opt}{summary}, -- "errors" => \$conf{opt}{errors} -+ "errors" => \$conf{opt}{errors}, -+ "error-count" => \$conf{opt}{error_count} - ); - - usage(1) if !$rc; -@@ -284,6 +292,30 @@ sub parse_dimm_nodes - $dimm_label_file{$str_loc} = $file; - $dimm_location{$str_loc} = $location; - -+ my $count; -+ -+ $file =~s/dimm_label/dimm_ce_count/; -+ if (-e $file) { -+ open IN, $file; -+ chomp($count = ); -+ close IN; -+ } else { -+ log_error ("dimm_ce_count not found in sysfs. Old kernel?\n"); -+ exit -1; -+ } -+ $dimm_ce_count{$str_loc} = $count; -+ -+ $file =~s/dimm_ce_count/dimm_ue_count/; -+ if (-e $file) { -+ open IN, $file; -+ chomp($count = ); -+ close IN; -+ } else { -+ log_error ("dimm_ue_count not found in sysfs. Old kernel?\n"); -+ exit -1; -+ } -+ $dimm_ue_count{$str_loc} = $count; -+ - return; - } - } -@@ -906,6 +938,45 @@ sub display_memory_layout - dimm_display_mem(); - } - -+sub display_error_count -+{ -+ my $sysfs_dir = "/sys/devices/system/edac/mc"; -+ my $key; -+ my $max_width = 0; -+ my %dimm_labels = (); -+ -+ find ({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir); -+ -+ if (!scalar(keys %dimm_node)) { -+ log_error ("No DIMMs found in /sys or new sysfs EDAC interface not found.\n"); -+ exit -1; -+ } -+ -+ foreach $key (keys %dimm_node) { -+ my $label_width; -+ -+ open IN, $dimm_label_file{$key}; -+ chomp(my $label = ); -+ close IN; -+ $label_width = length $label; -+ -+ if ($label_width > $max_width) { -+ $max_width = $label_width; -+ } -+ $dimm_labels{$key} = $label; -+ } -+ my $string = "Label"; -+ $string .= " " x ($max_width - length $string); -+ print($string . "\tCE\tUE\n"); -+ -+ foreach $key (keys %dimm_node) { -+ my $ce_count = $dimm_ce_count{$key}; -+ my $ue_count = $dimm_ue_count{$key}; -+ -+ print("$dimm_labels{$key}\t$ce_count\t$ue_count\n"); -+ } -+} -+ - sub find_prog - { - my ($file) = @_; diff --git a/6bc43db1b6b3d73805179c21d1dd5521e8dc0f74.patch b/6bc43db1b6b3d73805179c21d1dd5521e8dc0f74.patch new file mode 100644 index 0000000..2d3bd32 --- /dev/null +++ b/6bc43db1b6b3d73805179c21d1dd5521e8dc0f74.patch @@ -0,0 +1,37 @@ +commit 6bc43db1b6b3d73805179c21d1dd5521e8dc0f74 +Author: DmNosachev +Date: Fri Jul 2 13:13:46 2021 +0300 + + labels/supermicro: added Supermicro X11SCA(-F) + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index b924a32..1e7761f 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -10,11 +10,7 @@ + # + + Vendor: Supermicro +- Model: A2SDi-8C-HLN4F +- DIMMA1: 0.0.0; DIMMA2: 0.0.1; +- DIMMB1: 0.1.0; DIMMB2: 0.1.1; +- +- Model: A2SDi-8C+-HLN4F ++ Model: A2SDi-8C-HLN4F, A2SDi-8C+-HLN4F + DIMMA1: 0.0.0; DIMMA2: 0.0.1; + DIMMB1: 0.1.0; DIMMB2: 0.1.1; + +@@ -115,4 +111,8 @@ Vendor: Supermicro + P2_DIMME1: 1.0.0; + P2_DIMMF1: 1.1.0; + P2_DIMMG1: 1.2.0; +- P2_DIMMH1: 1.3.0; +\ No newline at end of file ++ P2_DIMMH1: 1.3.0; ++ ++ Model: X11SCA, X11SCA-F ++ DIMMA1: 0.0.0, 0.1.0; DIMMA2: 0.2.0, 0.3.0; ++ DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1; +\ No newline at end of file diff --git a/738bafafdcb2e8b0ced32fff31b13754d571090b.patch b/738bafafdcb2e8b0ced32fff31b13754d571090b.patch new file mode 100644 index 0000000..a3ba324 --- /dev/null +++ b/738bafafdcb2e8b0ced32fff31b13754d571090b.patch @@ -0,0 +1,610 @@ +commit 738bafafdcb2e8b0ced32fff31b13754d571090b +Author: Jason Tian +Date: Fri May 28 11:35:43 2021 +0800 + + Add error handling for Ampere-specific errors. + + Save Ampere-specific errors' decode into sqlite3 data + base and log PCIe segment, bus/device/function number + into BMC SEL. + + Signed-off-by: Jason Tian + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/non-standard-ampere.c b/non-standard-ampere.c +index 8cceb26..05b5252 100644 +--- a/non-standard-ampere.c ++++ b/non-standard-ampere.c +@@ -216,6 +216,13 @@ static const char * const err_bert_sub_type[] = { + "PMPRO Fatal", + }; + ++static char *sqlite3_table_list[] = { ++ "amp_payload0_event_tab", ++ "amp_payload1_event_tab", ++ "amp_payload2_event_tab", ++ "amp_payload3_event_tab", ++}; ++ + struct amp_ras_type_info { + int id; + const char *name; +@@ -352,6 +359,359 @@ static const char *oem_subtype_name(const struct amp_ras_type_info *info, + return "unknown"; + } + ++#ifdef HAVE_SQLITE3 ++/*key pair definition for ampere specific error payload type 0*/ ++static const struct db_fields amp_payload0_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "type", .type = "TEXT" }, ++ { .name = "subtype", .type = "TEXT" }, ++ { .name = "instance", .type = "INTEGER" }, ++ { .name = "socket_num", .type = "INTEGER" }, ++ { .name = "status_reg", .type = "INTEGER" }, ++ { .name = "addr_reg", .type = "INTEGER" }, ++ { .name = "misc0", .type = "INTEGER" }, ++ { .name = "misc1", .type = "INTEGER" }, ++ { .name = "misc2", .type = "INTEGER" }, ++ { .name = "misc3", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor amp_payload0_event_tab = { ++ .name = "amp_payload0_event", ++ .fields = amp_payload0_event_fields, ++ .num_fields = ARRAY_SIZE(amp_payload0_event_fields), ++}; ++ ++/*key pair definition for ampere specific error payload type 1*/ ++static const struct db_fields amp_payload1_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "type", .type = "TEXT" }, ++ { .name = "subtype", .type = "TEXT" }, ++ { .name = "instance", .type = "INTEGER" }, ++ { .name = "socket_num", .type = "INTEGER" }, ++ { .name = "uncore_err_status", .type = "INTEGER" }, ++ { .name = "uncore_err_mask", .type = "INTEGER" }, ++ { .name = "uncore_err_sev", .type = "INTEGER" }, ++ { .name = "core_err_status", .type = "INTEGER" }, ++ { .name = "core_err_mask", .type = "INTEGER" }, ++ { .name = "root_err_cmd", .type = "INTEGER" }, ++ { .name = "root_err_status", .type = "INTEGER" }, ++ { .name = "src_id", .type = "INTEGER" }, ++ { .name = "reserved1", .type = "INTEGER" }, ++ { .name = "reserverd2", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor amp_payload1_event_tab = { ++ .name = "amp_payload1_event", ++ .fields = amp_payload1_event_fields, ++ .num_fields = ARRAY_SIZE(amp_payload1_event_fields), ++}; ++ ++/*key pair definition for ampere specific error payload type 2*/ ++static const struct db_fields amp_payload2_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "type", .type = "TEXT" }, ++ { .name = "subtype", .type = "TEXT" }, ++ { .name = "instance", .type = "INTEGER" }, ++ { .name = "socket_num", .type = "INTEGER" }, ++ { .name = "ce_report_reg", .type = "INTEGER" }, ++ { .name = "ce_location", .type = "INTEGER" }, ++ { .name = "ce_addr", .type = "INTEGER" }, ++ { .name = "ue_report_reg", .type = "INTEGER" }, ++ { .name = "ue_location", .type = "INTEGER" }, ++ { .name = "ue_addr", .type = "INTEGER" }, ++ { .name = "reserved1", .type = "INTEGER" }, ++ { .name = "reserved2", .type = "INTEGER" }, ++ { .name = "reserved2", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor amp_payload2_event_tab = { ++ .name = "amp_payload2_event", ++ .fields = amp_payload2_event_fields, ++ .num_fields = ARRAY_SIZE(amp_payload2_event_fields), ++}; ++ ++/*key pair definition for ampere specific error payload type 3*/ ++static const struct db_fields amp_payload3_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "type", .type = "TEXT" }, ++ { .name = "subtype", .type = "TEXT" }, ++ { .name = "instance", .type = "INTEGER" }, ++ { .name = "socket_num", .type = "INTEGER" }, ++ { .name = "fw_spec_data0", .type = "INTEGER" }, ++ { .name = "fw_spec_data1", .type = "INTEGER" }, ++ { .name = "fw_spec_data2", .type = "INTEGER" }, ++ { .name = "fw_spec_data3", .type = "INTEGER" }, ++ { .name = "fw_spec_data4", .type = "INTEGER" }, ++ { .name = "fw_spec_data5", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor amp_payload3_event_tab = { ++ .name = "amp_payload3_event", ++ .fields = amp_payload3_event_fields, ++ .num_fields = ARRAY_SIZE(amp_payload3_event_fields), ++}; ++ ++/*Save data with different type into sqlite3 db*/ ++static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder, ++ enum amp_oem_data_type data_type, ++ int id, int64_t data, const char *text) ++{ ++ switch (data_type) { ++ case AMP_OEM_DATA_TYPE_INT: ++ sqlite3_bind_int(ev_decoder->stmt_dec_record, id, data); ++ break; ++ case AMP_OEM_DATA_TYPE_INT64: ++ sqlite3_bind_int64(ev_decoder->stmt_dec_record, id, data); ++ break; ++ case AMP_OEM_DATA_TYPE_TEXT: ++ sqlite3_bind_text(ev_decoder->stmt_dec_record, id, ++ text, -1, NULL); ++ break; ++ default: ++ break; ++ } ++} ++ ++static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder, ++ const char *name) ++{ ++ int rc; ++ ++ rc = sqlite3_step(ev_decoder->stmt_dec_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do %s step on sqlite: error = %d\n", name, rc); ++ ++ rc = sqlite3_reset(ev_decoder->stmt_dec_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to reset %s on sqlite: error = %d\n", name, rc); ++ ++ rc = sqlite3_clear_bindings(ev_decoder->stmt_dec_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to clear bindings %s on sqlite: error = %d\n", ++ name, rc); ++ ++ return rc; ++} ++ ++/*save all Ampere Specific Error Payload type 0 to sqlite3 database*/ ++static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload0_type_sec *err) ++{ ++ if (ev_decoder != NULL) { ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD0_FIELD_TYPE, 0, type_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD0_FIELD_SUB_TYPE, 0, subtype_str); ++ ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD0_FIELD_INS, INSTANCE(err->instance), NULL); ++ ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD0_FIELD_SOCKET_NUM, ++ SOCKET_NUM(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD0_FIELD_STATUS_REG, err->err_status, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_ADDR_REG, ++ err->err_addr, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_MISC0, ++ err->err_misc_0, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_MISC1, ++ err->err_misc_1, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_MISC2, ++ err->err_misc_2, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD0_FIELD_MISC3, ++ err->err_misc_3, NULL); ++ store_amp_err_data(ev_decoder, "amp_payload0_event_tab"); ++ } ++} ++ ++/*save all Ampere Specific Error Payload type 1 to sqlite3 database*/ ++static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload1_type_sec *err) ++{ ++ if (ev_decoder != NULL) { ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD1_FIELD_TYPE, 0, type_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD1_FIELD_SUB_TYPE, 0, subtype_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_INS, ++ INSTANCE(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_SOCKET_NUM, ++ SOCKET_NUM(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_STATUS, ++ err->uncore_status, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_MASK, ++ err->uncore_mask, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_SEV, ++ err->uncore_sev, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_CORE_ERR_STATUS, ++ err->core_status, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_CORE_ERR_MASK, ++ err->core_mask, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_ROOT_ERR_CMD, ++ err->root_err_cmd, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_ROOT_ERR_STATUS, ++ err->root_status, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_SRC_ID, ++ err->src_id, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD1_FIELD_RESERVED1, ++ err->reserved1, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD1_FIELD_RESERVED2, ++ err->reserved2, NULL); ++ store_amp_err_data(ev_decoder, "amp_payload1_event_tab"); ++ } ++} ++ ++/*save all Ampere Specific Error Payload type 2 to sqlite3 database*/ ++static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload2_type_sec *err) ++{ ++ if (ev_decoder != NULL) { ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD2_FIELD_TYPE, 0, type_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD2_FIELD_SUB_TYPE, 0, subtype_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_INS, INSTANCE(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_SOCKET_NUM, ++ SOCKET_NUM(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_CE_REPORT_REG, ++ err->ce_register, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_CE_LOACATION, ++ err->ce_location, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_CE_ADDR, ++ err->ce_addr, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_UE_REPORT_REG, ++ err->ue_register, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_UE_LOCATION, ++ err->ue_location, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_UE_ADDR, ++ err->ue_addr, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD2_FIELD_RESERVED1, ++ err->reserved1, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD2_FIELD_RESERVED2, ++ err->reserved2, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD2_FIELD_RESERVED3, ++ err->reserved3, NULL); ++ store_amp_err_data(ev_decoder, "amp_payload2_event_tab"); ++ } ++} ++ ++/*save all Ampere Specific Error Payload type 3 to sqlite3 database*/ ++static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload3_type_sec *err) ++{ ++ if (ev_decoder != NULL) { ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD3_FIELD_TYPE, 0, type_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ AMP_PAYLOAD3_FIELD_SUB_TYPE, 0, subtype_str); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD3_FIELD_INS, INSTANCE(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD3_FIELD_SOCKET_NUM, ++ SOCKET_NUM(err->instance), NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0, ++ err->fw_speci_data0, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1, ++ err->fw_speci_data1, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2, ++ err->fw_speci_data2, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3, ++ err->fw_speci_data3, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4, ++ err->fw_speci_data4, NULL); ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5, ++ err->fw_speci_data5, NULL); ++ store_amp_err_data(ev_decoder, "amp_payload3_event_tab"); ++ } ++} ++ ++#else ++static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder, ++ enum amp_oem_data_type data_type, ++ int id, int64_t data, const char *text) ++{ ++ return 0; ++} ++ ++static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload0_type_sec *err) ++{ ++ return 0; ++} ++ ++static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload1_type_sec *err) ++{ ++ return 0; ++} ++ ++static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload2_type_sec *err) ++{ ++ return 0; ++} ++ ++static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder, ++ const char *type_str, const char *subtype_str, ++ const struct amp_payload3_type_sec *err) ++{ ++ return 0; ++} ++ ++static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder, char *name) ++{ ++ return 0; ++} ++#endif + + /*decode ampere specific error payload type 0, the CPU's data is save*/ + /*to sqlite by ras-arm-handler, others are saved by this function.*/ +@@ -434,6 +794,7 @@ void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder, + *p = '\0'; + } + ++ record_amp_payload0_err(ev_decoder, type_str, subtype_str, err); + i = 0; + p = NULL; + end = NULL; +@@ -517,6 +878,7 @@ static void decode_amp_payload1_err_regs(struct ras_ns_ev_decoder *ev_decoder, + *p = '\0'; + } + ++ record_amp_payload1_err(ev_decoder, type_str, subtype_str, err); + i = 0; + p = NULL; + end = NULL; +@@ -601,6 +963,7 @@ static void decode_amp_payload2_err_regs(struct ras_ns_ev_decoder *ev_decoder, + *p = '\0'; + } + ++ record_amp_payload2_err(ev_decoder, type_str, subtype_str, err); + i = 0; + p = NULL; + end = NULL; +@@ -673,6 +1036,7 @@ static void decode_amp_payload3_err_regs(struct ras_ns_ev_decoder *ev_decoder, + *p = '\0'; + } + ++ record_amp_payload3_err(ev_decoder, type_str, subtype_str, err); + i = 0; + p = NULL; + end = NULL; +@@ -687,6 +1051,38 @@ static int decode_amp_oem_type_error(struct ras_events *ras, + { + int payload_type = PAYLOAD_TYPE(event->error[0]); + ++#ifdef HAVE_SQLITE3 ++ struct db_table_descriptor db_tab; ++ int id = 0; ++ ++ if (payload_type == PAYLOAD_TYPE_0) { ++ db_tab = amp_payload0_event_tab; ++ id = AMP_PAYLOAD0_FIELD_TIMESTAMP; ++ } else if (payload_type == PAYLOAD_TYPE_1) { ++ db_tab = amp_payload1_event_tab; ++ id = AMP_PAYLOAD1_FIELD_TIMESTAMP; ++ } else if (payload_type == PAYLOAD_TYPE_2) { ++ db_tab = amp_payload2_event_tab; ++ id = AMP_PAYLOAD2_FIELD_TIMESTAMP; ++ } else if (payload_type == PAYLOAD_TYPE_3) { ++ db_tab = amp_payload3_event_tab; ++ id = AMP_PAYLOAD3_FIELD_TIMESTAMP; ++ } else ++ return -1; ++ ++ if (!ev_decoder->stmt_dec_record) { ++ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, ++ &db_tab) != SQLITE_OK) { ++ trace_seq_printf(s, ++ "create sql %s fail\n", ++ sqlite3_table_list[payload_type]); ++ return -1; ++ } ++ } ++ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, ++ id, 0, event->timestamp); ++#endif ++ + if (payload_type == PAYLOAD_TYPE_0) { + const struct amp_payload0_type_sec *err = + (struct amp_payload0_type_sec *)event->error; +diff --git a/non-standard-ampere.h b/non-standard-ampere.h +index aacf3a8..f463c53 100644 +--- a/non-standard-ampere.h ++++ b/non-standard-ampere.h +@@ -102,6 +102,79 @@ struct amp_payload3_type_sec { + uint64_t fw_speci_data5; + }; + ++enum amp_oem_data_type { ++ AMP_OEM_DATA_TYPE_INT, ++ AMP_OEM_DATA_TYPE_INT64, ++ AMP_OEM_DATA_TYPE_TEXT, ++}; ++ ++enum { ++ AMP_PAYLOAD0_FIELD_ID, ++ AMP_PAYLOAD0_FIELD_TIMESTAMP, ++ AMP_PAYLOAD0_FIELD_TYPE, ++ AMP_PAYLOAD0_FIELD_SUB_TYPE, ++ AMP_PAYLOAD0_FIELD_INS, ++ AMP_PAYLOAD0_FIELD_SOCKET_NUM, ++ AMP_PAYLOAD0_FIELD_STATUS_REG, ++ AMP_PAYLOAD0_FIELD_ADDR_REG, ++ AMP_PAYLOAD0_FIELD_MISC0, ++ AMP_PAYLOAD0_FIELD_MISC1, ++ AMP_PAYLOAD0_FIELD_MISC2, ++ AMP_PAYLOAD0_FIELD_MISC3, ++}; ++ ++enum { ++ AMP_PAYLOAD1_FIELD_ID, ++ AMP_PAYLOAD1_FIELD_TIMESTAMP, ++ AMP_PAYLOAD1_FIELD_TYPE, ++ AMP_PAYLOAD1_FIELD_SUB_TYPE, ++ AMP_PAYLOAD1_FIELD_INS, ++ AMP_PAYLOAD1_FIELD_SOCKET_NUM, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_STATUS, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_MASK, ++ AMP_PAYLOAD1_FIELD_UNCORE_ERR_SEV, ++ AMP_PAYLOAD1_FIELD_CORE_ERR_STATUS, ++ AMP_PAYLOAD1_FIELD_CORE_ERR_MASK, ++ AMP_PAYLOAD1_FIELD_ROOT_ERR_CMD, ++ AMP_PAYLOAD1_FIELD_ROOT_ERR_STATUS, ++ AMP_PAYLOAD1_FIELD_SRC_ID, ++ AMP_PAYLOAD1_FIELD_RESERVED1, ++ AMP_PAYLOAD1_FIELD_RESERVED2, ++}; ++ ++enum { ++ AMP_PAYLOAD2_FIELD_ID, ++ AMP_PAYLOAD2_FIELD_TIMESTAMP, ++ AMP_PAYLOAD2_FIELD_TYPE, ++ AMP_PAYLOAD2_FIELD_SUB_TYPE, ++ AMP_PAYLOAD2_FIELD_INS, ++ AMP_PAYLOAD2_FIELD_SOCKET_NUM, ++ AMP_PAYLOAD2_FIELD_CE_REPORT_REG, ++ AMP_PAYLOAD2_FIELD_CE_LOACATION, ++ AMP_PAYLOAD2_FIELD_CE_ADDR, ++ AMP_PAYLOAD2_FIELD_UE_REPORT_REG, ++ AMP_PAYLOAD2_FIELD_UE_LOCATION, ++ AMP_PAYLOAD2_FIELD_UE_ADDR, ++ AMP_PAYLOAD2_FIELD_RESERVED1, ++ AMP_PAYLOAD2_FIELD_RESERVED2, ++ AMP_PAYLOAD2_FIELD_RESERVED3, ++}; ++ ++enum { ++ AMP_PAYLOAD3_FIELD_ID, ++ AMP_PAYLOAD3_FIELD_TIMESTAMP, ++ AMP_PAYLOAD3_FIELD_TYPE, ++ AMP_PAYLOAD3_FIELD_SUB_TYPE, ++ AMP_PAYLOAD3_FIELD_INS, ++ AMP_PAYLOAD3_FIELD_SOCKET_NUM, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4, ++ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5 ++}; ++ + void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder, + struct trace_seq *s, + const struct amp_payload0_type_sec *err); +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index 8ddd439..6f4cb2b 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -67,6 +67,9 @@ int ras_aer_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_aer_event ev; + char buf[BUF_LEN]; ++ char ipmi_add_sel[105]; ++ uint8_t sel_data[5]; ++ int seg, bus, dev, fn; + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. +@@ -129,15 +132,19 @@ int ras_aer_event_handler(struct trace_seq *s, + switch (severity_val) { + case HW_EVENT_AER_UNCORRECTED_NON_FATAL: + ev.error_type = "Uncorrected (Non-Fatal)"; ++ sel_data[0] = 0xca; + break; + case HW_EVENT_AER_UNCORRECTED_FATAL: + ev.error_type = "Uncorrected (Fatal)"; ++ sel_data[0] = 0xca; + break; + case HW_EVENT_AER_CORRECTED: + ev.error_type = "Corrected"; ++ sel_data[0] = 0xbf; + break; + default: + ev.error_type = "Unknown severity"; ++ sel_data[0] = 0xbf; + } + trace_seq_puts(s, ev.error_type); + +@@ -151,5 +158,29 @@ int ras_aer_event_handler(struct trace_seq *s, + ras_report_aer_event(ras, &ev); + #endif + ++#ifdef HAVE_AMP_NS_DECODE ++ /* ++ * Get PCIe AER error source seg/bus/dev/fn and save it into ++ * BMC OEM SEL, ipmitool raw 0x0a 0x44 is IPMI command-Add SEL ++ * entry, please refer IPMI specificaiton chapter 31.6. 0xcd3a ++ * is manufactuer ID(ampere),byte 12 is sensor num(CE is 0xBF, ++ * UE is 0xCA), byte 13~14 is segment number, byte 15 is bus ++ * number, byte 16[7:3] is device number, byte 16[2:0] is ++ * function number ++ */ ++ sscanf(ev.dev_name, "%x:%x:%x.%x", &seg, &bus, &dev, &fn); ++ ++ sel_data[1] = seg & 0xff; ++ sel_data[2] = (seg & 0xff00) >> 8; ++ sel_data[3] = bus; ++ sel_data[4] = (((dev & 0x1f) << 3) | (fn & 0x7)); ++ ++ sprintf(ipmi_add_sel, ++ "ipmitool raw 0x0a 0x44 0x00 0x00 0xc0 0x00 0x00 0x00 0x00 0x3a 0xcd 0x00 0xc0 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x", ++ sel_data[0], sel_data[1], sel_data[2], sel_data[3], sel_data[4]); ++ ++ system(ipmi_add_sel); ++#endif ++ + return 0; + } diff --git a/7ccf12f5ae26a055926d175d908c7930293438c4.patch b/7ccf12f5ae26a055926d175d908c7930293438c4.patch new file mode 100644 index 0000000..5a7a860 --- /dev/null +++ b/7ccf12f5ae26a055926d175d908c7930293438c4.patch @@ -0,0 +1,26 @@ +commit 7ccf12f5ae26a055926d175d908c7930293438c4 +Author: DmNosachev +Date: Fri Jul 23 17:28:33 2021 +0300 + + labels/supermicro: added Supermicro X11SCW + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 990fc9e..aea7c3c 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -133,4 +133,10 @@ Vendor: Supermicro + + Model: X11SCA, X11SCA-F + DIMMA1: 0.0.0, 0.1.0; DIMMA2: 0.2.0, 0.3.0; +- DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1; +\ No newline at end of file ++ DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1; ++ ++ Model: X11SCW-F ++ DIMMA1: 0.1.0; ++ DIMMA2: 0.0.0; ++ DIMMB1: 0.1.1; ++ DIMMB2: 0.0.1; +\ No newline at end of file diff --git a/854364ba44aee9bc5646f6537fc744b0b54aff37.patch b/854364ba44aee9bc5646f6537fc744b0b54aff37.patch deleted file mode 100644 index 91bad1b..0000000 --- a/854364ba44aee9bc5646f6537fc744b0b54aff37.patch +++ /dev/null @@ -1,38 +0,0 @@ -commit 854364ba44aee9bc5646f6537fc744b0b54aff37 -Author: Muralidhara M K -Date: Thu Aug 20 21:00:57 2020 +0530 - - rasdaemon: Add 8 channel decoding for SMCA systems - - Current Scalable Machine Check Architecture (SMCA) systems support up - to 8 UMC channels. - - To find the UMC channel represented by a bank, look at the 6th nibble - in the MCA_IPID[InstanceId] field. - - Signed-off-by: Muralidhara M K - [ Adjust commit message. ] - Signed-off-by: Yazen Ghannam - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index d0b6cb6..7c619fd 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -438,15 +438,7 @@ static void amd_decode_errcode(struct mce_event *e) - */ - static int find_umc_channel(struct mce_event *e) - { -- uint32_t umc_instance_id[] = {0x50f00, 0x150f00}; -- uint32_t instance_id = EXTRACT(e->ipid, 0, 31); -- int i, channel = -1; -- -- for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++) -- if (umc_instance_id[i] == instance_id) -- channel = i; -- -- return channel; -+ return EXTRACT(e->ipid, 0, 31) >> 20; - } - /* Decode extended errors according to Scalable MCA specification */ - static void decode_smca_error(struct mce_event *e) diff --git a/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch b/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch deleted file mode 100644 index e3617fc..0000000 --- a/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch +++ /dev/null @@ -1,207 +0,0 @@ -commit 8704a85d8dc3483423ec2934fee8132f85f8fdb6 -Author: Brian WoodsGhannam, Yazen -Date: Fri Nov 1 15:48:14 2019 +0100 - - rasdaemon: add support for new AMD SMCA bank types - - Going forward, the Scalable Machine Check Architecture (SMCA) has some - updated and additional bank types which show up in Zen2. The differing - bank types include: CS_V2, PSP_V2, SMU_V2, MP5, NBIO, and PCIE. The V2 - bank types replace the original bank types but have unique HWID/MCAtype - IDs from the originals so there's no conflicts between different - versions or other bank types. All of the differing bank types have new - MCE descriptions which have been added as well. - - CC: "mchehab+samsung@kernel.org" , "Namburu, Chandu-babu" # Thread-Topic: [PATCH 2/2] rasdaemon: add support for new AMD SMCA bank types - Signed-off-by: Brian Woods - Signed-off-by: Yazen Ghannam - Cc: Chandu-babu Namburu - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 6c3e8a5..114e786 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -49,11 +49,17 @@ enum smca_bank_types { - SMCA_FP, /* Floating Point */ - SMCA_L3_CACHE, /* L3 Cache */ - SMCA_CS, /* Coherent Slave */ -+ SMCA_CS_V2, /* Coherent Slave V2 */ - SMCA_PIE, /* Power, Interrupts, etc. */ - SMCA_UMC, /* Unified Memory Controller */ - SMCA_PB, /* Parameter Block */ - SMCA_PSP, /* Platform Security Processor */ -+ SMCA_PSP_V2, /* Platform Security Processor V2 */ - SMCA_SMU, /* System Management Unit */ -+ SMCA_SMU_V2, /* System Management Unit V2 */ -+ SMCA_MP5, /* Microprocessor 5 Unit */ -+ SMCA_NBIO, /* Northbridge IO Unit */ -+ SMCA_PCIE, /* PCI Express Unit */ - N_SMCA_BANK_TYPES - }; - -@@ -165,6 +171,23 @@ static const char * const smca_cs_mce_desc[] = { - "Atomic request parity", - "ECC error on probe filter access", - }; -+/* Coherent Slave Unit V2 */ -+static const char * const smca_cs2_mce_desc[] = { -+ "Illegal Request", -+ "Address Violation", -+ "Security Violation", -+ "Illegal Response", -+ "Unexpected Response", -+ "Request or Probe Parity Error", -+ "Read Response Parity Error", -+ "Atomic Request Parity Error", -+ "SDP read response had no match in the CS queue", -+ "Probe Filter Protocol Error", -+ "Probe Filter ECC Error", -+ "SDP read response had an unexpected RETRY error", -+ "Counter overflow error", -+ "Counter underflow error", -+}; - /* Power, Interrupt, etc.. */ - static const char * const smca_pie_mce_desc[] = { - "HW assert", -@@ -189,10 +212,75 @@ static const char * const smca_pb_mce_desc[] = { - static const char * const smca_psp_mce_desc[] = { - "PSP RAM ECC or parity error", - }; -+/* Platform Security Processor V2 */ -+static const char * const smca_psp2_mce_desc[] = { -+ "High SRAM ECC or parity error", -+ "Low SRAM ECC or parity error", -+ "Instruction Cache Bank 0 ECC or parity error", -+ "Instruction Cache Bank 1 ECC or parity error", -+ "Instruction Tag Ram 0 parity error", -+ "Instruction Tag Ram 1 parity error", -+ "Data Cache Bank 0 ECC or parity error", -+ "Data Cache Bank 1 ECC or parity error", -+ "Data Cache Bank 2 ECC or parity error", -+ "Data Cache Bank 3 ECC or parity error", -+ "Data Tag Bank 0 parity error", -+ "Data Tag Bank 1 parity error", -+ "Data Tag Bank 2 parity error", -+ "Data Tag Bank 3 parity error", -+ "Dirty Data Ram parity error", -+ "TLB Bank 0 parity error", -+ "TLB Bank 1 parity error", -+ "System Hub Read Buffer ECC or parity error", -+}; - /* System Management Unit */ - static const char * const smca_smu_mce_desc[] = { - "SMU RAM ECC or parity error", - }; -+/* System Management Unit V2 */ -+static const char * const smca_smu2_mce_desc[] = { -+ "High SRAM ECC or parity error", -+ "Low SRAM ECC or parity error", -+ "Data Cache Bank A ECC or parity error", -+ "Data Cache Bank B ECC or parity error", -+ "Data Tag Cache Bank A ECC or parity error", -+ "Data Tag Cache Bank B ECC or parity error", -+ "Instruction Cache Bank A ECC or parity error", -+ "Instruction Cache Bank B ECC or parity error", -+ "Instruction Tag Cache Bank A ECC or parity error", -+ "Instruction Tag Cache Bank B ECC or parity error", -+ "System Hub Read Buffer ECC or parity error", -+}; -+/* Microprocessor 5 Unit */ -+static const char * const smca_mp5_mce_desc[] = { -+ "High SRAM ECC or parity error", -+ "Low SRAM ECC or parity error", -+ "Data Cache Bank A ECC or parity error", -+ "Data Cache Bank B ECC or parity error", -+ "Data Tag Cache Bank A ECC or parity error", -+ "Data Tag Cache Bank B ECC or parity error", -+ "Instruction Cache Bank A ECC or parity error", -+ "Instruction Cache Bank B ECC or parity error", -+ "Instruction Tag Cache Bank A ECC or parity error", -+ "Instruction Tag Cache Bank B ECC or parity error", -+}; -+/* Northbridge IO Unit */ -+static const char * const smca_nbio_mce_desc[] = { -+ "ECC or Parity error", -+ "PCIE error", -+ "SDP ErrEvent error", -+ "SDP Egress Poison Error", -+ "IOHC Internal Poison Error", -+}; -+/* PCI Express Unit */ -+static const char * const smca_pcie_mce_desc[] = { -+ "CCIX PER Message logging", -+ "CCIX Read Response with Status: Non-Data Error", -+ "CCIX Write Response with Status: Non-Data Error", -+ "CCIX Read Response with Status: Data Error", -+ "CCIX Non-okay write response with data error", -+}; -+ - - struct smca_mce_desc { - const char * const *descs; -@@ -208,11 +296,17 @@ static struct smca_mce_desc smca_mce_descs[] = { - [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, - [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, - [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, -+ [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, - [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, - [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, - [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, - [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, -+ [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)}, - [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, -+ [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)}, -+ [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, -+ [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)}, -+ [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)}, - }; - - struct smca_hwid { -@@ -235,6 +329,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* Data Fabric MCA types */ - { SMCA_CS, 0x0000002E }, -+ { SMCA_CS_V2, 0x0002002E }, - { SMCA_PIE, 0x0001002E }, - - /* Unified Memory Controller MCA type */ -@@ -245,9 +340,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* Platform Security Processor MCA type */ - { SMCA_PSP, 0x000000FF }, -+ { SMCA_PSP_V2, 0x000100FF }, - - /* System Management Unit MCA type */ - { SMCA_SMU, 0x00000001 }, -+ { SMCA_SMU_V2, 0x00010001 }, -+ -+ /* Microprocessor 5 Unit MCA type */ -+ { SMCA_MP5, 0x00020001 }, -+ -+ /* Northbridge IO Unit MCA type */ -+ { SMCA_NBIO, 0x00000018 }, -+ -+ /* PCI Express Unit MCA type */ -+ { SMCA_PCIE, 0x00000046 }, - }; - - struct smca_bank_name { -@@ -264,11 +370,17 @@ static struct smca_bank_name smca_names[] = { - [SMCA_FP] = { "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "L3 Cache" }, - [SMCA_CS] = { "Coherent Slave" }, -+ [SMCA_CS_V2] = { "Coherent Slave" }, - [SMCA_PIE] = { "Power, Interrupts, etc." }, - [SMCA_UMC] = { "Unified Memory Controller" }, - [SMCA_PB] = { "Parameter Block" }, - [SMCA_PSP] = { "Platform Security Processor" }, -+ [SMCA_PSP_V2] = { "Platform Security Processor" }, - [SMCA_SMU] = { "System Management Unit" }, -+ [SMCA_SMU_V2] = { "System Management Unit" }, -+ [SMCA_MP5] = { "Microprocessor 5 Unit" }, -+ [SMCA_NBIO] = { "Northbridge IO Unit" }, -+ [SMCA_PCIE] = { "PCI Express Unit" }, - }; - - static void amd_decode_errcode(struct mce_event *e) diff --git a/9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch b/9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch new file mode 100644 index 0000000..5267fc8 --- /dev/null +++ b/9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch @@ -0,0 +1,51 @@ +commit 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b +Author: Muralidhara M K +Date: Tue Jul 27 06:36:45 2021 -0500 + + rasdaemon: ras-mc-ctl: Fix script to parse dimm sizes + + Removes trailing spaces at the end of a line from + file location and fixes --layout option to parse dimm nodes + to get the size of each dimm from ras-mc-ctl. + + Issue is reported https://github.com/mchehab/rasdaemon/issues/43 + Where '> ras-mc-ctl --layout' reports all 0s + + With this change the layout option prints the correct dimm sizes + > sudo ras-mc-ctl --layout + +-----------------------------------------------+ + | mc0 | + | csrow0 | csrow1 | csrow2 | csrow3 | + ----------+-----------------------------------------------+ + ... + channel7: | 16384 MB | 0 MB | 0 MB | 0 MB | + channel6: | 16384 MB | 0 MB | 0 MB | 0 MB | + ... + ----------+-----------------------------------------------+ + + Signed-off-by: Muralidhara M K + Signed-off-by: Naveen Krishna Chatradhi + Cc: Yazen Ghannam + Signed-off-by: Mauro Carvalho Chehab + Link: https://lkml.kernel.org/r/20210810183855.129076-1-nchatrad@amd.com/ + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 1e3aeb7..b22dd60 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -246,6 +246,7 @@ sub parse_dimm_nodes + if (($file =~ /max_location$/)) { + open IN, $file; + my $location = ; ++ $location =~ s/\s+$//; + close IN; + my @temp = split(/ /, $location); + +@@ -288,6 +289,7 @@ sub parse_dimm_nodes + + open IN, $file; + my $location = ; ++ $location =~ s/\s+$//; + close IN; + + my @pos; diff --git a/9a5baed97b21af31064d9995ffcfaac0e9d7983e.patch b/9a5baed97b21af31064d9995ffcfaac0e9d7983e.patch new file mode 100644 index 0000000..1a221ea --- /dev/null +++ b/9a5baed97b21af31064d9995ffcfaac0e9d7983e.patch @@ -0,0 +1,40 @@ +commit 9a5baed97b21af31064d9995ffcfaac0e9d7983e +Author: DmNosachev +Date: Tue Jun 29 13:37:48 2021 +0300 + + labels/supermicro: supermicro db syntax + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index bfaed93..47ea05f 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -18,17 +18,17 @@ Vendor: Supermicro + DIMMA1: 0.0.0; DIMMA2: 0.0.1; + DIMMB1: 0.1.0; DIMMB2: 0.1.1; + +- Product: X10SRA-F +- DIMMA1: 0.0.0 +- DIMMA2: 0.0.1 +- DIMMB1: 0.1.0 +- DIMMB2: 0.1.1 +- DIMMC1: 1.0.0 +- DIMMC2: 1.0.1 +- DIMMD1: 1.1.0 +- DIMMD2: 1.1.1 ++ Model: X10SRA-F ++ DIMMA1: 0.0.0; ++ DIMMA2: 0.0.1; ++ DIMMB1: 0.1.0; ++ DIMMB2: 0.1.1; ++ DIMMC1: 1.0.0; ++ DIMMC2: 1.0.1; ++ DIMMD1: 1.1.0; ++ DIMMD2: 1.1.1; + +- Product: H8DGU ++ Model: H8DGU + P1_DIMM1A: 0.2.0; + P1_DIMM1A: 0.3.0; + P2_DIMM1A: 3.2.0; diff --git a/a16ca0711001957ee98f2c124abce0fa1f801529.patch b/a16ca0711001957ee98f2c124abce0fa1f801529.patch deleted file mode 100644 index 3a96263..0000000 --- a/a16ca0711001957ee98f2c124abce0fa1f801529.patch +++ /dev/null @@ -1,670 +0,0 @@ -commit a16ca0711001957ee98f2c124abce0fa1f801529 -Author: Chandu-babu Namburu -Date: Wed Jan 30 20:36:45 2019 +0530 - - rasdaemon: add support for AMD Scalable MCA - - Add logic here to decode errors from all known IP blocks for - AMD Scalable MCA supported processors - - Reviewed-by: Yazen Ghannam - Signed-off-by: Chandu-babu Namburu - ---- - mce-amd-smca.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ - mce-amd.c | 122 +++++++++++++++++ - ras-mce-handler.c | 24 +++ - ras-mce-handler.h | 15 ++ - 4 files changed, 530 insertions(+), 2 deletions(-) - ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/mce-amd-smca.c 2019-07-12 11:35:04.836470461 -0400 -@@ -0,0 +1,371 @@ -+/* -+ * Copyright (c) 2018, AMD, Inc. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 and -+ * only version 2 as published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#include -+#include -+ -+#include "ras-mce-handler.h" -+#include "bitfield.h" -+ -+/* MCA_STATUS REGISTER FOR FAMILY 17H -+ *********************** Higher 32-bits ***************************** -+ * 63: VALIDERROR, 62: OVERFLOW, 61: UC, 60: Err ENABLE, -+ * 59: Misc Valid, 58: Addr Valid, 57: PCC, 56: ErrCoreID Valid, -+ * 55: TCC, 54: RES, 53: Syndrom Valid, 52: Transparanet, -+ * 51: RES, 50: RES, 49: RES, 48: RES, -+ * 47: RES, 46: CECC, 45: UECC, 44: Deferred, -+ * 43: Poison, 42: RES, 41: RES, 40: RES, -+ * 39: RES, 38: RES, 37: ErrCoreID[5], 36: ErrCoreID[4], -+ * 35: ErrCoreID[3], 34: ErrCoreID[2] 33: ErrCoreID[1] 32: ErrCoreID[0] -+ *********************** Lower 32-bits ****************************** -+ * 31: RES, 30: RES, 29: RES, 28: RES, -+ * 27: RES, 26: RES, 25: RES, 24: RES -+ * 23: RES, 22: RES, 21: XEC[5], 20: XEC[4], -+ * 19: XEC[3], 18: XEC[2], 17: XEC[1], 16: XEC[0] -+ * 15: EC[15], 14: EC[14], 13: EC[13], 12: EC[12], -+ * 11: EC[11], 10: EC[10], 09: EC[9], 08: EC[8], -+ * 07: EC[7], 06: EC[6], 05: EC[5], 04: EC[4], -+ * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0] -+ */ -+ -+/* These may be used by multiple smca_hwid_mcatypes */ -+enum smca_bank_types { -+ SMCA_LS = 0, /* Load Store */ -+ SMCA_IF, /* Instruction Fetch */ -+ SMCA_L2_CACHE, /* L2 Cache */ -+ SMCA_DE, /* Decoder Unit */ -+ SMCA_RESERVED, /* Reserved */ -+ SMCA_EX, /* Execution Unit */ -+ SMCA_FP, /* Floating Point */ -+ SMCA_L3_CACHE, /* L3 Cache */ -+ SMCA_CS, /* Coherent Slave */ -+ SMCA_PIE, /* Power, Interrupts, etc. */ -+ SMCA_UMC, /* Unified Memory Controller */ -+ SMCA_PB, /* Parameter Block */ -+ SMCA_PSP, /* Platform Security Processor */ -+ SMCA_SMU, /* System Management Unit */ -+ N_SMCA_BANK_TYPES -+}; -+ -+/* SMCA Extended error strings */ -+/* Load Store */ -+static const char * const smca_ls_mce_desc[] = { -+ "Load queue parity", -+ "Store queue parity", -+ "Miss address buffer payload parity", -+ "L1 TLB parity", -+ "Reserved", -+ "DC tag error type 6", -+ "DC tag error type 1", -+ "Internal error type 1", -+ "Internal error type 2", -+ "Sys Read data error thread 0", -+ "Sys read data error thread 1", -+ "DC tag error type 2", -+ "DC data error type 1 (poison consumption)", -+ "DC data error type 2", -+ "DC data error type 3", -+ "DC tag error type 4", -+ "L2 TLB parity", -+ "PDC parity error", -+ "DC tag error type 3", -+ "DC tag error type 5", -+ "L2 fill data error", -+}; -+/* Instruction Fetch */ -+static const char * const smca_if_mce_desc[] = { -+ "microtag probe port parity error", -+ "IC microtag or full tag multi-hit error", -+ "IC full tag parity", -+ "IC data array parity", -+ "Decoupling queue phys addr parity error", -+ "L0 ITLB parity error", -+ "L1 ITLB parity error", -+ "L2 ITLB parity error", -+ "BPQ snoop parity on Thread 0", -+ "BPQ snoop parity on Thread 1", -+ "L1 BTB multi-match error", -+ "L2 BTB multi-match error", -+ "L2 Cache Response Poison error", -+ "System Read Data error", -+}; -+/* L2 Cache */ -+static const char * const smca_l2_mce_desc[] = { -+ "L2M tag multi-way-hit error", -+ "L2M tag ECC error", -+ "L2M data ECC error", -+ "HW assert", -+}; -+/* Decoder Unit */ -+static const char * const smca_de_mce_desc[] = { -+ "uop cache tag parity error", -+ "uop cache data parity error", -+ "Insn buffer parity error", -+ "uop queue parity error", -+ "Insn dispatch queue parity error", -+ "Fetch address FIFO parity", -+ "Patch RAM data parity", -+ "Patch RAM sequencer parity", -+ "uop buffer parity" -+}; -+/* Execution Unit */ -+static const char * const smca_ex_mce_desc[] = { -+ "Watchdog timeout error", -+ "Phy register file parity", -+ "Flag register file parity", -+ "Immediate displacement register file parity", -+ "Address generator payload parity", -+ "EX payload parity", -+ "Checkpoint queue parity", -+ "Retire dispatch queue parity", -+ "Retire status queue parity error", -+ "Scheduling queue parity error", -+ "Branch buffer queue parity error", -+}; -+/* Floating Point Unit */ -+static const char * const smca_fp_mce_desc[] = { -+ "Physical register file parity", -+ "Freelist parity error", -+ "Schedule queue parity", -+ "NSQ parity error", -+ "Retire queue parity", -+ "Status register file parity", -+ "Hardware assertion", -+}; -+/* L3 Cache */ -+static const char * const smca_l3_mce_desc[] = { -+ "Shadow tag macro ECC error", -+ "Shadow tag macro multi-way-hit error", -+ "L3M tag ECC error", -+ "L3M tag multi-way-hit error", -+ "L3M data ECC error", -+ "XI parity, L3 fill done channel error", -+ "L3 victim queue parity", -+ "L3 HW assert", -+}; -+/* Coherent Slave Unit */ -+static const char * const smca_cs_mce_desc[] = { -+ "Illegal request from transport layer", -+ "Address violation", -+ "Security violation", -+ "Illegal response from transport layer", -+ "Unexpected response", -+ "Parity error on incoming request or probe response data", -+ "Parity error on incoming read response data", -+ "Atomic request parity", -+ "ECC error on probe filter access", -+}; -+/* Power, Interrupt, etc.. */ -+static const char * const smca_pie_mce_desc[] = { -+ "HW assert", -+ "Internal PIE register security violation", -+ "Error on GMI link", -+ "Poison data written to internal PIE register", -+}; -+/* Unified Memory Controller */ -+static const char * const smca_umc_mce_desc[] = { -+ "DRAM ECC error", -+ "Data poison error on DRAM", -+ "SDP parity error", -+ "Advanced peripheral bus error", -+ "Command/address parity error", -+ "Write data CRC error", -+}; -+/* Parameter Block */ -+static const char * const smca_pb_mce_desc[] = { -+ "Parameter Block RAM ECC error", -+}; -+/* Platform Security Processor */ -+static const char * const smca_psp_mce_desc[] = { -+ "PSP RAM ECC or parity error", -+}; -+/* System Management Unit */ -+static const char * const smca_smu_mce_desc[] = { -+ "SMU RAM ECC or parity error", -+}; -+ -+struct smca_mce_desc { -+ const char * const *descs; -+ unsigned int num_descs; -+}; -+ -+static struct smca_mce_desc smca_mce_descs[] = { -+ [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, -+ [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, -+ [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, -+ [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, -+ [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, -+ [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, -+ [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, -+ [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, -+ [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, -+ [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, -+ [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, -+ [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, -+ [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, -+}; -+ -+struct smca_hwid { -+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/ -+ uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/ -+}; -+ -+static struct smca_hwid smca_hwid_mcatypes[] = { -+ /* { bank_type, mcatype_hwid } */ -+ -+ /* ZN Core (HWID=0xB0) MCA types */ -+ { SMCA_LS, 0x000000B0 }, -+ { SMCA_IF, 0x000100B0 }, -+ { SMCA_L2_CACHE, 0x000200B0 }, -+ { SMCA_DE, 0x000300B0 }, -+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */ -+ { SMCA_EX, 0x000500B0 }, -+ { SMCA_FP, 0x000600B0 }, -+ { SMCA_L3_CACHE, 0x000700B0 }, -+ -+ /* Data Fabric MCA types */ -+ { SMCA_CS, 0x0000002E }, -+ { SMCA_PIE, 0x0001002E }, -+ -+ /* Unified Memory Controller MCA type */ -+ { SMCA_UMC, 0x00000096 }, -+ -+ /* Parameter Block MCA type */ -+ { SMCA_PB, 0x00000005 }, -+ -+ /* Platform Security Processor MCA type */ -+ { SMCA_PSP, 0x000000FF }, -+ -+ /* System Management Unit MCA type */ -+ { SMCA_SMU, 0x00000001 }, -+}; -+ -+struct smca_bank_name { -+ const char *name; -+}; -+ -+static struct smca_bank_name smca_names[] = { -+ [SMCA_LS] = { "Load Store Unit" }, -+ [SMCA_IF] = { "Instruction Fetch Unit" }, -+ [SMCA_L2_CACHE] = { "L2 Cache" }, -+ [SMCA_DE] = { "Decode Unit" }, -+ [SMCA_RESERVED] = { "Reserved" }, -+ [SMCA_EX] = { "Execution Unit" }, -+ [SMCA_FP] = { "Floating Point Unit" }, -+ [SMCA_L3_CACHE] = { "L3 Cache" }, -+ [SMCA_CS] = { "Coherent Slave" }, -+ [SMCA_PIE] = { "Power, Interrupts, etc." }, -+ [SMCA_UMC] = { "Unified Memory Controller" }, -+ [SMCA_PB] = { "Parameter Block" }, -+ [SMCA_PSP] = { "Platform Security Processor" }, -+ [SMCA_SMU] = { "System Management Unit" }, -+}; -+ -+static void amd_decode_errcode(struct mce_event *e) -+{ -+ -+ decode_amd_errcode(e); -+ -+ if (e->status & MCI_STATUS_POISON) -+ mce_snprintf(e->mcistatus_msg, "Poison consumed"); -+ -+ if (e->status & MCI_STATUS_TCC) -+ mce_snprintf(e->mcistatus_msg, "Task_context_corrupt"); -+ -+} -+/* -+ * To find the UMC channel represented by this bank we need to match on its -+ * instance_id. The instance_id of a bank is held in the lower 32 bits of its -+ * IPID. -+ */ -+static int find_umc_channel(struct mce_event *e) -+{ -+ uint32_t umc_instance_id[] = {0x50f00, 0x150f00}; -+ uint32_t instance_id = EXTRACT(e->ipid, 0, 31); -+ int i, channel = -1; -+ -+ for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++) -+ if (umc_instance_id[i] == instance_id) -+ channel = i; -+ -+ return channel; -+} -+/* Decode extended errors according to Scalable MCA specification */ -+static void decode_smca_error(struct mce_event *e) -+{ -+ enum smca_bank_types bank_type; -+ const char *ip_name; -+ unsigned short xec = (e->status >> 16) & 0x3f; -+ const struct smca_hwid *s_hwid; -+ uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63); -+ unsigned int csrow = -1, channel = -1; -+ unsigned int i; -+ -+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { -+ s_hwid = &smca_hwid_mcatypes[i]; -+ if (mcatype_hwid == s_hwid->mcatype_hwid) { -+ bank_type = s_hwid->bank_type; -+ break; -+ } -+ } -+ -+ if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) { -+ strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID"); -+ return; -+ } -+ -+ if (bank_type >= N_SMCA_BANK_TYPES) { -+ strcpy(e->mcastatus_msg, "Don't know how to decode this bank"); -+ return; -+ } -+ -+ if (bank_type == SMCA_RESERVED) { -+ strcpy(e->mcastatus_msg, "Bank 4 is reserved.\n"); -+ return; -+ } -+ -+ ip_name = smca_names[bank_type].name; -+ -+ mce_snprintf(e->bank_name, "%s (bank=%d)", ip_name, e->bank); -+ -+ /* Only print the descriptor of valid extended error code */ -+ if (xec < smca_mce_descs[bank_type].num_descs) -+ mce_snprintf(e->mcastatus_msg, -+ " %s.\n", smca_mce_descs[bank_type].descs[xec]); -+ -+ if (bank_type == SMCA_UMC && xec == 0) { -+ channel = find_umc_channel(e); -+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */ -+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", -+ channel, csrow); -+ } -+} -+ -+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) -+{ -+ uint64_t mcgstatus = e->mcgstatus; -+ -+ mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld", -+ (long long)e->mcgstatus); -+ -+ if (mcgstatus & MCG_STATUS_RIPV) -+ mce_snprintf(e->mcgstatus_msg, "RIPV"); -+ if (mcgstatus & MCG_STATUS_EIPV) -+ mce_snprintf(e->mcgstatus_msg, "EIPV"); -+ if (mcgstatus & MCG_STATUS_MCIP) -+ mce_snprintf(e->mcgstatus_msg, "MCIP"); -+ -+ decode_smca_error(e); -+ amd_decode_errcode(e); -+ return 0; -+} ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/mce-amd.c 2019-07-12 11:35:04.836470461 -0400 -@@ -0,0 +1,122 @@ -+/* -+ * Copyright (c) 2018, The AMD, Inc. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 and -+ * only version 2 as published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#include -+#include -+ -+#include "ras-mce-handler.h" -+ -+/* Error Code Types */ -+#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) -+#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) -+#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) -+#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400) -+ -+/* Error code: transaction type (TT) */ -+static char *transaction[] = { -+ "instruction", "data", "generic", "reserved" -+}; -+/* Error codes: cache level (LL) */ -+static char *cachelevel[] = { -+ "reserved", "L1", "L2", "L3/generic" -+}; -+/* Error codes: memory transaction type (RRRR) */ -+static char *memtrans[] = { -+ "generic", "generic read", "generic write", "data read", -+ "data write", "instruction fetch", "prefetch", "evict", "snoop", -+ "?", "?", "?", "?", "?", "?", "?" -+}; -+/* Participation Processor */ -+static char *partproc[] = { -+ "local node origin", "local node response", -+ "local node observed", "generic participation" -+}; -+/* Timeout */ -+static char *timeout[] = { -+ "request didn't time out", -+ "request timed out" -+}; -+/* internal unclassified error code */ -+static char *internal[] = { "reserved", -+ "reserved", -+ "hardware assert", -+ "reserved" }; -+ -+#define TT(x) (((x) >> 2) & 0x3) /*bit 2, bit 3*/ -+#define TT_MSG(x) transaction[TT(x)] -+#define LL(x) ((x) & 0x3) /*bit 0, bit 1*/ -+#define LL_MSG(x) cachelevel[LL(x)] -+ -+#define R4(x) (((x) >> 4) & 0xF) /*bit 4, bit 5, bit 6, bit 7 */ -+#define R4_MSG(x) ((R4(x) < 9) ? memtrans[R4(x)] : "Wrong R4!") -+ -+#define TO(x) (((x) >> 8) & 0x1) /*bit 8*/ -+#define TO_MSG(x) timeout[TO(x)] -+#define PP(x) (((x) >> 9) & 0x3) /*bit 9, bit 10*/ -+#define PP_MSG(x) partproc[PP(x)] -+ -+#define UU(x) (((x) >> 8) & 0x3) /*bit 8, bit 9*/ -+#define UU_MSG(x) internal[UU(x)] -+ -+void decode_amd_errcode(struct mce_event *e) -+{ -+ uint16_t ec = e->status & 0xffff; -+ uint16_t ecc = (e->status >> 45) & 0x3; -+ -+ if (e->status & MCI_STATUS_UC) { -+ if (e->status & MCI_STATUS_PCC) -+ strcpy(e->error_msg, "System Fatal error."); -+ if (e->mcgstatus & MCG_STATUS_RIPV) -+ strcpy(e->error_msg, -+ "Uncorrected, software restartable error."); -+ strcpy(e->error_msg, -+ "Uncorrected, software containable error."); -+ } else if (e->status & MCI_STATUS_DEFERRED) -+ strcpy(e->error_msg, "Deferred error, no action required."); -+ else -+ strcpy(e->error_msg, "Corrected error, no action required."); -+ -+ if (!(e->status & MCI_STATUS_VAL)) -+ mce_snprintf(e->mcistatus_msg, "MCE_INVALID"); -+ -+ if (e->status & MCI_STATUS_OVER) -+ mce_snprintf(e->mcistatus_msg, "Error_overflow"); -+ -+ if (e->status & MCI_STATUS_PCC) -+ mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt"); -+ -+ if (ecc) -+ mce_snprintf(e->mcistatus_msg, -+ "%sECC", ((ecc == 2) ? "C" : "U")); -+ -+ if (INT_ERROR(ec)) { -+ mce_snprintf(e->mcastatus_msg, "Internal '%s'", UU_MSG(ec)); -+ return; -+ } -+ -+ if (TLB_ERROR(ec)) -+ mce_snprintf(e->mcastatus_msg, -+ "TLB Error 'tx: %s, level: %s'", -+ TT_MSG(ec), LL_MSG(ec)); -+ else if (MEM_ERROR(ec)) -+ mce_snprintf(e->mcastatus_msg, -+ "Memory Error 'mem-tx: %s, tx: %s, level: %s'", -+ R4_MSG(ec), TT_MSG(ec), LL_MSG(ec)); -+ else if (BUS_ERROR(ec)) -+ mce_snprintf(e->mcastatus_msg, -+ "Bus Error '%s, %s, mem-tx: %s, level: %s'", -+ PP_MSG(ec), TO_MSG(ec), -+ R4_MSG(ec), LL_MSG(ec)); -+ return; -+ -+} ---- rasdaemon-0.6.1.orig/ras-mce-handler.c 2019-07-12 11:35:01.585502811 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.c 2019-07-12 11:35:04.836470461 -0400 -@@ -55,6 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series - [CPU_KNIGHTS_LANDING] = "Knights Landing", - [CPU_KNIGHTS_MILL] = "Knights Mill", - [CPU_SKYLAKE_XEON] = "Skylake server", -+ [CPU_NAPLES] = "AMD Family 17h Zen1" - }; - - static enum cputype select_intel_cputype(struct ras_events *ras) -@@ -190,9 +191,12 @@ ret = 0; - if (!strcmp(mce->vendor, "AuthenticAMD")) { - if (mce->family == 15) - mce->cputype = CPU_K8; -- if (mce->family > 15) { -+ if (mce->family == 23) -+ mce->cputype = CPU_NAPLES; -+ if (mce->family > 23) { - log(ALL, LOG_INFO, -- "Can't parse MCE for this AMD CPU yet\n"); -+ "Can't parse MCE for this AMD CPU yet %d\n", -+ mce->family); - ret = EINVAL; - } - goto ret; -@@ -331,6 +335,12 @@ #if 0 - if (e->status & MCI_STATUS_ADDRV) - trace_seq_printf(s, ", addr= %llx", (long long)e->addr); - -+ if (e->status & MCI_STATUS_SYNDV) -+ trace_seq_printf(s, ", synd= %llx", (long long)e->synd); -+ -+ if (e->ipid) -+ trace_seq_printf(s, ", ipid= %llx", (long long)e->ipid); -+ - if (e->mcgstatus_msg) - trace_seq_printf(s, ", %s", e->mcgstatus_msg); - else -@@ -411,6 +421,13 @@ if (pevent_get_field_val(s, event, "bank - if (pevent_get_field_val(s, event, "cpuvendor", record, &val, 1) < 0) - return -1; - e.cpuvendor = val; -+ /* Get New entries */ -+ if (pevent_get_field_val(s, event, "synd", record, &val, 1) < 0) -+ return -1; -+ e.synd = val; -+ if (pevent_get_field_val(s, event, "ipid", record, &val, 1) < 0) -+ return -1; -+ e.ipid = val; - - switch (mce->cputype) { - case CPU_GENERIC: -@@ -418,6 +435,9 @@ if (pevent_get_field_val(s, event, "cpuv - case CPU_K8: - rc = parse_amd_k8_event(ras, &e); - break; -+ case CPU_NAPLES: -+ rc = parse_amd_smca_event(ras, &e); -+ break; - default: /* All other CPU types are Intel */ - rc = parse_intel_event(ras, &e); - } ---- rasdaemon-0.6.1.orig/ras-mce-handler.h 2019-07-12 11:35:01.585502811 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.h 2019-07-12 11:35:04.836470461 -0400 -@@ -50,6 +50,7 @@ enum cputype { - CPU_KNIGHTS_LANDING, - CPU_KNIGHTS_MILL, - CPU_SKYLAKE_XEON, -+ CPU_NAPLES, - }; - - struct mce_event { -@@ -69,6 +70,8 @@ struct mce_event { - uint8_t cs; - uint8_t bank; - uint8_t cpuvendor; -+ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ -+ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ - - /* Parsed data */ - char timestamp[64]; -@@ -129,6 +132,9 @@ void broadwell_de_decode_model(struct ra - void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e); - void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e); - -+/* AMD error code decode function */ -+void decode_amd_errcode(struct mce_event *e); -+ - /* Software defined banks */ - #define MCE_EXTENDED_BANK 128 - -@@ -144,6 +150,13 @@ #define MCI_STATUS_EN (1ULL<<60) /* - #define MCI_STATUS_S (1ULL<<56) /* signalled */ - #define MCI_STATUS_AR (1ULL<<55) /* action-required */ - -+/* AMD-specific bits */ -+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ -+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */ -+/* uncorrected error,deferred exception */ -+#define MCI_STATUS_DEFERRED (1ULL<<44) -+#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ -+ - #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ - #define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */ - #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ -@@ -154,4 +167,6 @@ int parse_intel_event(struct ras_events - - int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e); - -+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e); -+ - #endif ---- rasdaemon-0.6.1.orig/Makefile.in 2018-04-25 06:29:05.000000000 -0400 -+++ rasdaemon-0.6.1/Makefile.in 2019-07-15 14:41:22.308278851 -0400 -@@ -100,7 +100,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT) - @WITH_MCE_TRUE@ mce-intel-dunnington.c mce-intel-tulsa.c \ - @WITH_MCE_TRUE@ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ - @WITH_MCE_TRUE@ mce-intel-knl.c mce-intel-broadwell-de.c \ --@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c -+@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c mce-amd.c mce-amd-smca.c - - @WITH_EXTLOG_TRUE@am__append_6 = ras-extlog-handler.c - @WITH_ABRT_REPORT_TRUE@am__append_7 = ras-report.c -@@ -132,7 +132,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c - mce-intel-ivb.c mce-intel-haswell.c mce-intel-knl.c \ - mce-intel-broadwell-de.c mce-intel-broadwell-epex.c \ - mce-intel-skylake-xeon.c ras-extlog-handler.c ras-report.c \ -- non-standard-hisi_hip07.c -+ non-standard-hisi_hip07.c mce-amd-smca.c mce-amd.c - @WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT) - @WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT) - @WITH_NON_STANDARD_TRUE@am__objects_3 = \ -@@ -149,7 +149,9 @@ non-standard-hisi_hip07.c - @WITH_MCE_TRUE@ mce-intel-knl.$(OBJEXT) \ - @WITH_MCE_TRUE@ mce-intel-broadwell-de.$(OBJEXT) \ - @WITH_MCE_TRUE@ mce-intel-broadwell-epex.$(OBJEXT) \ --@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) -+@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) \ -+@WITH_MCE_TRUE@ mce-amd-smca.$(OBJEXT) \ -+@WITH_MCE_TRUE@ mce-amd.$(OBJEXT) - @WITH_EXTLOG_TRUE@am__objects_6 = ras-extlog-handler.$(OBJEXT) - @WITH_ABRT_REPORT_TRUE@am__objects_7 = ras-report.$(OBJEXT) - @WITH_HISI_NS_DECODE_TRUE@am__objects_8 = \ -@@ -595,6 +597,8 @@ distclean-compile: - - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitfield.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-k8.Po@am__quote@ -+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd.Po@am__quote@ -+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-scma.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-de.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-epex.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-dunnington.Po@am__quote@ diff --git a/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch b/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch deleted file mode 100644 index 38657d4..0000000 --- a/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch +++ /dev/null @@ -1,138 +0,0 @@ -commit a8c776ed94f68ae31d7b5f74e19545698898c13c -Author: Mauro Carvalho Chehab -Date: Tue Aug 14 13:06:27 2018 -0300 - - mce-intel-*: fix a warning when using FIELD(, NULL) - - Internally, FIELD() macro checks the size of an array, by - using ARRAY_SIZE. Well, this macro causes a division by zero - if NULL is used, as its type is void, as warned: - - mce-intel-dunnington.c:30:2: note: in expansion of macro ‘FIELD’ - FIELD(17, NULL), - ^~~~~ - ras-mce-handler.h:28:33: warning: division ‘sizeof (void *) / sizeof (void)’ does not compute the number of array elements [-Wsizeof-pointer-div] - #define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) - ^ - bitfield.h:37:51: note: in expansion of macro ‘ARRAY_SIZE’ - #define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) } - ^~~~~~~~~~ - - While this warning is harmless, it may prevent seeing more serios - warnings. So, add a FIELD_NULL() macro to avoid that. - - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/bitfield.h b/bitfield.h -index c7dfeb1..fccbb36 100644 ---- a/bitfield.h -+++ b/bitfield.h -@@ -35,6 +35,7 @@ struct numfield { - }; - - #define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) } -+#define FIELD_NULL(start_bit) { start_bit, NULL, 0 } - #define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 } - - #define NUMBER(start, end, name) { start, end, name, "%Lu", 0 } -diff --git a/mce-intel-dunnington.c b/mce-intel-dunnington.c -index 4b1c7e3..c695c62 100644 ---- a/mce-intel-dunnington.c -+++ b/mce-intel-dunnington.c -@@ -27,14 +27,14 @@ - - static struct field dunnington_bus_status[] = { - SBITFIELD(16, "Parity error detected during FSB request phase"), -- FIELD(17, NULL), -+ FIELD_NULL(17), - SBITFIELD(20, "Hard Failure response received for a local transaction"), - SBITFIELD(21, "Parity error on FSB response field detected"), - SBITFIELD(22, "Parity data error on inbound data detected"), -- FIELD(23, NULL), -- FIELD(25, NULL), -- FIELD(28, NULL), -- FIELD(31, NULL), -+ FIELD_NULL(23), -+ FIELD_NULL(25), -+ FIELD_NULL(28), -+ FIELD_NULL(31), - {} - }; - -diff --git a/mce-intel-p4-p6.c b/mce-intel-p4-p6.c -index 4615e1a..5c6c3ff 100644 ---- a/mce-intel-p4-p6.c -+++ b/mce-intel-p4-p6.c -@@ -60,7 +60,7 @@ static char *bus_queue_error_type[] = { - }; - - static struct field p6_shared_status[] = { -- FIELD(16, NULL), -+ FIELD_NULL(16), - FIELD(19, bus_queue_req_type), - FIELD(25, bus_queue_error_type), - FIELD(25, bus_queue_error_type), -@@ -68,7 +68,7 @@ static struct field p6_shared_status[] = { - SBITFIELD(36, "received parity error on response transaction"), - SBITFIELD(38, "timeout BINIT (ROB timeout)." - " No micro-instruction retired for some time"), -- FIELD(39, NULL), -+ FIELD_NULL(39), - SBITFIELD(42, "bus transaction received hard error response"), - SBITFIELD(43, "failure that caused IERR"), - /* The following are reserved for Core in the SDM. Let's keep them here anyways*/ -@@ -76,15 +76,15 @@ static struct field p6_shared_status[] = { - SBITFIELD(45, "uncorrectable ECC error"), - SBITFIELD(46, "correctable ECC error"), - /* [47..54]: ECC syndrome */ -- FIELD(55, NULL), -+ FIELD_NULL(55), - {}, - }; - - static struct field p6old_status[] = { - SBITFIELD(28, "FRC error"), - SBITFIELD(29, "BERR on this CPU"), -- FIELD(31, NULL), -- FIELD(32, NULL), -+ FIELD_NULL(31), -+ FIELD_NULL(32), - SBITFIELD(35, "BINIT received from external bus"), - SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"), - {} -@@ -94,9 +94,9 @@ static struct field core2_status[] = { - SBITFIELD(28, "MCE driven"), - SBITFIELD(29, "MCE is observed"), - SBITFIELD(31, "BINIT observed"), -- FIELD(32, NULL), -+ FIELD_NULL(32), - SBITFIELD(34, "PIC or FSB data parity error"), -- FIELD(35, NULL), -+ FIELD_NULL(35), - SBITFIELD(37, "FSB address parity error detected"), - {} - }; -diff --git a/mce-intel-tulsa.c b/mce-intel-tulsa.c -index 6cea421..e59bf06 100644 ---- a/mce-intel-tulsa.c -+++ b/mce-intel-tulsa.c -@@ -39,7 +39,7 @@ static struct field tls_bus_status[] = { - SBITFIELD(16, "Parity error detected during FSB request phase"), - SBITFIELD(17, "Partity error detected on Core 0 request's address field"), - SBITFIELD(18, "Partity error detected on Core 1 request's address field"), -- FIELD(19, NULL), -+ FIELD_NULL(19), - SBITFIELD(20, "Parity error on FSB response field detected"), - SBITFIELD(21, "FSB data parity error on inbound date detected"), - SBITFIELD(22, "Data parity error on data received from Core 0 detected"), -@@ -48,8 +48,8 @@ static struct field tls_bus_status[] = { - SBITFIELD(25, "Data ECC event to error on inbound data correctable or uncorrectable"), - SBITFIELD(26, "Pad logic detected a data strobe glitch or sequencing error"), - SBITFIELD(27, "Pad logic detected a request strobe glitch or sequencing error"), -- FIELD(28, NULL), -- FIELD(31, NULL), -+ FIELD_NULL(28), -+ FIELD_NULL(31), - {} - }; - diff --git a/b22be68453b2497e86cbd273b9cd56fadc5859e3.patch b/b22be68453b2497e86cbd273b9cd56fadc5859e3.patch deleted file mode 100644 index 4b3b8ae..0000000 --- a/b22be68453b2497e86cbd273b9cd56fadc5859e3.patch +++ /dev/null @@ -1,37 +0,0 @@ -commit b22be68453b2497e86cbd273b9cd56fadc5859e3 -Author: Ying Lv -Date: Wed May 15 11:15:42 2019 +0800 - - fix rasdaemon high CPU usage when part of CPUs offline - - When we set part of CPU core offline, such as by setting the kernel cmdline - maxcpus = N(N is less than the total number of system CPU cores). - And then, we will observe that the CPU usage of some rasdaemon threads - is very close to 100. - - This is because when part of CPU offline, poll in read_ras_event_all_cpus func - will fallback to pthread way. - Offlined CPU thread will return negative value when read trace_pipe_raw, - negative return value will covert to positive value because of 'unsigned size'. - So code will always go into 'size > 0' branch, and the CPU usage is too high. - - Here, variable size uses int type will go to the right branch. - - Fiexs: eff7c9e0("ras-events: Only use pthreads for collect if poll() not available") - Reported-by: Zhipeng Xie - Signed-off-by: Ying Lv - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/ras-events.c b/ras-events.c -index 4e7b815..38ebe1e 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -426,7 +426,7 @@ static int read_ras_event(int fd, - struct kbuffer *kbuf, - void *page) - { -- unsigned size; -+ int size; - unsigned long long time_stamp; - void *data; - diff --git a/b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch b/b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch new file mode 100644 index 0000000..30cc19e --- /dev/null +++ b/b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch @@ -0,0 +1,30 @@ +commit b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4 +Author: DmNosachev +Date: Tue Jun 29 13:48:55 2021 +0300 + + labels/supermicro: added Supermicro X10DRI(-T) + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 47ea05f..86e4617 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -81,4 +81,14 @@ Vendor: Supermicro + P2-DIMMC1: 2.2.0; + P2-DIMMD1: 3.0.0; P2-DIMMD2: 3.0.1; + P2-DIMME1: 3.1.0; +- P2-DIMMF1: 3.2.0; +\ No newline at end of file ++ P2-DIMMF1: 3.2.0; ++ ++ Model: X10DRI, X10DRI-T ++ P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1; ++ P1-DIMMB1: 0.1.0; P1-DIMMB2: 0.1.1; ++ P1-DIMMC1: 0.2.0; P1-DIMMC2: 0.2.1; ++ P1-DIMMD1: 0.3.0; P1-DIMMD2: 0.3.1; ++ P2-DIMME1: 1.0.0; P2-DIMME2: 1.0.1; ++ P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1; ++ P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1; ++ P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1; +\ No newline at end of file diff --git a/b497a3d6a39d402c41065e9284d49114b97e3bfe.patch b/b497a3d6a39d402c41065e9284d49114b97e3bfe.patch deleted file mode 100644 index cbecbdc..0000000 --- a/b497a3d6a39d402c41065e9284d49114b97e3bfe.patch +++ /dev/null @@ -1,148 +0,0 @@ -commit b497a3d6a39d402c41065e9284d49114b97e3bfe -Author: Shiju Jose -Date: Mon Mar 8 16:57:28 2021 +0000 - - rasdaemon: ras-mc-ctl: Add memory failure events - - Add supporting memory failure errors (memory_failure_event) - to the ras-mc-ctl tool. - - Sample Log, - ras-mc-ctl --summary - ... - Memory failure events summary: - Delayed errors: 4 - Failed errors: 1 - ... - - ras-mc-ctl --errors - ... - Memory failure events: - 1 2020-10-28 23:20:41 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed - 2 2020-10-28 23:31:38 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed - 3 2020-10-28 23:54:54 -0800 error: pfn=0x205000000, page_type=free buddy page, action_result=Delayed - 4 2020-10-29 00:12:25 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed - 5 2020-10-29 00:26:36 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Failed - - Signed-off-by: Shiju Jose - Signed-off-by: Mauro Carvalho Chehab - ---- - configure.ac | 11 +++++++++++ - util/ras-mc-ctl.in | 46 +++++++++++++++++++++++++++++++++++++++++++--- - 2 files changed, 54 insertions(+), 3 deletions(-) - ---- a/util/ras-mc-ctl.in 2021-10-13 13:51:00.887292563 -0400 -+++ b/util/ras-mc-ctl.in 2021-10-13 13:51:27.536061894 -0400 -@@ -44,11 +44,13 @@ my $modprobe = find_prog ("modprobe") - my $has_aer = 0; - my $has_arm = 0; - my $has_extlog = 0; -+my $has_mem_failure = 0; - my $has_mce = 0; - - @WITH_AER_TRUE@$has_aer = 1; - @WITH_ARM_TRUE@$has_arm = 1; - @WITH_EXTLOG_TRUE@$has_extlog = 1; -+@WITH_MEMORY_FAILURE_TRUE@$has_mem_failure = 1; - @WITH_MCE_TRUE@$has_mce = 1; - - my %conf = (); -@@ -1132,7 +1134,7 @@ sub summary - { - require DBI; - my ($query, $query_handle, $out); -- my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg); -+ my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result); - my ($etype, $severity, $etype_string, $severity_string); - my ($affinity, $mpidr); - -@@ -1203,9 +1205,27 @@ sub summary - $out .= "\t$count $etype_string $severity_string errors\n"; - } - if ($out ne "") { -- print "Extlog records summary:\n$out"; -+ print "Extlog records summary:\n$out\n"; - } else { -- print "No Extlog errors.\n"; -+ print "No Extlog errors.\n\n"; -+ } -+ $query_handle->finish; -+ } -+ -+ # Memory failure errors -+ if ($has_mem_failure == 1) { -+ $query = "select action_result, count(*) from memory_failure_event group by action_result"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($action_result, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$action_result errors: $count\n"; -+ } -+ if ($out ne "") { -+ print "Memory failure events summary:\n$out\n"; -+ } else { -+ print "No Memory failure errors.\n\n"; - } - $query_handle->finish; - } -@@ -1238,6 +1258,7 @@ sub errors - my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location); - my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); - my ($error_count, $affinity, $mpidr, $r_state, $psci_state); -+ my ($pfn, $page_type, $action_result); - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -@@ -1329,6 +1350,25 @@ $out .= sprintf "address=0x%08x, ", $add - } - $query_handle->finish; - } -+ -+ # Memory failure errors -+ if ($has_mem_failure == 1) { -+ $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "pfn=$pfn, page_type=$page_type, action_result=$action_result\n"; -+ } -+ if ($out ne "") { -+ print "Memory failure events:\n$out\n"; -+ } else { -+ print "No Memory failure errors.\n\n"; -+ } -+ $query_handle->finish; -+ } - - # MCE mce_record errors - if ($has_mce == 1) { ---- a/configure.ac 2018-04-25 06:28:51.000000000 -0400 -+++ b/configure.ac 2021-10-13 13:51:00.916292312 -0400 -@@ -80,6 +80,16 @@ AS_IF([test "x$enable_extlog" = "xyes"], - ]) - AM_CONDITIONAL([WITH_EXTLOG], [test x$enable_extlog = xyes]) - -+AC_ARG_ENABLE([memory_failure], -+ AS_HELP_STRING([--enable-memory-failure], [enable memory failure events (currently experimental)])) -+ -+AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes"], [ -+ AC_DEFINE(HAVE_MEMORY_FAILURE,1,"have memory failure events collect") -+ AC_SUBST([WITH_MEMORY_FAILURE]) -+]) -+AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all == xyes]) -+AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"]) -+ - AC_ARG_ENABLE([abrt_report], - AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)])) - -@@ -127,4 +137,5 @@ compile time options summary - ABRT report : $enable_abrt_report - HIP07 SAS HW errors : $enable_hisi_ns_decode - ARM events : $enable_arm -+ Memory Failure : $USE_MEMORY_FAILURE - EOF diff --git a/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch b/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch deleted file mode 100644 index 36c019d..0000000 --- a/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch +++ /dev/null @@ -1,94 +0,0 @@ -commit cc2ce5c65ed5a42eaa97aa3659854add6d808da5 -Author: Muralidhara M K -Date: Mon Jan 13 19:12:06 2020 +0530 - - rasdaemon: Add error decoding for new SMCA Load Store bank type - - Future Scalable Machine Check Architecture (SMCA) systems will have a - new Load Store bank type. - - Add the new type's (HWID, McaType) ID and error decoding. - - Signed-off-by: Muralidhara M K - [ Adjust commit message. ] - Signed-off-by: Yazen Ghannam - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 114e786..d0b6cb6 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -38,9 +38,16 @@ - * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0] - */ - -+/* MCA_STATUS REGISTER FOR FAMILY 19H -+ * The bits 24 ~ 29 contains AddressLsb -+ * 29: ADDRLS[5], 28: ADDRLS[4], 27: ADDRLS[3], -+ * 26: ADDRLS[2], 25: ADDRLS[1], 24: ADDRLS[0] -+ */ -+ - /* These may be used by multiple smca_hwid_mcatypes */ - enum smca_bank_types { - SMCA_LS = 0, /* Load Store */ -+ SMCA_LS_V2, /* Load Store */ - SMCA_IF, /* Instruction Fetch */ - SMCA_L2_CACHE, /* L2 Cache */ - SMCA_DE, /* Decoder Unit */ -@@ -88,6 +95,32 @@ static const char * const smca_ls_mce_desc[] = { - "DC tag error type 5", - "L2 fill data error", - }; -+static const char * const smca_ls2_mce_desc[] = { -+ "An ECC error was detected on a data cache read by a probe or victimization", -+ "An ECC error or L2 poison was detected on a data cache read by a load", -+ "An ECC error was detected on a data cache read-modify-write by a store", -+ "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization", -+ "An ECC error or poison bit mismatch was detected on a tag read by a load", -+ "An ECC error or poison bit mismatch was detected on a tag read by a store", -+ "An ECC error was detected on an EMEM read by a load", -+ "An ECC error was detected on an EMEM read-modify-write by a store", -+ "A parity error was detected in an L1 TLB entry by any access", -+ "A parity error was detected in an L2 TLB entry by any access", -+ "A parity error was detected in a PWC entry by any access", -+ "A parity error was detected in an STQ entry by any access", -+ "A parity error was detected in an LDQ entry by any access", -+ "A parity error was detected in a MAB entry by any access", -+ "A parity error was detected in an SCB entry state field by any access", -+ "A parity error was detected in an SCB entry address field by any access", -+ "A parity error was detected in an SCB entry data field by any access", -+ "A parity error was detected in a WCB entry by any access", -+ "A poisoned line was detected in an SCB entry by any access", -+ "A SystemReadDataError error was reported on read data returned from L2 for a load", -+ "A SystemReadDataError error was reported on read data returned from L2 for an SCB store", -+ "A SystemReadDataError error was reported on read data returned from L2 for a WCB store", -+ "A hardware assertion error was reported", -+ "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access", -+}; - /* Instruction Fetch */ - static const char * const smca_if_mce_desc[] = { - "microtag probe port parity error", -@@ -289,6 +322,7 @@ struct smca_mce_desc { - - static struct smca_mce_desc smca_mce_descs[] = { - [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, -+ [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) }, - [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, - [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, - [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, -@@ -319,6 +353,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, 0x000000B0 }, -+ { SMCA_LS_V2, 0x001000B0 }, - { SMCA_IF, 0x000100B0 }, - { SMCA_L2_CACHE, 0x000200B0 }, - { SMCA_DE, 0x000300B0 }, -@@ -362,6 +397,7 @@ struct smca_bank_name { - - static struct smca_bank_name smca_names[] = { - [SMCA_LS] = { "Load Store Unit" }, -+ [SMCA_LS_V2] = { "Load Store Unit" }, - [SMCA_IF] = { "Instruction Fetch Unit" }, - [SMCA_L2_CACHE] = { "L2 Cache" }, - [SMCA_DE] = { "Decode Unit" }, diff --git a/ce6e7864f11f709c4f803828fbc8e507d115d03b.patch b/ce6e7864f11f709c4f803828fbc8e507d115d03b.patch deleted file mode 100644 index e10c156..0000000 --- a/ce6e7864f11f709c4f803828fbc8e507d115d03b.patch +++ /dev/null @@ -1,611 +0,0 @@ -commit ce6e7864f11f709c4f803828fbc8e507d115d03b -Author: Greg Edwards -Date: Thu Apr 8 15:03:30 2021 -0600 - - rasdaemon: Add Ice Lake and Sapphire Rapids MSCOD values - - Based on mcelog commits: - - ee90ff20ce6a ("mcelog: Add support for Icelake server, Icelake-D, and Snow Ridge") - 391abaac9bdf ("mcelog: Add decode for MCi_MISC from 10nm memory controller") - 59cb7ad4bc72 ("mcelog: i10nm: Fix mapping from bank number to functional unit") - c0acd0e6a639 ("mcelog: Add support for Sapphirerapids server.") - - Signed-off-by: Greg Edwards - Signed-off-by: Mauro Carvalho Chehab - ---- - Makefile.am | 3 - mce-intel-i10nm.c | 509 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ - mce-intel.c | 5 - ras-mce-handler.c | 12 + - ras-mce-handler.h | 5 - 5 files changed, 533 insertions(+), 1 deletion(-) - ---- rasdaemon-0.6.1.orig/Makefile.am 2021-09-17 15:29:45.977790658 -0400 -+++ rasdaemon-0.6.1/Makefile.am 2021-09-17 15:29:57.439698580 -0400 -@@ -36,7 +36,8 @@ if WITH_MCE - mce-intel-dunnington.c mce-intel-tulsa.c \ - mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ - mce-intel-knl.c mce-intel-broadwell-de.c \ -- mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c -+ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c \ -+ mce-amd.c mce-amd-smca.c mce-intel-i10nm.c - endif - if WITH_EXTLOG - rasdaemon_SOURCES += ras-extlog-handler.c ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/mce-intel-i10nm.c 2021-09-17 15:29:45.977790658 -0400 -@@ -0,0 +1,509 @@ -+/* -+ * The code below came from Tony Luck's mcelog code, -+ * released under GNU Public General License, v.2 -+ * -+ * Copyright (C) 2019 Intel Corporation -+ * Decode Intel 10nm specific machine check errors. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+*/ -+ -+#include -+#include -+#include -+ -+#include "ras-mce-handler.h" -+#include "bitfield.h" -+ -+static char *pcu_1[] = { -+ [0x0D] = "MCA_LLC_BIST_ACTIVE_TIMEOUT", -+ [0x0E] = "MCA_DMI_TRAINING_TIMEOUT", -+ [0x0F] = "MCA_DMI_STRAP_SET_ARRIVAL_TIMEOUT", -+ [0x10] = "MCA_DMI_CPU_RESET_ACK_TIMEOUT", -+ [0x11] = "MCA_MORE_THAN_ONE_LT_AGENT", -+ [0x14] = "MCA_INCOMPATIBLE_PCH_TYPE", -+ [0x1E] = "MCA_BIOS_RST_CPL_INVALID_SEQ", -+ [0x1F] = "MCA_BIOS_INVALID_PKG_STATE_CONFIG", -+ [0x2D] = "MCA_PCU_PMAX_CALIB_ERROR", -+ [0x2E] = "MCA_TSC100_SYNC_TIMEOUT", -+ [0x3A] = "MCA_GPSB_TIMEOUT", -+ [0x3B] = "MCA_PMSB_TIMEOUT", -+ [0x3E] = "MCA_IOSFSB_PMREQ_CMP_TIMEOUT", -+ [0x40] = "MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE", -+ [0x42] = "MCA_SVID_VCCIN_VR_VOUT_FAILURE", -+ [0x43] = "MCA_SVID_CPU_VR_CAPABILITY_ERROR", -+ [0x44] = "MCA_SVID_CRITICAL_VR_FAILED", -+ [0x45] = "MCA_SVID_SA_ITD_ERROR", -+ [0x46] = "MCA_SVID_READ_REG_FAILED", -+ [0x47] = "MCA_SVID_WRITE_REG_FAILED", -+ [0x4A] = "MCA_SVID_PKGC_REQUEST_FAILED", -+ [0x4B] = "MCA_SVID_IMON_REQUEST_FAILED", -+ [0x4C] = "MCA_SVID_ALERT_REQUEST_FAILED", -+ [0x4D] = "MCA_SVID_MCP_VR_RAMP_ERROR", -+ [0x56] = "MCA_FIVR_PD_HARDERR", -+ [0x58] = "MCA_WATCHDOG_TIMEOUT_PKGC_SLAVE", -+ [0x59] = "MCA_WATCHDOG_TIMEOUT_PKGC_MASTER", -+ [0x5A] = "MCA_WATCHDOG_TIMEOUT_PKGS_MASTER", -+ [0x5B] = "MCA_WATCHDOG_TIMEOUT_MSG_CH_FSM", -+ [0x5C] = "MCA_WATCHDOG_TIMEOUT_BULK_CR_FSM", -+ [0x5D] = "MCA_WATCHDOG_TIMEOUT_IOSFSB_FSM", -+ [0x60] = "MCA_PKGS_SAFE_WP_TIMEOUT", -+ [0x61] = "MCA_PKGS_CPD_UNCPD_TIMEOUT", -+ [0x62] = "MCA_PKGS_INVALID_REQ_PCH", -+ [0x63] = "MCA_PKGS_INVALID_REQ_INTERNAL", -+ [0x64] = "MCA_PKGS_INVALID_RSP_INTERNAL", -+ [0x65 ... 0x7A] = "MCA_PKGS_RESET_PREP_TIMEOUT", -+ [0x7B] = "MCA_PKGS_SMBUS_VPP_PAUSE_TIMEOUT", -+ [0x7C] = "MCA_PKGS_SMBUS_MCP_PAUSE_TIMEOUT", -+ [0x7D] = "MCA_PKGS_SMBUS_SPD_PAUSE_TIMEOUT", -+ [0x80] = "MCA_PKGC_DISP_BUSY_TIMEOUT", -+ [0x81] = "MCA_PKGC_INVALID_RSP_PCH", -+ [0x83] = "MCA_PKGC_WATCHDOG_HANG_CBZ_DOWN", -+ [0x84] = "MCA_PKGC_WATCHDOG_HANG_CBZ_UP", -+ [0x87] = "MCA_PKGC_WATCHDOG_HANG_C2_BLKMASTER", -+ [0x88] = "MCA_PKGC_WATCHDOG_HANG_C2_PSLIMIT", -+ [0x89] = "MCA_PKGC_WATCHDOG_HANG_SETDISP", -+ [0x8B] = "MCA_PKGC_ALLOW_L1_ERROR", -+ [0x90] = "MCA_RECOVERABLE_DIE_THERMAL_TOO_HOT", -+ [0xA0] = "MCA_ADR_SIGNAL_TIMEOUT", -+ [0xA1] = "MCA_BCLK_FREQ_OC_ABOVE_THRESHOLD", -+ [0xB0] = "MCA_DISPATCHER_RUN_BUSY_TIMEOUT", -+}; -+ -+static char *pcu_2[] = { -+ [0x04] = "Clock/power IP response timeout", -+ [0x05] = "SMBus controller raised SMI", -+ [0x09] = "PM controller received invalid transaction", -+}; -+ -+static char *pcu_3[] = { -+ [0x01] = "Instruction address out of valid space", -+ [0x02] = "Double bit RAM error on Instruction Fetch", -+ [0x03] = "Invalid OpCode seen", -+ [0x04] = "Stack Underflow", -+ [0x05] = "Stack Overflow", -+ [0x06] = "Data address out of valid space", -+ [0x07] = "Double bit RAM error on Data Fetch", -+}; -+ -+static struct field pcu1[] = { -+ FIELD(0, pcu_1), -+ {} -+}; -+ -+static struct field pcu2[] = { -+ FIELD(0, pcu_2), -+ {} -+}; -+ -+static struct field pcu3[] = { -+ FIELD(0, pcu_3), -+ {} -+}; -+ -+static struct field upi1[] = { -+ SBITFIELD(22, "Phy Control Error"), -+ SBITFIELD(23, "Unexpected Retry.Ack flit"), -+ SBITFIELD(24, "Unexpected Retry.Req flit"), -+ SBITFIELD(25, "RF parity error"), -+ SBITFIELD(26, "Routeback Table error"), -+ SBITFIELD(27, "Unexpected Tx Protocol flit (EOP, Header or Data)"), -+ SBITFIELD(28, "Rx Header-or-Credit BGF credit overflow/underflow"), -+ SBITFIELD(29, "Link Layer Reset still in progress when Phy enters L0"), -+ SBITFIELD(30, "Link Layer reset initiated while protocol traffic not idle"), -+ SBITFIELD(31, "Link Layer Tx Parity Error"), -+ {} -+}; -+ -+static char *upi_2[] = { -+ [0x00] = "Phy Initialization Failure (NumInit)", -+ [0x01] = "Phy Detected Drift Buffer Alarm", -+ [0x02] = "Phy Detected Latency Buffer Rollover", -+ [0x10] = "LL Rx detected CRC error: unsuccessful LLR (entered Abort state)", -+ [0x11] = "LL Rx Unsupported/Undefined packet", -+ [0x12] = "LL or Phy Control Error", -+ [0x13] = "LL Rx Parameter Exception", -+ [0x1F] = "LL Detected Control Error", -+ [0x20] = "Phy Initialization Abort", -+ [0x21] = "Phy Inband Reset", -+ [0x22] = "Phy Lane failure, recovery in x8 width", -+ [0x23] = "Phy L0c error corrected without Phy reset", -+ [0x24] = "Phy L0c error triggering Phy reset", -+ [0x25] = "Phy L0p exit error corrected with reset", -+ [0x30] = "LL Rx detected CRC error: successful LLR without Phy Reinit", -+ [0x31] = "LL Rx detected CRC error: successful LLR with Phy Reinit", -+ [0x32] = "Tx received LLR", -+}; -+ -+static struct field upi2[] = { -+ FIELD(0, upi_2), -+ {} -+}; -+ -+static struct field m2m[] = { -+ SBITFIELD(16, "MC read data error"), -+ SBITFIELD(17, "Reserved"), -+ SBITFIELD(18, "MC partial write data error"), -+ SBITFIELD(19, "Full write data error"), -+ SBITFIELD(20, "M2M clock-domain-crossing buffer (BGF) error"), -+ SBITFIELD(21, "M2M time out"), -+ SBITFIELD(22, "M2M tracker parity error"), -+ SBITFIELD(23, "fatal Bucket1 error"), -+ {} -+}; -+ -+static char *imc_0[] = { -+ [0x01] = "Address parity error", -+ [0x02] = "Data parity error", -+ [0x03] = "Data ECC error", -+ [0x04] = "Data byte enable parity error", -+ [0x07] = "Transaction ID parity error", -+ [0x08] = "Corrected patrol scrub error", -+ [0x10] = "Uncorrected patrol scrub error", -+ [0x20] = "Corrected spare error", -+ [0x40] = "Uncorrected spare error", -+ [0x80] = "Corrected read error", -+ [0xA0] = "Uncorrected read error", -+ [0xC0] = "Uncorrected metadata", -+}; -+ -+static char *imc_1[] = { -+ [0x00] = "WDB read parity error", -+ [0x03] = "RPA parity error", -+ [0x06] = "DDR_T_DPPP data BE error", -+ [0x07] = "DDR_T_DPPP data error", -+ [0x08] = "DDR link failure", -+ [0x11] = "PCLS CAM error", -+ [0x12] = "PCLS data error", -+}; -+ -+static char *imc_2[] = { -+ [0x00] = "DDR4 command / address parity error", -+ [0x20] = "HBM command / address parity error", -+ [0x21] = "HBM data parity error", -+}; -+ -+static char *imc_4[] = { -+ [0x00] = "RPQ parity (primary) error", -+}; -+ -+static char *imc_8[] = { -+ [0x00] = "DDR-T bad request", -+ [0x01] = "DDR Data response to an invalid entry", -+ [0x02] = "DDR data response to an entry not expecting data", -+ [0x03] = "DDR4 completion to an invalid entry", -+ [0x04] = "DDR-T completion to an invalid entry", -+ [0x05] = "DDR data/completion FIFO overflow", -+ [0x06] = "DDR-T ERID correctable parity error", -+ [0x07] = "DDR-T ERID uncorrectable error", -+ [0x08] = "DDR-T interrupt received while outstanding interrupt was not ACKed", -+ [0x09] = "ERID FI FO overflow", -+ [0x0A] = "DDR-T error on FNV write credits", -+ [0x0B] = "DDR-T error on FNV read credits", -+ [0x0C] = "DDR-T scheduler error", -+ [0x0D] = "DDR-T FNV error event", -+ [0x0E] = "DDR-T FNV thermal event", -+ [0x0F] = "CMI packet while idle", -+ [0x10] = "DDR_T_RPQ_REQ_PARITY_ERR", -+ [0x11] = "DDR_T_WPQ_REQ_PARITY_ERR", -+ [0x12] = "2LM_NMFILLWR_CAM_ERR", -+ [0x13] = "CMI_CREDIT_OVERSUB_ERR", -+ [0x14] = "CMI_CREDIT_TOTAL_ERR", -+ [0x15] = "CMI_CREDIT_RSVD_POOL_ERR", -+ [0x16] = "DDR_T_RD_ERROR", -+ [0x17] = "WDB_FIFO_ERR", -+ [0x18] = "CMI_REQ_FIFO_OVERFLOW", -+ [0x19] = "CMI_REQ_FIFO_UNDERFLOW", -+ [0x1A] = "CMI_RSP_FIFO_OVERFLOW", -+ [0x1B] = "CMI_RSP_FIFO_UNDERFLOW", -+ [0x1C] = "CMI _MISC_MC_CRDT_ERRORS", -+ [0x1D] = "CMI_MISC_MC_ARB_ERRORS", -+ [0x1E] = "DDR_T_WR_CMPL_FI FO_OVERFLOW", -+ [0x1F] = "DDR_T_WR_CMPL_FI FO_UNDERFLOW", -+ [0x20] = "CMI_RD_CPL_FIFO_OVERFLOW", -+ [0x21] = "CMI_RD_CPL_FIFO_UNDERFLOW", -+ [0x22] = "TME_KEY_PAR_ERR", -+ [0x23] = "TME_CMI_MISC_ERR", -+ [0x24] = "TME_CMI_OVFL_ERR", -+ [0x25] = "TME_CMI_UFL_ERR", -+ [0x26] = "TME_TEM_SECURE_ERR", -+ [0x27] = "TME_UFILL_PAR_ERR", -+ [0x29] = "INTERNAL_ERR", -+ [0x2A] = "TME_INTEGRITY_ERR", -+ [0x2B] = "TME_TDX_ERR", -+ [0x2C] = "TME_UFILL_TEM_SECURE_ERR", -+ [0x2D] = "TME_KEY_POISON_ERR", -+ [0x2E] = "TME_SECURITY_ENGINE_ERR", -+}; -+ -+static char *imc_10[] = { -+ [0x08] = "CORR_PATSCRUB_MIRR2ND_ERR", -+ [0x10] = "UC_PATSCRUB_MIRR2ND_ERR", -+ [0x20] = "COR_SPARE_MIRR2ND_ERR", -+ [0x40] = "UC_SPARE_MIRR2ND_ERR", -+ [0x80] = "HA_RD_MIRR2ND_ERR", -+ [0xA0] = "HA_UNCORR_RD_MIRR2ND_ERR", -+}; -+ -+static struct field imc0[] = { -+ FIELD(0, imc_0), -+ {} -+}; -+ -+static struct field imc1[] = { -+ FIELD(0, imc_1), -+ {} -+}; -+ -+static struct field imc2[] = { -+ FIELD(0, imc_2), -+ {} -+}; -+ -+static struct field imc4[] = { -+ FIELD(0, imc_4), -+ {} -+}; -+ -+static struct field imc8[] = { -+ FIELD(0, imc_8), -+ {} -+}; -+ -+static struct field imc10[] = { -+ FIELD(0, imc_10), -+ {} -+}; -+ -+static void i10nm_imc_misc(struct mce_event *e) -+{ -+ uint32_t column = EXTRACT(e->misc, 9, 18) << 2; -+ uint32_t row = EXTRACT(e->misc, 19, 39); -+ uint32_t bank = EXTRACT(e->misc, 42, 43); -+ uint32_t bankgroup = EXTRACT(e->misc, 40, 41) | (EXTRACT(e->misc, 44, 44) << 2); -+ uint32_t fdevice = EXTRACT(e->misc, 46, 51); -+ uint32_t subrank = EXTRACT(e->misc, 52, 55); -+ uint32_t rank = EXTRACT(e->misc, 56, 58); -+ uint32_t eccmode = EXTRACT(e->misc, 59, 62); -+ uint32_t transient = EXTRACT(e->misc, 63, 63); -+ -+ mce_snprintf(e->error_msg, "bank: 0x%x bankgroup: 0x%x row: 0x%x column: 0x%x", bank, bankgroup, row, column); -+ if (!transient && !EXTRACT(e->status, 61, 61)) -+ mce_snprintf(e->error_msg, "failed device: 0x%x", fdevice); -+ mce_snprintf(e->error_msg, "rank: 0x%x subrank: 0x%x", rank, subrank); -+ mce_snprintf(e->error_msg, "ecc mode: "); -+ switch (eccmode) { -+ case 0: mce_snprintf(e->error_msg, "SDDC memory mode"); break; -+ case 1: mce_snprintf(e->error_msg, "SDDC"); break; -+ case 4: mce_snprintf(e->error_msg, "ADDDC memory mode"); break; -+ case 5: mce_snprintf(e->error_msg, "ADDDC"); break; -+ case 8: mce_snprintf(e->error_msg, "DDRT read"); break; -+ default: mce_snprintf(e->error_msg, "unknown"); break; -+ } -+ if (transient) -+ mce_snprintf(e->error_msg, "transient"); -+} -+ -+enum banktype { -+ BT_UNKNOWN, -+ BT_PCU, -+ BT_UPI, -+ BT_M2M, -+ BT_IMC, -+}; -+ -+static enum banktype icelake[32] = { -+ [4] = BT_PCU, -+ [5] = BT_UPI, -+ [7 ... 8] = BT_UPI, -+ [12] = BT_M2M, -+ [16] = BT_M2M, -+ [20] = BT_M2M, -+ [24] = BT_M2M, -+ [13 ... 15] = BT_IMC, -+ [17 ... 19] = BT_IMC, -+ [21 ... 23] = BT_IMC, -+ [25 ... 27] = BT_IMC, -+}; -+ -+static enum banktype icelake_de[32] = { -+ [4] = BT_PCU, -+ [12] = BT_M2M, -+ [16] = BT_M2M, -+ [13 ... 15] = BT_IMC, -+ [17 ... 19] = BT_IMC, -+}; -+ -+static enum banktype tremont[32] = { -+ [4] = BT_PCU, -+ [12] = BT_M2M, -+ [13 ... 15] = BT_IMC, -+}; -+ -+static enum banktype sapphire[32] = { -+ [4] = BT_PCU, -+ [5] = BT_UPI, -+ [12] = BT_M2M, -+ [13 ... 20] = BT_IMC, -+}; -+ -+void i10nm_memerr_misc(struct mce_event *e, int *channel); -+ -+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, -+ struct mce_event *e) -+{ -+ enum banktype banktype; -+ uint64_t f, status = e->status; -+ uint32_t mca = status & 0xffff; -+ int channel = -1; -+ -+ switch (cputype) { -+ case CPU_ICELAKE_XEON: -+ banktype = icelake[e->bank]; -+ break; -+ case CPU_ICELAKE_DE: -+ banktype = icelake_de[e->bank]; -+ break; -+ case CPU_TREMONT_D: -+ banktype = tremont[e->bank]; -+ break; -+ case CPU_SAPPHIRERAPIDS: -+ banktype = sapphire[e->bank]; -+ break; -+ default: -+ return; -+ } -+ -+ switch (banktype) { -+ case BT_UNKNOWN: -+ break; -+ -+ case BT_PCU: -+ mce_snprintf(e->error_msg, "PCU: "); -+ f = EXTRACT(status, 24, 31); -+ if (f) -+ decode_bitfield(e, f, pcu1); -+ f = EXTRACT(status, 20, 23); -+ if (f) -+ decode_bitfield(e, f, pcu2); -+ f = EXTRACT(status, 16, 19); -+ if (f) -+ decode_bitfield(e, f, pcu3); -+ break; -+ -+ case BT_UPI: -+ mce_snprintf(e->error_msg, "UPI: "); -+ f = EXTRACT(status, 22, 31); -+ if (f) -+ decode_bitfield(e, status, upi1); -+ f = EXTRACT(status, 16, 21); -+ decode_bitfield(e, f, upi2); -+ break; -+ -+ case BT_M2M: -+ mce_snprintf(e->error_msg, "M2M: "); -+ f = EXTRACT(status, 24, 25); -+ mce_snprintf(e->error_msg, "MscodDDRType=0x%" PRIx64, f); -+ f = EXTRACT(status, 26, 31); -+ mce_snprintf(e->error_msg, "MscodMiscErrs=0x%" PRIx64, f); -+ decode_bitfield(e, status, m2m); -+ break; -+ -+ case BT_IMC: -+ mce_snprintf(e->error_msg, "MemCtrl: "); -+ f = EXTRACT(status, 16, 23); -+ switch (EXTRACT(status, 24, 31)) { -+ case 0: decode_bitfield(e, f, imc0); break; -+ case 1: decode_bitfield(e, f, imc1); break; -+ case 2: decode_bitfield(e, f, imc2); break; -+ case 4: decode_bitfield(e, f, imc4); break; -+ case 8: decode_bitfield(e, f, imc8); break; -+ case 0x10: decode_bitfield(e, f, imc10); break; -+ } -+ i10nm_imc_misc(e); -+ break; -+ } -+ -+ /* -+ * Memory error specific code. Returns if the error is not a MC one -+ */ -+ -+ /* Check if the error is at the memory controller */ -+ if ((mca >> 7) != 1) -+ return; -+ -+ /* Ignore unless this is an corrected extended error from an iMC bank */ -+ if (banktype != BT_IMC || (status & MCI_STATUS_UC)) -+ return; -+ -+ /* -+ * Parse the reported channel -+ */ -+ -+ i10nm_memerr_misc(e, &channel); -+ if (channel == -1) -+ return; -+ mce_snprintf(e->mc_location, "memory_channel=%d", channel); -+} -+ -+/* -+ * There isn't enough information to identify the DIMM. But -+ * we can derive the channel from the bank number. -+ * There can be four memory controllers with two channels each. -+ */ -+void i10nm_memerr_misc(struct mce_event *e, int *channel) -+{ -+ uint64_t status = e->status; -+ unsigned int chan, imc; -+ -+ /* Check this is a memory error */ -+ if (!test_prefix(7, status & 0xefff)) -+ return; -+ -+ chan = EXTRACT(status, 0, 3); -+ if (chan == 0xf) -+ return; -+ -+ switch (e->bank) { -+ case 12: /* M2M 0 */ -+ case 13: /* IMC 0, Channel 0 */ -+ case 14: /* IMC 0, Channel 1 */ -+ case 15: /* IMC 0, Channel 2 */ -+ imc = 0; -+ break; -+ case 16: /* M2M 1 */ -+ case 17: /* IMC 1, Channel 0 */ -+ case 18: /* IMC 1, Channel 1 */ -+ case 19: /* IMC 1, Channel 2 */ -+ imc = 1; -+ break; -+ case 20: /* M2M 2 */ -+ case 21: /* IMC 2, Channel 0 */ -+ case 22: /* IMC 2, Channel 1 */ -+ case 23: /* IMC 2, Channel 2 */ -+ imc = 2; -+ break; -+ case 24: /* M2M 3 */ -+ case 25: /* IMC 3, Channel 0 */ -+ case 26: /* IMC 3, Channel 1 */ -+ case 27: /* IMC 3, Channel 2 */ -+ imc = 3; -+ break; -+ default: -+ return; -+ } -+ -+ channel[0] = imc * 3 + chan; -+} ---- rasdaemon-0.6.1.orig/mce-intel.c 2021-09-17 15:29:39.189845188 -0400 -+++ rasdaemon-0.6.1/mce-intel.c 2021-09-17 15:29:45.977790658 -0400 -@@ -411,6 +411,11 @@ if (test_prefix(11, (e->status & 0xffffL - case CPU_SKYLAKE_XEON: - skylake_s_decode_model(ras, e); - break; -+ case CPU_ICELAKE_XEON: -+ case CPU_ICELAKE_DE: -+ case CPU_TREMONT_D: -+ case CPU_SAPPHIRERAPIDS: -+ i10nm_decode_model(mce->cputype, ras, e); - default: - break; - } ---- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-09-17 15:29:39.189845188 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-09-17 15:29:45.977790658 -0400 -@@ -56,6 +56,10 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series - [CPU_KNIGHTS_MILL] = "Knights Mill", - [CPU_SKYLAKE_XEON] = "Skylake server", - [CPU_AMD_SMCA] = "AMD Scalable MCA", -+ [CPU_ICELAKE_XEON] = "Icelake server", -+ [CPU_ICELAKE_DE] = "Icelake server D Family", -+ [CPU_TREMONT_D] = "Tremont microserver", -+ [CPU_SAPPHIRERAPIDS] = "Sapphirerapids server", - }; - - static enum cputype select_intel_cputype(struct ras_events *ras) -@@ -107,6 +111,14 @@ else if (mce->model == 0x85) - return CPU_KNIGHTS_MILL; - else if (mce->model == 0x55) - return CPU_SKYLAKE_XEON; -+ else if (mce->model == 0x6a) -+ return CPU_ICELAKE_XEON; -+ else if (mce->model == 0x6c) -+ return CPU_ICELAKE_DE; -+ else if (mce->model == 0x86) -+ return CPU_TREMONT_D; -+ else if (mce->model == 0x8f) -+ return CPU_SAPPHIRERAPIDS; - - if (mce->model > 0x1a) { - log(ALL, LOG_INFO, ---- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-09-17 15:29:39.189845188 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-09-17 15:29:45.977790658 -0400 -@@ -51,6 +51,10 @@ enum cputype { - CPU_KNIGHTS_MILL, - CPU_SKYLAKE_XEON, - CPU_AMD_SMCA, -+ CPU_ICELAKE_XEON, -+ CPU_ICELAKE_DE, -+ CPU_TREMONT_D, -+ CPU_SAPPHIRERAPIDS, - }; - - struct mce_event { -@@ -131,6 +135,7 @@ void tulsa_decode_model(struct mce_event - void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e); - void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e); - void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e); -+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, struct mce_event *e); - - /* AMD error code decode function */ - void decode_amd_errcode(struct mce_event *e); diff --git a/d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch b/d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch new file mode 100644 index 0000000..d28ce9c --- /dev/null +++ b/d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch @@ -0,0 +1,42 @@ +commit d0e0bb3d73c4bc5060da20270a089857bba2a64c +Author: Justin Vreeland +Date: Tue Nov 2 19:51:50 2021 -0700 + + Update ras-mc-ctl manpage to match current options + + Signed-off-by: Justin Vreeland + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/man/ras-mc-ctl.8.in b/man/ras-mc-ctl.8.in +index 26230e0..a605122 100644 +--- a/man/ras-mc-ctl.8.in ++++ b/man/ras-mc-ctl.8.in +@@ -79,9 +79,27 @@ Specify an alternate location for the labels database. + Specify a delay of \fBtime\fR seconds before registering DIMM labels. + Only meaninful if used together with --register-labels. + .TP +-.BI "--layout ++.BI "--layout" + Prints the memory layout as detected by the EDAC driver. Useful to check + if the EDAC driver is properly detecting the memory controller architecture. ++.TP ++.BI "--summary" ++Presents a summary of the logged errors. ++.TP ++.BI "--errors" ++Shows the errors stored at the error database. ++.TP ++.BI "--error-count" ++Shows the corrected and uncorrected error counts using sysfs. ++.TP ++.BI "--vendor-errors-summary="platform-id ++Pressents a summary of the vendor-specific logged errors. ++.TP ++.BI "--vendor-errors="platform-id ++Shows the vendor-specific errors stored in the error database. ++.TP ++.BI "--vendor-platforms" ++Shows the supported platforms with platform-ids for the vendor-specific errors. + + .SH MAINBOARD CONFIGURATION + .PP diff --git a/dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch b/dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch new file mode 100644 index 0000000..b9eec5a --- /dev/null +++ b/dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch @@ -0,0 +1,27 @@ +commit dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b +Author: Mauro Carvalho Chehab +Date: Wed May 26 12:55:54 2021 +0200 + + Add support for multi-arch builds + + Allow building rasdaemon on several architectures: + - x86_64 + - arm 64 + - ppc 64 LE + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml +index 747a844..898687c 100644 +--- a/.github/workflows/ci.yml ++++ b/.github/workflows/ci.yml +@@ -9,6 +9,9 @@ jobs: + Ubuntu: + name: Ubuntu + runs-on: ubuntu-latest ++ strategy: ++ matrix: ++ arch: [x64_64, aarch64, ppc64le] + steps: + - uses: actions/checkout@v2 + - name: prepare diff --git a/dist b/dist new file mode 100644 index 0000000..89c1faf --- /dev/null +++ b/dist @@ -0,0 +1 @@ +an9 diff --git a/download b/download index 6feb854..9d0b9df 100644 --- a/download +++ b/download @@ -1 +1 @@ -dc388ad15889efe295184277ad7c2860 rasdaemon-0.6.1.tar.bz2 +8404c50ab6ba72f41e9c948b8ac3c2cb rasdaemon-0.6.7.tar.bz2 diff --git a/ec443ec0add059fa897f844349e1a2345d81713c.patch b/ec443ec0add059fa897f844349e1a2345d81713c.patch new file mode 100644 index 0000000..cf778c1 --- /dev/null +++ b/ec443ec0add059fa897f844349e1a2345d81713c.patch @@ -0,0 +1,31 @@ +commit ec443ec0add059fa897f844349e1a2345d81713c +Author: DmNosachev +Date: Tue Jun 29 11:33:10 2021 +0300 + + labels/supermicro: added x11dph-i labels + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 3fd6fee..bfaed93 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -68,3 +68,17 @@ Vendor: Supermicro + P1_DIMM4B: 1.1.1; + P2_DIMM4B: 2.0.1; + P2_DIMM4B: 2.1.1; ++ ++ Model: X11DPH-i ++ P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1; ++ P1-DIMMB1: 0.1.0; ++ P1-DIMMC1: 0.2.0; ++ P1-DIMMD1: 1.0.0; P1-DIMMD2: 1.0.1; ++ P1-DIMME1: 1.1.0; ++ P1-DIMMF1: 1.2.0; ++ P2-DIMMA1: 2.0.0; P2-DIMMA2: 2.0.1; ++ P2-DIMMB1: 2.1.0; ++ P2-DIMMC1: 2.2.0; ++ P2-DIMMD1: 3.0.0; P2-DIMMD2: 3.0.1; ++ P2-DIMME1: 3.1.0; ++ P2-DIMMF1: 3.2.0; +\ No newline at end of file diff --git a/f7cdd720297cd17e405a7170c04df89d1d9536f8.patch b/f7cdd720297cd17e405a7170c04df89d1d9536f8.patch new file mode 100644 index 0000000..c2732e8 --- /dev/null +++ b/f7cdd720297cd17e405a7170c04df89d1d9536f8.patch @@ -0,0 +1,48 @@ +commit f7cdd720297cd17e405a7170c04df89d1d9536f8 +Author: Mauro Carvalho Chehab +Date: Wed May 26 12:35:55 2021 +0200 + + Add a github workflow for CI automation + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml +new file mode 100644 +index 0000000..5b3e757 +--- /dev/null ++++ b/.github/workflows/ci.yml +@@ -0,0 +1,34 @@ ++name: CI ++ ++# Should run only on branches and PR, as "on_tag.yml" will handle tags ++on: ++ push: ++ branches: master test ++ pull_request: ++ branches: master ++ ++jobs: ++ ++# ++# Linux ++# ++ Ubuntu: ++ name: Ubuntu ++ runs-on: ubuntu-20.04 ++ strategy: ++ matrix: ++ arch: [x64_64, aarch64, armv7, ppc64le] ++ steps: ++ - uses: actions/checkout@v2 ++ with: ++ arch: ${{ matrix.arch }} ++ - name: prepare ++ run: | ++ sudo apt-get update ++ sudo apt-get install -y build-essential sqlite3 ++ - name: build ++ run: | ++ autoreconf -vfi ++ ./configure --enable-all ++ make ++ sudo make install diff --git a/fc1dd37d422fc907416afd028514fff59b63ae12.patch b/fc1dd37d422fc907416afd028514fff59b63ae12.patch new file mode 100644 index 0000000..460d2c1 --- /dev/null +++ b/fc1dd37d422fc907416afd028514fff59b63ae12.patch @@ -0,0 +1,30 @@ +commit fc1dd37d422fc907416afd028514fff59b63ae12 +Author: DmNosachev +Date: Wed Jun 30 16:49:18 2021 +0300 + + labels/supermicro: added Supermicro B1DRi + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/labels/supermicro b/labels/supermicro +index 373de07..b924a32 100644 +--- a/labels/supermicro ++++ b/labels/supermicro +@@ -105,4 +105,14 @@ Vendor: Supermicro + P2-DIMMC1: 2.2.0; + P2-DIMMD1: 3.0.0; + P2-DIMME1: 3.1.0; +- P2-DIMMF1: 3.2.0; +\ No newline at end of file ++ P2-DIMMF1: 3.2.0; ++ ++ Model: B1DRi ++ P1_DIMMA1: 0.0.0; ++ P1_DIMMB1: 0.1.0; ++ P1_DIMMC1: 0.2.0; ++ P1_DIMMD1: 0.3.0; ++ P2_DIMME1: 1.0.0; ++ P2_DIMMF1: 1.1.0; ++ P2_DIMMG1: 1.2.0; ++ P2_DIMMH1: 1.3.0; +\ No newline at end of file diff --git a/fcdffdcb28ece67ed78e3575a3dce45d9dd4f015.patch b/fcdffdcb28ece67ed78e3575a3dce45d9dd4f015.patch new file mode 100644 index 0000000..a549df7 --- /dev/null +++ b/fcdffdcb28ece67ed78e3575a3dce45d9dd4f015.patch @@ -0,0 +1,28 @@ +commit fcdffdcb28ece67ed78e3575a3dce45d9dd4f015 +Author: Mauro Carvalho Chehab +Date: Wed May 26 10:37:52 2021 +0200 + + rasdaemon.spec.in: Fix the description on this example file + + While this is used just to test if building it is OK, better + to keep the logs nice ;-) + + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 6ef223f..afa4359 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -61,10 +61,10 @@ rm INSTALL %{buildroot}/usr/include/*.h + %changelog + + * Wed May 26 2021 Mauro Carvalho Chehab 0.6.7-1 +-- Bump to version 0.6.5 with several fixes and additions ++- Bump to version 0.6.7 with several fixes and additions + + * Tue Jul 21 2020 Mauro Carvalho Chehab 0.6.6-1 +-- Bump to version 0.6.5 with several fixes, new hip08 events and memory prediction analysis ++- Bump to version 0.6.6 with several fixes, new hip08 events and memory prediction analysis + + * Wed Nov 20 2019 Mauro Carvalho Chehab 0.6.5-1 + - Bump to version 0.6.5 with several fixes and improves PCIe events record diff --git a/add_upstream_labels.patch b/labels.patch similarity index 72% rename from add_upstream_labels.patch rename to labels.patch index 70a04df..3eb072e 100644 --- a/add_upstream_labels.patch +++ b/labels.patch @@ -1,9 +1,40 @@ +Add labels directory from upstream + +Labels directory doesn't get exported by tarball releases. + +Signed-off-by: Aristeu Rozanski + --- - labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 152 insertions(+) + labels/asus | 20 +++++++ + labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + labels/supermicro | 70 ++++++++++++++++++++++++ + 3 files changed, 242 insertions(+) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/labels/dell 2020-02-20 11:53:39.574579258 -0500 ++++ rasdaemon-0.6.7/labels/asus 2022-02-08 15:44:53.563362010 -0500 +@@ -0,0 +1,20 @@ ++# RASDAEMON Motherboard DIMM labels Database file. ++# ++# Vendor-name and model-name are found from the program 'dmidecode' ++# labels are found from the silk screen on the motherboard. ++# ++#Vendor: ++# Product: ++# Model: ++#