diff --git a/0021-rasdaemon-add-rbtree-support-for-page-record.patch b/0021-rasdaemon-add-rbtree-support-for-page-record.patch new file mode 100644 index 0000000000000000000000000000000000000000..4a76a899096c15ab6614fa872d76f344b8970caa --- /dev/null +++ b/0021-rasdaemon-add-rbtree-support-for-page-record.patch @@ -0,0 +1,584 @@ +From 27794f4a5ff1453490bbcd805ad8e5b54516f015 Mon Sep 17 00:00:00 2001 +From: wuyun +Date: Sat, 20 Jun 2020 20:26:21 +0800 +Subject: [PATCH] rasdaemon: add rbtree support for page record + +commit 5fd96f457262052f7d06435af8a49689ffb6ffcf upstream + +The rbtree is very efficient for recording and querying fault page info. + +Signed-off-by: wuyun +Signed-off-by: lvying6 +Signed-off-by: Mauro Carvalho Chehab +Signed-off-by: Bixuan Cui +--- + rbtree.c | 384 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + rbtree.h | 165 +++++++++++++++++++++++++++ + 2 files changed, 549 insertions(+) + create mode 100644 rbtree.c + create mode 100644 rbtree.h + +diff --git a/rbtree.c b/rbtree.c +new file mode 100644 +index 0000000..d9b1bd4 +--- /dev/null ++++ b/rbtree.c +@@ -0,0 +1,384 @@ ++/* ++ Red Black Trees ++ (C) 1999 Andrea Arcangeli ++ (C) 2002 David Woodhouse ++ Taken from the Linux 2.6.30 source with some minor modificatons. ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. 
++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ linux/lib/rbtree.c ++*/ ++ ++#include "rbtree.h" ++ ++static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) ++{ ++ struct rb_node *right = node->rb_right; ++ struct rb_node *parent = rb_parent(node); ++ ++ if ((node->rb_right = right->rb_left)) ++ rb_set_parent(right->rb_left, node); ++ right->rb_left = node; ++ ++ rb_set_parent(right, parent); ++ ++ if (parent) ++ { ++ if (node == parent->rb_left) ++ parent->rb_left = right; ++ else ++ parent->rb_right = right; ++ } ++ else ++ root->rb_node = right; ++ rb_set_parent(node, right); ++} ++ ++static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) ++{ ++ struct rb_node *left = node->rb_left; ++ struct rb_node *parent = rb_parent(node); ++ ++ if ((node->rb_left = left->rb_right)) ++ rb_set_parent(left->rb_right, node); ++ left->rb_right = node; ++ ++ rb_set_parent(left, parent); ++ ++ if (parent) ++ { ++ if (node == parent->rb_right) ++ parent->rb_right = left; ++ else ++ parent->rb_left = left; ++ } ++ else ++ root->rb_node = left; ++ rb_set_parent(node, left); ++} ++ ++void rb_insert_color(struct rb_node *node, struct rb_root *root) ++{ ++ struct rb_node *parent, *gparent; ++ ++ while ((parent = rb_parent(node)) && rb_is_red(parent)) ++ { ++ gparent = rb_parent(parent); ++ ++ if (parent == gparent->rb_left) ++ { ++ { ++ register struct rb_node *uncle = gparent->rb_right; ++ if (uncle && rb_is_red(uncle)) ++ { ++ rb_set_black(uncle); ++ rb_set_black(parent); ++ rb_set_red(gparent); ++ node = gparent; ++ continue; ++ } ++ } ++ ++ if (parent->rb_right == node) ++ { ++ struct rb_node *tmp; ++ __rb_rotate_left(parent, root); ++ tmp = parent; ++ parent = node; ++ node = tmp; ++ } ++ ++ rb_set_black(parent); ++ rb_set_red(gparent); ++ __rb_rotate_right(gparent, root); ++ } 
else { ++ { ++ struct rb_node *uncle = gparent->rb_left; ++ if (uncle && rb_is_red(uncle)) ++ { ++ rb_set_black(uncle); ++ rb_set_black(parent); ++ rb_set_red(gparent); ++ node = gparent; ++ continue; ++ } ++ } ++ ++ if (parent->rb_left == node) ++ { ++ struct rb_node *tmp; ++ __rb_rotate_right(parent, root); ++ tmp = parent; ++ parent = node; ++ node = tmp; ++ } ++ ++ rb_set_black(parent); ++ rb_set_red(gparent); ++ __rb_rotate_left(gparent, root); ++ } ++ } ++ ++ rb_set_black(root->rb_node); ++} ++ ++static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, ++ struct rb_root *root) ++{ ++ struct rb_node *other; ++ ++ while ((!node || rb_is_black(node)) && node != root->rb_node) ++ { ++ if (parent->rb_left == node) ++ { ++ other = parent->rb_right; ++ if (rb_is_red(other)) ++ { ++ rb_set_black(other); ++ rb_set_red(parent); ++ __rb_rotate_left(parent, root); ++ other = parent->rb_right; ++ } ++ if ((!other->rb_left || rb_is_black(other->rb_left)) && ++ (!other->rb_right || rb_is_black(other->rb_right))) ++ { ++ rb_set_red(other); ++ node = parent; ++ parent = rb_parent(node); ++ } ++ else ++ { ++ if (!other->rb_right || rb_is_black(other->rb_right)) ++ { ++ rb_set_black(other->rb_left); ++ rb_set_red(other); ++ __rb_rotate_right(other, root); ++ other = parent->rb_right; ++ } ++ rb_set_color(other, rb_color(parent)); ++ rb_set_black(parent); ++ rb_set_black(other->rb_right); ++ __rb_rotate_left(parent, root); ++ node = root->rb_node; ++ break; ++ } ++ } ++ else ++ { ++ other = parent->rb_left; ++ if (rb_is_red(other)) ++ { ++ rb_set_black(other); ++ rb_set_red(parent); ++ __rb_rotate_right(parent, root); ++ other = parent->rb_left; ++ } ++ if ((!other->rb_left || rb_is_black(other->rb_left)) && ++ (!other->rb_right || rb_is_black(other->rb_right))) ++ { ++ rb_set_red(other); ++ node = parent; ++ parent = rb_parent(node); ++ } ++ else ++ { ++ if (!other->rb_left || rb_is_black(other->rb_left)) ++ { ++ rb_set_black(other->rb_right); ++ 
rb_set_red(other); ++ __rb_rotate_left(other, root); ++ other = parent->rb_left; ++ } ++ rb_set_color(other, rb_color(parent)); ++ rb_set_black(parent); ++ rb_set_black(other->rb_left); ++ __rb_rotate_right(parent, root); ++ node = root->rb_node; ++ break; ++ } ++ } ++ } ++ if (node) ++ rb_set_black(node); ++} ++ ++void rb_erase(struct rb_node *node, struct rb_root *root) ++{ ++ struct rb_node *child, *parent; ++ int color; ++ ++ if (!node->rb_left) ++ child = node->rb_right; ++ else if (!node->rb_right) ++ child = node->rb_left; ++ else ++ { ++ struct rb_node *old = node, *left; ++ ++ node = node->rb_right; ++ while ((left = node->rb_left) != NULL) ++ node = left; ++ child = node->rb_right; ++ parent = rb_parent(node); ++ color = rb_color(node); ++ ++ if (child) ++ rb_set_parent(child, parent); ++ if (parent == old) { ++ parent->rb_right = child; ++ parent = node; ++ } else ++ parent->rb_left = child; ++ ++ node->rb_parent_color = old->rb_parent_color; ++ node->rb_right = old->rb_right; ++ node->rb_left = old->rb_left; ++ ++ if (rb_parent(old)) ++ { ++ if (rb_parent(old)->rb_left == old) ++ rb_parent(old)->rb_left = node; ++ else ++ rb_parent(old)->rb_right = node; ++ } else ++ root->rb_node = node; ++ ++ rb_set_parent(old->rb_left, node); ++ if (old->rb_right) ++ rb_set_parent(old->rb_right, node); ++ goto color; ++ } ++ ++ parent = rb_parent(node); ++ color = rb_color(node); ++ ++ if (child) ++ rb_set_parent(child, parent); ++ if (parent) ++ { ++ if (parent->rb_left == node) ++ parent->rb_left = child; ++ else ++ parent->rb_right = child; ++ } ++ else ++ root->rb_node = child; ++ ++ color: ++ if (color == RB_BLACK) ++ __rb_erase_color(child, parent, root); ++} ++ ++/* ++ * This function returns the first node (in sort order) of the tree. 
++ */ ++struct rb_node *rb_first(const struct rb_root *root) ++{ ++ struct rb_node *n; ++ ++ n = root->rb_node; ++ if (!n) ++ return NULL; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++} ++ ++struct rb_node *rb_last(const struct rb_root *root) ++{ ++ struct rb_node *n; ++ ++ n = root->rb_node; ++ if (!n) ++ return NULL; ++ while (n->rb_right) ++ n = n->rb_right; ++ return n; ++} ++ ++struct rb_node *rb_next(const struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++ if (rb_parent(node) == node) ++ return NULL; ++ ++ /* If we have a right-hand child, go down and then left as far ++ as we can. */ ++ if (node->rb_right) { ++ node = node->rb_right; ++ while (node->rb_left) ++ node=node->rb_left; ++ return (struct rb_node *)node; ++ } ++ ++ /* No right-hand children. Everything down and left is ++ smaller than us, so any 'next' node must be in the general ++ direction of our parent. Go up the tree; any time the ++ ancestor is a right-hand child of its parent, keep going ++ up. First time it's a left-hand child of its parent, said ++ parent is our 'next' node. */ ++ while ((parent = rb_parent(node)) && node == parent->rb_right) ++ node = parent; ++ ++ return parent; ++} ++ ++struct rb_node *rb_prev(const struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++ if (rb_parent(node) == node) ++ return NULL; ++ ++ /* If we have a left-hand child, go down and then right as far ++ as we can. */ ++ if (node->rb_left) { ++ node = node->rb_left; ++ while (node->rb_right) ++ node=node->rb_right; ++ return (struct rb_node *)node; ++ } ++ ++ /* No left-hand children. 
Go up till we find an ancestor which ++ is a right-hand child of its parent */ ++ while ((parent = rb_parent(node)) && node == parent->rb_left) ++ node = parent; ++ ++ return parent; ++} ++ ++void rb_replace_node(struct rb_node *victim, struct rb_node *new, ++ struct rb_root *root) ++{ ++ struct rb_node *parent = rb_parent(victim); ++ ++ /* Set the surrounding nodes to point to the replacement */ ++ if (parent) { ++ if (victim == parent->rb_left) ++ parent->rb_left = new; ++ else ++ parent->rb_right = new; ++ } else { ++ root->rb_node = new; ++ } ++ if (victim->rb_left) ++ rb_set_parent(victim->rb_left, new); ++ if (victim->rb_right) ++ rb_set_parent(victim->rb_right, new); ++ ++ /* Copy the pointers/colour from the victim to the replacement */ ++ *new = *victim; ++} +diff --git a/rbtree.h b/rbtree.h +new file mode 100644 +index 0000000..a8a0459 +--- /dev/null ++++ b/rbtree.h +@@ -0,0 +1,165 @@ ++/* ++ Red Black Trees ++ (C) 1999 Andrea Arcangeli ++ Taken from the Linux 2.6.30 source. ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ linux/include/linux/rbtree.h ++ ++ To use rbtrees you'll have to implement your own insert and search cores. ++ This will avoid us to use callbacks and to drop drammatically performances. 
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli
  Taken from the Linux 2.6.30 source.

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  linux/include/linux/rbtree.h

  To use rbtrees you'll have to implement your own insert and search cores.
  This avoids callbacks, which would cost a lot of performance.
  It is not the cleanest way, but it is how C (unlike C++) gets both
  performance and genericity.

  Some examples of insert and search follow here. The search is a plain
  normal search over an ordered tree. The insert instead must be implemented
  in two steps: first the code must insert the element in order as a red
  leaf in the tree, then the support library function rb_insert_color()
  must be called. That function does the non-trivial work of rebalancing
  the rbtree if necessary.

-----------------------------------------------------------------------
static inline struct page * rb_search_page_cache(struct inode * inode,
						 unsigned long offset)
{
	struct rb_node * n = inode->i_rb_page_cache.rb_node;
	struct page * page;

	while (n)
	{
		page = rb_entry(n, struct page, rb_page_cache);

		if (offset < page->offset)
			n = n->rb_left;
		else if (offset > page->offset)
			n = n->rb_right;
		else
			return page;
	}
	return NULL;
}

static inline struct page * __rb_insert_page_cache(struct inode * inode,
						   unsigned long offset,
						   struct rb_node * node)
{
	struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
	struct rb_node * parent = NULL;
	struct page * page;

	while (*p)
	{
		parent = *p;
		page = rb_entry(parent, struct page, rb_page_cache);

		if (offset < page->offset)
			p = &(*p)->rb_left;
		else if (offset > page->offset)
			p = &(*p)->rb_right;
		else
			return page;
	}

	rb_link_node(node, parent, p);

	return NULL;
}

static inline struct page * rb_insert_page_cache(struct inode * inode,
						 unsigned long offset,
						 struct rb_node * node)
{
	struct page * ret;
	if ((ret = __rb_insert_page_cache(inode, offset, node)))
		goto out;
	rb_insert_color(node, &inode->i_rb_page_cache);
 out:
	return ret;
}
-----------------------------------------------------------------------
*/

#ifndef	_LINUX_RBTREE_H
#define	_LINUX_RBTREE_H

#include <stddef.h>	/* offsetof, NULL */

/*
 * Map a pointer to @member back to the enclosing object of @type.
 * __typeof__ (rather than typeof) keeps this usable when the compiler
 * runs in a strict ISO mode such as -std=c11; the statement expression
 * itself is still a GNU extension.
 */
#define container_of(ptr, type, member) ({				\
	const __typeof__(((type *)0)->member) *__mptr = (ptr);		\
	(type *)((char *)__mptr - offsetof(type, member)); })

/*
 * Tree node, meant to be embedded in the user's own structure.
 * rb_parent_color packs the parent pointer together with the node
 * colour, which lives in bit 0 (see rb_parent() and rb_color()).
 */
struct rb_node
{
	unsigned long  rb_parent_color;
#define	RB_RED		0
#define	RB_BLACK	1
	struct rb_node *rb_right;
	struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
    /* The alignment might seem pointless, but allegedly CRIS needs it */

/* Handle of a tree; an empty tree has rb_node == NULL. */
struct rb_root
{
	struct rb_node *rb_node;
};


#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
#define rb_color(r)   ((r)->rb_parent_color & 1)
#define rb_is_red(r)   (!rb_color(r))
#define rb_is_black(r) rb_color(r)
#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)

/* Replace the parent pointer of @rb, keeping its low (colour) bits. */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
	rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
}

/* Set the colour bit of @rb to @color (RB_RED or RB_BLACK), keeping the parent. */
static inline void rb_set_color(struct rb_node *rb, int color)
{
	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
}

#define RB_ROOT	(struct rb_root) { NULL, }
#define	rb_entry(ptr, type, member) container_of(ptr, type, member)

/*
 * A node is marked "empty" by making it its own parent.  The argument is
 * parenthesized so that expressions such as &obj->node expand safely.
 */
#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
#define RB_EMPTY_NODE(node)	(rb_parent(node) == (node))
#define RB_CLEAR_NODE(node)	(rb_set_parent((node), (node)))

extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);

/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
			    struct rb_root *root);

/*
 * Hook @node into the tree as a child of @parent through the link slot
 * @rb_link (the address of the parent's rb_left/rb_right, or of the root
 * pointer).  The node starts red with no children; the caller is expected
 * to call rb_insert_color() afterwards to rebalance.
 */
static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
				struct rb_node ** rb_link)
{
	node->rb_parent_color = (unsigned long )parent;
	node->rb_left = node->rb_right = NULL;

	*rb_link = node;
}

#endif	/* _LINUX_RBTREE_H */
+ +Signed-off-by: wuyun +Signed-off-by: lvying6 +Signed-off-by: Mauro Carvalho Chehab +Signed-off-by: Bixuan Cui +--- + Makefile.am | 7 +- + configure.ac | 12 ++ + man/rasdaemon.1.in | 7 + + misc/rasdaemon.env | 29 ++++ + misc/rasdaemon.service.in | 1 + + misc/rasdaemon.spec.in | 4 +- + ras-events.c | 6 + + ras-mc-handler.c | 7 + + ras-page-isolation.c | 332 ++++++++++++++++++++++++++++++++++++++ + ras-page-isolation.h | 66 ++++++++ + 10 files changed, 468 insertions(+), 3 deletions(-) + create mode 100644 misc/rasdaemon.env + create mode 100644 ras-page-isolation.c + create mode 100644 ras-page-isolation.h + +diff --git a/Makefile.am b/Makefile.am +index fccdeba..dc30ae7 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -1,6 +1,6 @@ + ACLOCAL_AMFLAGS=-I m4 + SUBDIRS = libtrace util man +-SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in ++SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in misc/rasdaemon.env + SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) + EXTRA_DIST = $(SYSTEMD_SERVICES_IN) + +@@ -51,13 +51,16 @@ endif + if WITH_HISI_NS_DECODE + rasdaemon_SOURCES += non-standard-hisi_hip07.c + endif ++if WITH_MEMORY_CE_PFA ++ rasdaemon_SOURCES += rbtree.c ras-page-isolation.c ++endif + + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a + + include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ + ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ +- ras-memory-failure-handler.h ++ ras-memory-failure-handler.h rbtree.h ras-page-isolation.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 8be33d9..1f95459 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -108,6 +108,17 @@ AS_IF([test "x$enable_hisi_ns_decode" = "xyes"], [ + ]) + 
AM_CONDITIONAL([WITH_HISI_NS_DECODE], [test x$enable_hisi_ns_decode = xyes]) + ++AC_ARG_ENABLE([memory_ce_pfa], ++ AS_HELP_STRING([--enable-memory-ce-pfa], [enable memory Corrected Error predictive failure analysis])) ++ ++AS_IF([test "x$enable_memory_ce_pfa" = "xyes"], [ ++ AC_DEFINE(HAVE_MEMORY_CE_PFA,1,"have memory corrected error predictive failure analysis") ++ AC_SUBST([WITH_MEMORY_CE_PFA]) ++]) ++AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes]) ++AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"]) ++ ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -138,4 +149,5 @@ compile time options summary + HIP07 SAS HW errors : $enable_hisi_ns_decode + ARM events : $enable_arm + Memory Failure : $USE_MEMORY_FAILURE ++ Memory CE PFA : $enable_memory_ce_pfa + EOF +diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in +index 834df16..833c8e1 100644 +--- a/man/rasdaemon.1.in ++++ b/man/rasdaemon.1.in +@@ -62,6 +62,13 @@ feature. + .BI "--version" + Print the program version and exit. + ++.SH CONFIG FILE ++ ++The \fBrasdaemon\fR program supports a config file to set rasdaemon systemd service ++environment variables. By default the config file is read from /etc/sysconfig/rasdaemon. ++ ++The general format is environmentname=value. ++ + .SH SEE ALSO + \fBras-mc-ctl\fR(8) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +new file mode 100644 +index 0000000..12fd766 +--- /dev/null ++++ b/misc/rasdaemon.env +@@ -0,0 +1,29 @@ ++# Page Isolation ++# Note: Run-time configuration is unsupported, service restart needed. ++# Note: this file should be installed at /etc/sysconfig/rasdaemon ++ ++# Specify the threshold of isolating buggy pages. ++# ++# Format: ++# [0-9]+[unit] ++# Notice: please make sure match this format, rasdaemon will use default value for exception input cases. 
++# ++# Supported units: ++# PAGE_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default is in hour ++# PAGE_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none ++# ++# The two configs have no effect only when PAGE_CE_ACTION is "off". ++PAGE_CE_REFRESH_CYCLE="24h" ++PAGE_CE_THRESHOLD="50" ++ ++# Specify the internal action in rasdaemon on exceeding a page error threshold. ++# ++# off no action ++# account only account errors ++# soft try to soft-offline page without killing any processes ++# This requires an up-to-date kernel. Might not be successful. ++# hard try to hard-offline page by killing processes ++# Requires an up-to-date kernel. Might not be successful. ++# soft-then-hard First try to soft offline, then try hard offlining. ++# Note: default offline choice is "soft". ++PAGE_CE_ACTION="soft" +diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in +index be9ad5a..e73a08a 100644 +--- a/misc/rasdaemon.service.in ++++ b/misc/rasdaemon.service.in +@@ -3,6 +3,7 @@ Description=RAS daemon to log the RAS events + After=syslog.target + + [Service] ++EnvironmentFile=/etc/sysconfig/rasdaemon + ExecStart=@sbindir@/rasdaemon -f -r + ExecStartPost=@sbindir@/rasdaemon --enable + ExecStop=@sbindir@/rasdaemon --disable +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 82fae30..f5faffe 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -36,12 +36,13 @@ an utility for reporting current error counts from the EDAC sysfs files. 
+ %setup -q + + %build +-%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm ++%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-ce-pfa + + make %{?_smp_mflags} + + %install + make install DESTDIR=%{buildroot} ++install -D -p -m 0644 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} + install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service + install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service + rm INSTALL %{buildroot}/usr/include/*.h +@@ -54,6 +55,7 @@ rm INSTALL %{buildroot}/usr/include/*.h + %{_unitdir}/*.service + %{_sharedstatedir}/rasdaemon + %{_sysconfdir}/ras/dimm_labels.d ++%config(noreplace) %{_sysconfdir}/sysconfig/%{name} + + %changelog + +diff --git a/ras-events.c b/ras-events.c +index 27ac1ab..5113c32 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -36,6 +36,7 @@ + #include "ras-memory-failure-handler.h" + #include "ras-record.h" + #include "ras-logger.h" ++#include "ras-page-isolation.h" + + /* + * Polling time, if read() doesn't block. 
Currently, trace_pipe_raw never +@@ -673,6 +674,11 @@ int handle_ras_events(int record_events) + ras->page_size = page_size; + ras->record_events = record_events; + ++#ifdef HAVE_MEMORY_CE_PFA ++ /* FIXME: enable memory isolation unconditionally */ ++ ras_page_account_init(); ++#endif ++ + rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event", + ras_mc_event_handler); + if (!rc) +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index deb7e05..42b05cd 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -23,6 +23,7 @@ + #include "ras-mc-handler.h" + #include "ras-record.h" + #include "ras-logger.h" ++#include "ras-page-isolation.h" + #include "ras-report.h" + + int ras_mc_event_handler(struct trace_seq *s, +@@ -183,6 +184,12 @@ int ras_mc_event_handler(struct trace_seq *s, + + ras_store_mc_event(ras, &ev); + ++#ifdef HAVE_MEMORY_CE_PFA ++ /* Account page corrected errors */ ++ if (!strcmp(ev.error_type, "Corrected")) ++ ras_record_page_error(ev.address, ev.error_count, now); ++#endif ++ + #ifdef HAVE_ABRT_REPORT + /* Report event to ABRT */ + ras_report_mc_event(ras, &ev); +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +new file mode 100644 +index 0000000..50e4406 +--- /dev/null ++++ b/ras-page-isolation.c +@@ -0,0 +1,332 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include "ras-logger.h" ++#include "ras-page-isolation.h" ++ ++#define PARSED_ENV_LEN 50 ++static const struct config threshold_units[] = { ++ { "m", 1000 }, ++ { "k", 1000 }, ++ { "", 1 }, ++ {} ++}; ++ ++static const struct config cycle_units[] = { ++ { "d", 24 }, ++ { "h", 60 }, ++ { "m", 60 }, ++ { "s", 1 }, ++ {} ++}; ++ ++static struct isolation threshold = { ++ .name = "PAGE_CE_THRESHOLD", ++ .units = threshold_units, ++ .env = "50", ++ .unit = "", ++}; ++ ++static struct isolation cycle = { ++ .name = "PAGE_CE_REFRESH_CYCLE", ++ .units = cycle_units, ++ .env = "24h", ++ .unit = "h", ++}; ++ ++static const char *kernel_offline[] = { ++ [OFFLINE_SOFT] = "/sys/devices/system/memory/soft_offline_page", ++ [OFFLINE_HARD] = "/sys/devices/system/memory/hard_offline_page", ++ [OFFLINE_SOFT_THEN_HARD] = "/sys/devices/system/memory/soft_offline_page", ++}; ++ ++static const struct config offline_choice[] = { ++ { "off", OFFLINE_OFF }, ++ { "account", OFFLINE_ACCOUNT }, ++ { "soft", OFFLINE_SOFT }, ++ { "hard", OFFLINE_HARD }, ++ { "soft-then-hard", OFFLINE_SOFT_THEN_HARD }, ++ {} ++}; ++ ++static const char *page_state[] = { ++ [PAGE_ONLINE] = "online", ++ [PAGE_OFFLINE] = "offlined", ++ [PAGE_OFFLINE_FAILED] = "offline-failed", ++}; ++ ++static enum otype offline = OFFLINE_SOFT; ++static struct rb_root page_records; ++ ++static void page_offline_init(void) ++{ ++ const char *env = "PAGE_CE_ACTION"; ++ char *choice = getenv(env); ++ const struct config *c = NULL; ++ int matched = 0; ++ ++ if (choice) { ++ for (c = offline_choice; c->name; c++) { ++ if (!strcasecmp(choice, c->name)) { ++ offline = c->val; ++ matched = 1; ++ break; ++ } ++ } ++ } ++ ++ if (!matched) ++ log(TERM, LOG_INFO, "Improper %s, set to default soft\n", env); ++ ++ if (offline > OFFLINE_ACCOUNT && access(kernel_offline[offline], W_OK)) { ++ log(TERM, LOG_INFO, "Kernel does not support page offline interface\n"); ++ offline = 
OFFLINE_ACCOUNT; ++ } ++ ++ log(TERM, LOG_INFO, "Page offline choice on Corrected Errors is %s\n", ++ offline_choice[offline].name); ++} ++ ++static void parse_isolation_env(struct isolation *config) ++{ ++ char *env = getenv(config->name); ++ char *unit = NULL; ++ const struct config *units = NULL; ++ int i, no_unit; ++ int valid = 0; ++ int unit_matched = 0; ++ unsigned long value, tmp; ++ ++ /* check if env is vaild */ ++ if (env && strlen(env)) { ++ /* All the character before unit must be digit */ ++ for (i = 0; i < strlen(env) - 1; i++) { ++ if (!isdigit(env[i])) ++ goto parse; ++ } ++ if (sscanf(env, "%lu", &value) < 1 || !value) ++ goto parse; ++ /* check if the unit is vaild */ ++ unit = env + strlen(env) - 1; ++ /* no unit, all the character are value character */ ++ if (isdigit(*unit)) { ++ valid = 1; ++ no_unit = 1; ++ goto parse; ++ } ++ for (units = config->units; units->name; units++) { ++ /* value character and unit character are both valid */ ++ if (!strcasecmp(unit, units->name)) { ++ valid = 1; ++ no_unit = 0; ++ break; ++ } ++ } ++ } ++ ++parse: ++ /* if invalid, use default env */ ++ if (valid) { ++ config->env = env; ++ if (!no_unit) ++ config->unit = unit; ++ } else { ++ log(TERM, LOG_INFO, "Improper %s, set to default %s.\n", ++ config->name, config->env); ++ } ++ ++ /* if env value string is greater than ulong_max, truncate the last digit */ ++ sscanf(config->env, "%lu", &value); ++ for (units = config->units; units->name; units++) { ++ if (!strcasecmp(config->unit, units->name)) ++ unit_matched = 1; ++ if (unit_matched) { ++ tmp = value; ++ value *= units->val; ++ if (tmp != 0 && value / tmp != units->val) ++ config->overflow = true; ++ } ++ } ++ config->val = value; ++ /* In order to output value and unit perfectly */ ++ config->unit = no_unit ? 
config->unit : ""; ++} ++ ++static void parse_env_string(struct isolation *config, char *str) ++{ ++ int i; ++ ++ if (config->overflow) { ++ /* when overflow, use basic unit */ ++ for (i = 0; config->units[i].name; i++) ; ++ sprintf(str, "%lu%s", config->val, config->units[i-1].name); ++ log(TERM, LOG_INFO, "%s is set overflow(%s), truncate it\n", ++ config->name, config->env); ++ } else { ++ sprintf(str, "%s%s", config->env, config->unit); ++ } ++} ++ ++static void page_isolation_init(void) ++{ ++ char threshold_string[PARSED_ENV_LEN]; ++ char cycle_string[PARSED_ENV_LEN]; ++ /** ++ * It's unnecessary to parse threshold configuration when offline ++ * choice is off. ++ */ ++ if (offline == OFFLINE_OFF) ++ return; ++ ++ parse_isolation_env(&threshold); ++ parse_isolation_env(&cycle); ++ parse_env_string(&threshold, threshold_string); ++ parse_env_string(&cycle, cycle_string); ++ log(TERM, LOG_INFO, "Threshold of memory Corrected Errors is %s / %s\n", ++ threshold_string, cycle_string); ++} ++ ++void ras_page_account_init(void) ++{ ++ page_offline_init(); ++ page_isolation_init(); ++} ++ ++static int do_page_offline(unsigned long long addr, enum otype type) ++{ ++ FILE *offline_file; ++ int err; ++ ++ offline_file = fopen(kernel_offline[type], "w"); ++ if (!offline_file) ++ return -1; ++ ++ fprintf(offline_file, "%#llx", addr); ++ err = ferror(offline_file) ? -1 : 0; ++ fclose(offline_file); ++ ++ return err; ++} ++ ++static void page_offline(struct page_record *pr) ++{ ++ unsigned long long addr = pr->addr; ++ int ret; ++ ++ /* Offlining page is not required */ ++ if (offline <= OFFLINE_ACCOUNT) ++ return; ++ ++ /* Ignore offlined pages */ ++ if (pr->offlined != PAGE_ONLINE) ++ return; ++ ++ /* Time to silence this noisy page */ ++ if (offline == OFFLINE_SOFT_THEN_HARD) { ++ ret = do_page_offline(addr, OFFLINE_SOFT); ++ if (ret < 0) ++ ret = do_page_offline(addr, OFFLINE_HARD); ++ } else { ++ ret = do_page_offline(addr, offline); ++ } ++ ++ pr->offlined = ret < 0 ? 
PAGE_OFFLINE_FAILED : PAGE_OFFLINE; ++ ++ log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n", ++ addr, page_state[pr->offlined]); ++} ++ ++static void page_record(struct page_record *pr, unsigned count, time_t time) ++{ ++ unsigned long period = time - pr->start; ++ unsigned long tolerate; ++ ++ if (period >= cycle.val) { ++ /** ++ * Since we don't refresh automatically, it is possible that the period ++ * between two occurences will be longer than the pre-configured refresh cycle. ++ * In this case, we tolerate the frequency of the whole period up to ++ * the pre-configured threshold. ++ */ ++ tolerate = (period / (double)cycle.val) * threshold.val; ++ pr->count -= (tolerate > pr->count) ? pr->count : tolerate; ++ pr->start = time; ++ pr->excess = 0; ++ } ++ ++ pr->count += count; ++ if (pr->count >= threshold.val) { ++ log(TERM, LOG_INFO, "Corrected Errors at %#llx exceeded threshold\n", pr->addr); ++ ++ /** ++ * Backup ce count of current cycle to enable next round, which actually ++ * should never happen if we can disable overflow completely in the same ++ * time unit (but sadly we can't). 
++ */ ++ pr->excess += pr->count; ++ pr->count = 0; ++ page_offline(pr); ++ } ++} ++ ++static struct page_record *page_lookup_insert(unsigned long long addr) ++{ ++ struct rb_node **entry = &page_records.rb_node; ++ struct rb_node *parent = NULL; ++ struct page_record *pr = NULL, *find = NULL; ++ ++ while (*entry) { ++ parent = *entry; ++ pr = rb_entry(parent, struct page_record, entry); ++ if (addr == pr->addr) { ++ return pr; ++ } else if (addr < pr->addr) { ++ entry = &(*entry)->rb_left; ++ } else { ++ entry = &(*entry)->rb_right; ++ } ++ } ++ ++ find = calloc(1, sizeof(struct page_record)); ++ if (!find) { ++ log(TERM, LOG_ERR, "No memory for page records\n"); ++ return NULL; ++ } ++ ++ find->addr = addr; ++ rb_link_node(&find->entry, parent, entry); ++ rb_insert_color(&find->entry, &page_records); ++ ++ return find; ++} ++ ++void ras_record_page_error(unsigned long long addr, unsigned count, time_t time) ++{ ++ struct page_record *pr = NULL; ++ ++ if (offline == OFFLINE_OFF) ++ return; ++ ++ pr = page_lookup_insert(addr & PAGE_MASK); ++ if (pr) { ++ if (!pr->start) ++ pr->start = time; ++ page_record(pr, count, time); ++ } ++} +diff --git a/ras-page-isolation.h b/ras-page-isolation.h +new file mode 100644 +index 0000000..3d03cef +--- /dev/null ++++ b/ras-page-isolation.h +@@ -0,0 +1,66 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++*/ ++ ++#ifndef __RAS_PAGE_ISOLATION_H ++#define __RAS_PAGE_ISOLATION_H ++ ++#include ++#include ++#include "rbtree.h" ++ ++#define PAGE_SHIFT 12 ++#define PAGE_SIZE (1 << PAGE_SHIFT) ++#define PAGE_MASK (~(PAGE_SIZE-1)) ++ ++struct config { ++ char *name; ++ unsigned long val; ++}; ++ ++enum otype { ++ OFFLINE_OFF, ++ OFFLINE_ACCOUNT, ++ OFFLINE_SOFT, ++ OFFLINE_HARD, ++ OFFLINE_SOFT_THEN_HARD, ++}; ++ ++enum pstate { ++ PAGE_ONLINE, ++ PAGE_OFFLINE, ++ PAGE_OFFLINE_FAILED, ++}; ++ ++struct page_record { ++ struct rb_node entry; ++ unsigned long long addr; ++ time_t start; ++ enum pstate offlined; ++ unsigned long count; ++ unsigned long excess; ++}; ++ ++struct isolation { ++ char *name; ++ char *env; ++ const struct config *units; ++ unsigned long val; ++ bool overflow; ++ char *unit; ++}; ++ ++void ras_page_account_init(void); ++void ras_record_page_error(unsigned long long addr, unsigned count, time_t time); ++ ++#endif +-- +2.27.0 + diff --git a/0023-rasdaemon-Add-notification-support-when-page-goes-of.patch b/0023-rasdaemon-Add-notification-support-when-page-goes-of.patch new file mode 100644 index 0000000000000000000000000000000000000000..9132d67fba2e2dc2f869b09943f4c298c6207c6f --- /dev/null +++ b/0023-rasdaemon-Add-notification-support-when-page-goes-of.patch @@ -0,0 +1,259 @@ +From 07c3c72d18e5c7da2109b5afa918966733039f13 Mon Sep 17 00:00:00 2001 +From: Bixuan Cui +Date: Sun, 5 Jun 2022 02:10:24 +0800 +Subject: [PATCH] rasdaemon: Add notification support when page goes offline for Memory Corrected Error + +When the page goes offline, it may affect the user's processes. +The user needs to do some special actions (such as restarting the +process) before or after going offline. + +So add page-ce-offline-pre-notice and page-ce-offline-post-notice +to env file of rasdaemon for notifying the user when doing page +offline. 
+ +Signed-off-by: Bixuan Cui +--- + Makefile.am | 2 +- + misc/notices/page-ce-offline-post-notice | 17 +++++ + misc/notices/page-ce-offline-pre-notice | 17 +++++ + misc/rasdaemon.env | 4 ++ + misc/rasdaemon.spec.in | 3 + + ras-page-isolation.c | 90 ++++++++++++++++++++++++ + 6 files changed, 132 insertions(+), 1 deletion(-) + create mode 100755 misc/notices/page-ce-offline-post-notice + create mode 100755 misc/notices/page-ce-offline-pre-notice + +diff --git a/Makefile.am b/Makefile.am +index de76301..701b120 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -1,6 +1,6 @@ + ACLOCAL_AMFLAGS=-I m4 + SUBDIRS = libtrace util man +-SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in misc/rasdaemon.env ++SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in misc/rasdaemon.env misc/notices + SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) + EXTRA_DIST = $(SYSTEMD_SERVICES_IN) + +diff --git a/misc/notices/page-ce-offline-post-notice b/misc/notices/page-ce-offline-post-notice +new file mode 100755 +index 0000000..d78b1b0 +--- /dev/null ++++ b/misc/notices/page-ce-offline-post-notice +@@ -0,0 +1,17 @@ ++#!/bin/sh ++# This shell script can be executed by rasdaemon after a page goes offline. ++ ++cd `dirname $0` ++ ++[ -x ./page-ce-offline-post-notice.local ] && . ./page-ce-offline-post-notice.local $1 ++ ++if [ -d page-ce-offline-post-notice.extern ] ++then ++ ls page-ce-offline-post-notice.extern | ++ while read item ++ do ++ [ -x ./page-ce-offline-post-notice.extern/$item ] && . ./page-ce-offline-post-notice.extern/$item $1 ++ done ++fi ++ ++exit 0 +diff --git a/misc/notices/page-ce-offline-pre-notice b/misc/notices/page-ce-offline-pre-notice +new file mode 100755 +index 0000000..d1038a3 +--- /dev/null ++++ b/misc/notices/page-ce-offline-pre-notice +@@ -0,0 +1,17 @@ ++#!/bin/sh ++# This shell script can be executed by rasdaemon before a page goes offline. 
++ ++cd `dirname $0` ++ ++[ -x ./page-ce-offline-pre-notice.local ] && . ./page-ce-offline-pre-notice.local $1 ++ ++if [ -d page-ce-offline-pre-notice.extern ] ++then ++ ls page-ce-offline-pre-notice.extern | ++ while read item ++ do ++ [ -x ./page-ce-offline-pre-notice.extern/$item ] && . ./page-ce-offline-pre-notice.extern/$item $1 ++ done ++fi ++ ++exit 0 +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 12fd766..713875a 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -27,3 +27,7 @@ PAGE_CE_THRESHOLD="50" + # soft-then-hard First try to soft offline, then try hard offlining. + # Note: default offline choice is "soft". + PAGE_CE_ACTION="soft" ++ ++# Notices script when doing memory offline ++PAGE_CE_OFFLINE_PRE_NOTICE="page-ce-offline-pre-notice" ++PAGE_CE_OFFLINE_POST_NOTICE="page-ce-offline-post-notice" +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index eff9794..f690575 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -45,6 +45,8 @@ make install DESTDIR=%{buildroot} + install -D -p -m 0644 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} + install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service + install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service ++install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ ++install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -56,6 +58,7 @@ rm INSTALL %{buildroot}/usr/include/*.h + %{_sharedstatedir}/rasdaemon + %{_sysconfdir}/ras/dimm_labels.d + %config(noreplace) %{_sysconfdir}/sysconfig/%{name} ++%config(noreplace) %{_sysconfdir}/rasdaemon_notices/* + + %changelog + +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index 50e4406..f4f3bc1 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -17,9 +17,13 @@ + #include + #include + #include ++#include ++#include ++#include 
+ #include "ras-logger.h" + #include "ras-page-isolation.h" + ++#define MAX_PATH_LEN 64 + #define PARSED_ENV_LEN 50 + static const struct config threshold_units[] = { + { "m", 1000 }, +@@ -73,6 +77,8 @@ static const char *page_state[] = { + + static enum otype offline = OFFLINE_SOFT; + static struct rb_root page_records; ++static char pre_notice[MAX_PATH_LEN]; ++static char post_notice[MAX_PATH_LEN]; + + static void page_offline_init(void) + { +@@ -202,16 +208,94 @@ static void page_isolation_init(void) + threshold_string, cycle_string); + } + ++static void page_notice_init(void) ++{ ++ char *notice_root = "/etc/rasdaemon_notices"; ++ char *pre_re = getenv("PAGE_CE_OFFLINE_PRE_NOTICE"); ++ char *post_re = getenv("PAGE_CE_OFFLINE_POST_NOTICE"); ++ ++ if (offline <= OFFLINE_ACCOUNT) ++ return; ++ ++ snprintf(pre_notice, sizeof(pre_notice), "%s/%s", notice_root, pre_re); ++ if (access(pre_notice, R_OK|X_OK) < 0) ++ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", pre_notice); ++ ++ snprintf(post_notice, sizeof(post_notice), "%s/%s", notice_root, post_re); ++ if (access(post_notice, R_OK|X_OK) < 0) ++ log(TERM, LOG_ERR, "cannot access page notice '%s'\n", post_notice); ++} ++ + void ras_page_account_init(void) + { + page_offline_init(); + page_isolation_init(); ++ page_notice_init(); ++} ++ ++static void finish_child(pid_t child, int status) ++{ ++ if (WIFEXITED(status) && WEXITSTATUS(status)) { ++ log(TERM, LOG_INFO, "notice exited with status %d\n", WEXITSTATUS(status)); ++ } else if (WIFSIGNALED(status)) { ++ log(TERM, LOG_INFO,"notice died with signal %s\n", strsignal(WTERMSIG(status))); ++ } ++ ++ return; ++} ++ ++static void __run_notice(char *argv[], char **env) ++{ ++ pid_t child; ++ int status; ++ ++ child = fork(); ++ if (child < 0) { ++ log(TERM, LOG_ERR, "Cannot create process for offline notice"); ++ return; ++ } ++ if (child == 0) { ++ execve(argv[0], argv, env); ++ _exit(127); ++ } ++ else { ++ waitpid(child, &status, 0); ++ finish_child(child, 
status); ++ } ++} ++ ++static void run_notice(char *argv[]) ++{ ++ int MAX_ENV = 20; ++ char *env[MAX_ENV]; ++ int ei = 0; ++ int i; ++ ++ asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin"); ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ __run_notice(argv, env); ++ ++ for (i = 0; i < ei; i++) ++ free(env[i]); + } + + static int do_page_offline(unsigned long long addr, enum otype type) + { + FILE *offline_file; + int err; ++ char *args; ++ char *argv[] = { ++ NULL, ++ NULL, ++ NULL, ++ }; ++ ++ asprintf(&args, "%llu", addr); ++ argv[0] = (char*)&pre_notice; ++ argv[1] = args; ++ run_notice(argv); + + offline_file = fopen(kernel_offline[type], "w"); + if (!offline_file) +@@ -221,6 +305,11 @@ static int do_page_offline(unsigned long long addr, enum otype type) + err = ferror(offline_file) ? -1 : 0; + fclose(offline_file); + ++ argv[0] = (char*)&post_notice; ++ run_notice(argv); ++ ++ free(args); ++ + return err; + } + +@@ -329,4 +418,5 @@ void ras_record_page_error(unsigned long long addr, unsigned count, time_t time) + pr->start = time; + page_record(pr, count, time); + } ++ + } +-- +2.27.0 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 49f41b92a227e6fabea69eba27b2ce018dd0f287..30ed82dce93b2e0e71ec85dac8cb5a2a3dc2d196 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,4 +1,4 @@ -%define anolis_release .0.1 +%define anolis_release .0.2 Name: rasdaemon Version: 0.6.1 Release: 12%{anolis_release}%{?dist} @@ -45,6 +45,9 @@ Patch17: 16d929b024c31d54a7f8a72eab094376c7be27f5.patch Patch18: b497a3d6a39d402c41065e9284d49114b97e3bfe.patch Patch19: ce6e7864f11f709c4f803828fbc8e507d115d03b.patch Patch20: a8c776ed94f68ae31d7b5f74e19545698898c13c.patch +Patch21: 0021-rasdaemon-add-rbtree-support-for-page-record.patch +Patch22: 0022-rasdaemon-add-support-for-memory-Corrected-Error-pre.patch +Patch23: 0023-rasdaemon-Add-notification-support-when-page-goes-of.patch # Begin: Anolis customized patches # Backport from fc32 to fix FTBFS on 
gcc10 @@ -83,6 +86,9 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch18 -p1 %patch19 -p1 %patch20 -p1 +%patch21 -p1 +%patch22 -p1 +%patch23 -p1 %patch1001 -p1 @@ -92,7 +98,7 @@ autoreconf -vfi %build %ifarch %{arm} aarch64 -%configure --enable-aer --enable-sqlite3 --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm +%configure --enable-aer --enable-sqlite3 --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-ce-pfa %else %configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-memory-failure %endif @@ -103,6 +109,11 @@ make install DESTDIR=%{buildroot} install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service install -D -p -m 0655 labels/* %{buildroot}%{_sysconfdir}/ras/dimm_labels.d +%ifarch %{arm} aarch64 +install -D -p -m 0644 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} +install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ +install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ +%endif rm INSTALL %{buildroot}/usr/include/*.h %files @@ -113,8 +124,16 @@ rm INSTALL %{buildroot}/usr/include/*.h %{_unitdir}/*.service %{_sharedstatedir}/rasdaemon %{_sysconfdir}/ras/dimm_labels.d +%ifarch %{arm} aarch64 +%config(noreplace) %{_sysconfdir}/sysconfig/%{name} +%config(noreplace) %{_sysconfdir}/rasdaemon_notices/* +%endif %changelog +* Tue Jun 28 2022 Bixuan Cui - 0.6.1-12.0.2 +- rasdaemon: add support for memory Corrected Error predictive failure analysis +- rasdaemon: add notification support when page goes offline for Memory Corrected Error + * Fri Apr 22 2022 Weitao Zhou - 0.6.1-12.0.1 - use extern in header files when declaring global variables for compatible gcc10 build