From d708bdf82bdb1273a32cd02bacd38bd3966b8dfe Mon Sep 17 00:00:00 2001 From: Lv Ying Date: Wed, 31 Mar 2021 10:49:55 -0700 Subject: [PATCH] rasdaemon: backport bugfix patches from community 1. ras-page-isolation: do_page_offline always considers page offline was successful https://github.com/mchehab/rasdaemon/commit/e4d27840e173491ab29c2d97017da9344e2c2526 2. ras-page-isolation: page which is PAGE_OFFLINE_FAILED can be offlined again https://github.com/mchehab/rasdaemon/commit/c329012ce4b44af08217f2a8f2b3b9b1b4b1c0d3 --- ...on-do_page_offline-always-considers-.patch | 104 ++++++++++++++++++ ...on-page-which-is-PAGE_OFFLINE_FAILED.patch | 44 ++++++++ rasdaemon.spec | 9 +- 3 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch create mode 100644 backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch diff --git a/backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch b/backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch new file mode 100644 index 0000000..63ebacd --- /dev/null +++ b/backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch @@ -0,0 +1,104 @@ +From e4d27840e173491ab29c2d97017da9344e2c2526 Mon Sep 17 00:00:00 2001 +From: lvying +Date: Sat, 31 Oct 2020 17:57:14 +0800 +Subject: [PATCH 1/2] ras-page-isolation: do_page_offline always considers page + offline was successful + +do_page_offline always consider page offline was successful even if +kernel soft/hard offline page failed. + +Calling rasdaemon with: + + /etc/sysconfig/rasdaemon PAGE_CE_THRESHOLD="1" + +i.e when a page's address occurs Corrected Error, rasdaemon should +trigger this page soft offline. 
+ +However, after adding a livepatch into the kernel's +store_soft_offline_page to observe this function's return value, +when injecting a CE into address 0x3f7ec30000, the kernel +log reports: + + soft_offline: 0x3f7ec30: unknown non LRU page type ffffe0000000000 () + [store_soft_offline_page]return from soft_offline_page: -5 + +While the rasdaemon log reports: + + rasdaemon[73711]: cpu 00:rasdaemon: Corrected Errors at 0x3f7ec30000 exceed threshold + rasdaemon[73711]: rasdaemon: Result of offlining page at 0x3f7ec30000: offlined + +Using strace to record rasdaemon's system calls, it reports: + + strace -p 73711 + openat(AT_FDCWD, "/sys/devices/system/memory/soft_offline_page", + O_WRONLY|O_CREAT|O_TRUNC, 0666) = 28 + fstat(28, {st_mode=S_IFREG|0200, st_size=4096, ...}) = 0 + write(28, "0x3f7ec30000", 12) = -1 EIO (Input/output error) + close(28) = 0 + +So, the kernel actually failed to soft offline pfn 0x3f7ec30 and +store_soft_offline_page returned -EIO. However, rasdaemon always +considers the page offline to be successful. + +According to the strace output, ferror was unable to detect the +failure of the write syscall: the data is still buffered when ferror is checked, and the write only happens in fclose. + +This patch changes the fopen-fprintf-ferror-fclose sequence to use +lower-level I/O, namely open-write-close, which +can detect such a syscall failure. 
+ +Signed-off-by: lvying +Signed-off-by: Mauro Carvalho Chehab +--- + ras-page-isolation.c | 25 ++++++++++++++++--------- + 1 file changed, 16 insertions(+), 9 deletions(-) + +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index 50e4406..dc07545 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -17,6 +17,9 @@ + #include + #include + #include ++#include ++#include ++#include + #include "ras-logger.h" + #include "ras-page-isolation.h" + +@@ -210,18 +213,22 @@ void ras_page_account_init(void) + + static int do_page_offline(unsigned long long addr, enum otype type) + { +- FILE *offline_file; +- int err; ++ int fd, rc; ++ char buf[20]; + +- offline_file = fopen(kernel_offline[type], "w"); +- if (!offline_file) ++ fd = open(kernel_offline[type], O_WRONLY); ++ if (fd == -1) { ++ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, kernel_offline[type]); + return -1; ++ } + +- fprintf(offline_file, "%#llx", addr); +- err = ferror(offline_file) ? -1 : 0; +- fclose(offline_file); +- +- return err; ++ sprintf(buf, "%#llx", addr); ++ rc = write(fd, buf, strlen(buf)); ++ if (rc < 0) { ++ log(TERM, LOG_ERR, "page offline addr(%s) by %s failed, errno:%d\n", buf, kernel_offline[type], errno); ++ } ++ close(fd); ++ return rc; + } + + static void page_offline(struct page_record *pr) +-- +2.18.4 + diff --git a/backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch b/backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch new file mode 100644 index 0000000..724dc9f --- /dev/null +++ b/backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch @@ -0,0 +1,44 @@ +From c329012ce4b44af08217f2a8f2b3b9b1b4b1c0d3 Mon Sep 17 00:00:00 2001 +From: lvying6 +Date: Sat, 31 Oct 2020 17:57:15 +0800 +Subject: [PATCH 2/2] ras-page-isolation: page which is PAGE_OFFLINE_FAILED can + be offlined again + +OS may fail to offline page at the previous time. 
After some time, +this page's state may change so that the page can be offlined by the OS. +If correctable errors on this page reach the threshold again, +rasdaemon should try to offline this page again. + +Signed-off-by: lvying6 +Signed-off-by: Mauro Carvalho Chehab +--- + ras-page-isolation.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index dc07545..fd7bd70 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -237,12 +237,17 @@ static void page_offline(struct page_record *pr) + int ret; + + /* Offlining page is not required */ +- if (offline <= OFFLINE_ACCOUNT) ++ if (offline <= OFFLINE_ACCOUNT) { ++ log(TERM, LOG_INFO, "PAGE_CE_ACTION=%s, ignore to offline page at %#llx\n", ++ offline_choice[offline].name, addr); + return; ++ } + + /* Ignore offlined pages */ +- if (pr->offlined != PAGE_ONLINE) ++ if (pr->offlined == PAGE_OFFLINE) { ++ log(TERM, LOG_INFO, "page at %#llx is already offlined, ignore\n", addr); + return; ++ } + + /* Time to silence this noisy page */ + if (offline == OFFLINE_SOFT_THEN_HARD) { +-- +2.18.4 + diff --git a/rasdaemon.spec b/rasdaemon.spec index 7142a16..1c4c4a0 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.6 -Release: 2 +Release: 3 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -22,6 +22,8 @@ Requires(postun): systemd Patch1: bugfix-ras-events-memory-leak.patch Patch2: bugfix-rasdaemon-wait-for-file-access.patch Patch3: bugfix-fix-fd-check.patch +Patch4: backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch +Patch5: backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch %description The rasdaemon program is a daemon which monitors the platform @@ -68,6 +70,11 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl 
enable rasdaemon.service >/dev/null 2>&1 || : %changelog +* Wed Mar 31 2021 Lv Ying - 0.6.6-3 +- backport bugfix patches from community: + 1. ras-page-isolation: do_page_offline always considers page offline was successful + 2. ras-page-isolation: page which is PAGE_OFFLINE_FAILED can be offlined again + * Fri Sep 25 2020 openEuler Buildteam - 0.6.6-2 - Update software source URL -- Gitee