From 016760a37b2b330fcb74c04bc7fd4a54e2730bb0 Mon Sep 17 00:00:00 2001
From: zouzhimin <zouzhimin@kylinos.cn>
Date: Sun, 28 Apr 2024 23:02:01 +0800
Subject: [PATCH] Fix: tools: crm_mon segfaults when fencer connection is lost

(cherry picked from commit 41b01acb16af8352d48b3866988286dcaf0474f8)
---
 ...aults-when-fencer-connection-is-lost.patch | 84 +++++++++++++++++++
 pacemaker.spec                                |  6 +-
 2 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 Fix-tools-crm_mon-segfaults-when-fencer-connection-is-lost.patch

diff --git a/Fix-tools-crm_mon-segfaults-when-fencer-connection-is-lost.patch b/Fix-tools-crm_mon-segfaults-when-fencer-connection-is-lost.patch
new file mode 100644
index 0000000..4d9c536
--- /dev/null
+++ b/Fix-tools-crm_mon-segfaults-when-fencer-connection-is-lost.patch
@@ -0,0 +1,84 @@
+From 401f5d971f12db7792971aeec3aaba9f52d67626 Mon Sep 17 00:00:00 2001
+From: Reid Wahl <nrwahl@protonmail.com>
+Date: Thu, 18 Jan 2024 00:11:17 -0800
+Subject: [PATCH] Fix: tools: crm_mon segfaults when fencer connection is lost
+
+This is easiest to observe when Pacemaker is stopping.
+
+When crm_mon is running in interactive mode (the default) and the
+cluster is stopped, crm_mon crashes with a segmentation fault. This is a
+regression that was introduced in Pacemaker 2.1.0 by commit bc91cc5.
+However, for some reason the crash doesn't happen on all platforms. In
+particular, I can reproduce the crash on Fedora 38 and 39, but not on
+RHEL 9.3 or Fedora 37. This is independent of the Pacemaker version.
+
+The cause is a use-after-free. In detail, it is as follows:
+1. crm_mon registers a notification via its stonith API client for
+   disconnect events. This notification will call either
+   mon_st_callback_event() or mon_st_callback_display(), depending on
+   the CLI options. Both of these callbacks call
+   mon_cib_connection_destroy() for disconnect notifications, so it
+   doesn't matter which one is used.
+2. When the fencer connection is lost, the mainloop calls the stonith
+   API client's destroy callback (stonith_connection_destroy()).
+3. stonith_connection_destroy() sets the state to stonith_disconnected
+   and calls foreach_notify_entry(..., stonith_send_notification, blob),
+   where blob contains a disconnect notification.
+4. foreach_notify_entry() loops over all the registered notify entries,
+   calling stonith_send_notification(entry, blob) for each notify entry.
+5. For each notify client that's subscribed to disconnect notifications,
+   stonith_send_notification() calls the registered callback function.
+6. Based on the registration in step (1), stonith_send_notification()
+   synchronously calls mon_st_callback_event()/display() for crm_mon.
+7. mon_st_callback_event()/display() calls mon_cib_connection_destroy().
+8. mon_cib_connection_destroy() calls stonith_api_delete(), which frees
+   the stonith API client and its members, including the notification
+   table.
+9. Control returns to stonith_send_notification() and then back to
+   foreach_notify_entry().
+10. foreach_notify_entry() moves to the next entry in the list. But the
+    entire list was freed in step (8). So when it tries to access a
+    member of one of the entries, we get a segmentation fault.
+
+Commit bc91cc5 introduced the regression by deleting the stonith API
+client in mon_cib_connection_destroy(). Prior to that,
+mon_cib_connection_destroy() only disconnected the client and marked its
+notify entries for removal.
+
+I audited the other uses of stonith_api_delete() in crm_mon and
+elsewhere, and I believe they're safe in the sense that they're never
+called while we're processing stonith notify callbacks. A function
+should never be allowed to call stonith_api_delete() if the stonith API
+client might be sending out notifications. If there are more
+notifications in the table, attempts to access them will be a
+use-after-free.
+
+Fixes T751
+
+Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
+---
+ tools/crm_mon.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/tools/crm_mon.c b/tools/crm_mon.c
+index 7789bfebf..19a2ead89 100644
+--- a/tools/crm_mon.c
++++ b/tools/crm_mon.c
+@@ -854,8 +854,12 @@ mon_cib_connection_destroy(gpointer user_data)
+     /* the client API won't properly reconnect notifications if they are still
+      * in the table - so remove them
+      */
+-    stonith_api_delete(st);
+-    st = NULL;
++    if (st != NULL) {
++        if (st->state != stonith_disconnected) {
++            st->cmds->disconnect(st);
++        }
++        st->cmds->remove_notification(st, NULL);
++    }
+ 
+     if (cib) {
+         cib->cmds->signoff(cib);
+-- 
+2.25.1
+
diff --git a/pacemaker.spec b/pacemaker.spec
index 071df68..40f57cc 100644
--- a/pacemaker.spec
+++ b/pacemaker.spec
@@ -17,7 +17,7 @@
 ## can be incremented to build packages reliably considered "newer"
 ## than previously built packages with the same pcmkversion)
 %global pcmkversion 2.1.7
-%global specversion 9
+%global specversion 10
 
 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
 %global commit 0f7f88312f7a1ccedee60bf768aba79ee13d41e0
@@ -155,6 +155,7 @@ Patch2:	       Doc-HealthSMART-fix-the-description-of-temp_lower.patch
 Patch3:        002-schema-transfer.patch
 Patch4:        Improve-pacemaker-attrd-cache-management-and-logging.patch
 Patch5:        Fix-cibsecret-Use-ps-axww-to-avoid-truncating-issue.patch
+Patch6:        Fix-tools-crm_mon-segfaults-when-fencer-connection-is-lost.patch
 
 Requires:      resource-agents
 Requires:      %{pkgname_pcmk_libs} = %{version}-%{release}
@@ -762,6 +763,9 @@ exit 0
 %license %{nagios_name}-%{nagios_hash}/COPYING
 
 %changelog
+* Mon Apr 29 2024 zouzhimin <zouzhimin@kylinos.cn> - 2.1.7-10
+- Fix: tools: crm_mon segfaults when fencer connection is lost
+
 * Sun Apr 28 2024 zouzhimin <zouzhimin@kylinos.cn> - 2.1.7-9
 - Fix: cibsecret: Use 'ps axww' to avoid truncating issue
 
-- 
Gitee