From 7291a5c0ffdc3f37c7a496a551df8093d6836609 Mon Sep 17 00:00:00 2001 From: bizhiyuan Date: Thu, 31 Aug 2023 01:06:11 +0800 Subject: [PATCH] Refactor: fencer: sleep 1s between reconnects (cherry picked from commit 0ef3fd0936d149b148509e74ce081aae34235579) --- ...r-fencer-sleep-1s-between-reconnects.patch | 185 ++++++++++++++++++ pacemaker.spec | 7 +- 2 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 Refactor-fencer-sleep-1s-between-reconnects.patch diff --git a/Refactor-fencer-sleep-1s-between-reconnects.patch b/Refactor-fencer-sleep-1s-between-reconnects.patch new file mode 100644 index 0000000..86fecd6 --- /dev/null +++ b/Refactor-fencer-sleep-1s-between-reconnects.patch @@ -0,0 +1,185 @@ +From 55b5b78c69089fd418c590eb265eef2f7b82d689 Mon Sep 17 00:00:00 2001 +From: bizhiyuan +Date: Thu, 31 Aug 2023 00:50:44 +0800 +Subject: [PATCH] Refactor: fencer: sleep 1s between reconnects + +--- + daemons/controld/controld_control.c | 5 ++- + daemons/controld/controld_fencing.c | 65 ++++++++++++++--------------- + daemons/controld/controld_fencing.h | 2 +- + 3 files changed, 36 insertions(+), 36 deletions(-) + +diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c +index ffc62a0..48efdd5 100644 +--- a/daemons/controld/controld_control.c ++++ b/daemons/controld/controld_control.c +@@ -504,8 +504,9 @@ do_started(long long action, + } else { + crm_notice("Pacemaker controller successfully started and accepting connections"); + } +- controld_trigger_fencer_connect(); +- ++ controld_set_fsa_input_flags(R_ST_REQUIRED); ++ controld_timer_fencer_connect(GINT_TO_POINTER(TRUE)); ++ + controld_clear_fsa_input_flags(R_STARTING); + register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); + } +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 89cb61f..8f571b0 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -391,7 +391,7 @@ execute_stonith_cleanup(void) + */ + + static stonith_t *stonith_api = NULL; +-static crm_trigger_t *stonith_reconnect = NULL; ++static mainloop_timer_t *controld_fencer_connect_timer = NULL; + static char *te_client_id = NULL; + + static gboolean +@@ -448,8 +448,9 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) + + if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) { + crm_crit("Fencing daemon connection failed"); +- mainloop_set_trigger(stonith_reconnect); +- ++ if (!mainloop_timer_running(controld_fencer_connect_timer)) { ++ mainloop_timer_start(controld_fencer_connect_timer); ++ } + } else { + crm_info("Fencing daemon disconnected"); + } +@@ -647,14 +648,14 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) + /*! + * \brief Connect to fencer + * +- * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop ++ * \param[in] user_data If NULL, retry failures now, otherwise retry in mainloop timer + * +- * \return TRUE ++ * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry + * \note If user_data is NULL, this will wait 2s between attempts, for up to + * 30 attempts, meaning the controller could be blocked as long as 58s. + */ +-static gboolean +-te_connect_stonith(gpointer user_data) ++gboolean ++controld_timer_fencer_connect(gpointer user_data) + { + int rc = pcmk_ok; + +@@ -662,13 +663,13 @@ te_connect_stonith(gpointer user_data) + stonith_api = stonith_api_new(); + if (stonith_api == NULL) { + crm_err("Could not connect to fencer: API memory allocation failed"); +- return TRUE; ++ return G_SOURCE_REMOVE; + } + } + + if (stonith_api->state != stonith_disconnected) { + crm_trace("Already connected to fencer, no need to retry"); +- return TRUE; ++ return G_SOURCE_REMOVE; + } + + if (user_data == NULL) { +@@ -681,17 +682,31 @@ te_connect_stonith(gpointer user_data) + } else { + // Non-blocking (retry failures later in main loop) + rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); ++ ++ ++ if (controld_fencer_connect_timer == NULL) { ++ controld_fencer_connect_timer = ++ mainloop_timer_add("controld_fencer_connect", 1000, ++ TRUE, controld_timer_fencer_connect, ++ GINT_TO_POINTER(TRUE)); ++ } ++ + if (rc != pcmk_ok) { + if (pcmk_is_set(controld_globals.fsa_input_register, + R_ST_REQUIRED)) { + crm_notice("Fencer connection failed (will retry): %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); +- mainloop_set_trigger(stonith_reconnect); +- } else { ++ ++ if (!mainloop_timer_running(controld_fencer_connect_timer)) { ++ mainloop_timer_start(controld_fencer_connect_timer); ++ } ++ ++ return G_SOURCE_CONTINUE; ++ } else { + crm_info("Fencer connection failed (ignoring because no longer required): %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); + } +- return TRUE; ++ return G_SOURCE_CONTINUE; + } + } + +@@ -709,23 +724,7 @@ te_connect_stonith(gpointer user_data) + crm_notice("Fencer successfully connected"); + } + +- return TRUE; +-} +- +-/*! +- \internal +- \brief Schedule fencer connection attempt in main loop +-*/ +-void +-controld_trigger_fencer_connect(void) +-{ +- if (stonith_reconnect == NULL) { +- stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, +- te_connect_stonith, +- GINT_TO_POINTER(TRUE)); +- } +- controld_set_fsa_input_flags(R_ST_REQUIRED); +- mainloop_set_trigger(stonith_reconnect); ++ return G_SOURCE_REMOVE; + } + + void +@@ -745,9 +744,9 @@ controld_disconnect_fencer(bool destroy) + stonith_api->cmds->free(stonith_api); + stonith_api = NULL; + } +- if (stonith_reconnect) { +- mainloop_destroy_trigger(stonith_reconnect); +- stonith_reconnect = NULL; ++ if (controld_fencer_connect_timer) { ++ mainloop_timer_del(controld_fencer_connect_timer); ++ controld_fencer_connect_timer = NULL; + } + if (te_client_id) { + free(te_client_id); +@@ -981,7 +980,7 @@ controld_execute_fence_action(pcmk__graph_t *graph, + priority_delay ? priority_delay : ""); + + /* Passing NULL means block until we can connect... */ +- te_connect_stonith(NULL); ++ controld_timer_fencer_connect(NULL); + + pcmk__scan_min_int(priority_delay, &delay_i, 0); + rc = fence_with_delay(target, type, delay_i); +diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h +index 86a5050..76779c6 100644 +--- a/daemons/controld/controld_fencing.h ++++ b/daemons/controld/controld_fencing.h +@@ -19,7 +19,7 @@ void controld_configure_fencing(GHashTable *options); + void st_fail_count_reset(const char * target); + + // stonith API client +-void controld_trigger_fencer_connect(void); ++gboolean controld_timer_fencer_connect(gpointer user_data); + void controld_disconnect_fencer(bool destroy); + int controld_execute_fence_action(pcmk__graph_t *graph, + pcmk__graph_action_t *action); +-- +2.27.0 + diff --git a/pacemaker.spec b/pacemaker.spec index 7c06e11..ceb58eb 100644 --- a/pacemaker.spec +++ b/pacemaker.spec @@ -17,7 +17,7 @@ ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) %global pcmkversion 2.1.6 -%global specversion 5 +%global specversion 6 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build %global commit 6fdc9deea294bbad629b003c6ae036aaed8e3ee0 @@ -155,7 +155,7 @@ Patch0: 0001-Fix-glib-assertions.patch Patch1: 0001-Add-the-parameter-of-dampening-and-fix-attrd_updater.patch Patch2: 0001-Add-the-parameter-of-dampening-and-fix-attrd_updater-HealthIOWait.patch Patch3: Fix-libcrmcommon-wait-for-reply-from-appropriate-con.patch - +Patch4: Refactor-fencer-sleep-1s-between-reconnects.patch # upstream commits Requires: resource-agents @@ -775,6 +775,9 @@ exit 0 %license %{nagios_name}-%{nagios_hash}/COPYING %changelog +* Thu Aug 31 2023 bizhiyuan - 2.1.6-6 +- Refactor: fencer: sleep 1s between reconnects + * Fri Aug 25 2023 zhanghan - 2.1.6-5 - Fix: libcrmcommon: wait for reply from appropriate controller commands -- Gitee