From bc7a32b25f48b0e909aea7f74d389bf373140978 Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Fri, 7 Mar 2025 14:26:40 +0800 Subject: [PATCH] =?UTF-8?q?[clusterd]ranktable=E7=94=9F=E6=88=90=E5=A4=B1?= =?UTF-8?q?=E8=B4=A5=E6=97=B6=E4=BF=9D=E5=AD=98=E4=B8=B4=E7=BB=88=E9=81=97?= =?UTF-8?q?=E8=A8=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/interface/grpc/service/controller.go | 5 +++-- .../grpc/service/fault_recover_platform_plugin.go | 13 ++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/component/clusterd/pkg/interface/grpc/service/controller.go b/component/clusterd/pkg/interface/grpc/service/controller.go index 5553191bc..63bec511a 100644 --- a/component/clusterd/pkg/interface/grpc/service/controller.go +++ b/component/clusterd/pkg/interface/grpc/service/controller.go @@ -278,7 +278,8 @@ func (ctl *EventController) supportDumpStrategy() bool { if !ctl.jobInfo.PlatFormMode || !agentSupport { return agentSupport } - if ctl.platStrategy == constant.ProcessDumpStrategyName { + if ctl.platStrategy == constant.ProcessRecoverStrategyName || + ctl.platStrategy == constant.ProcessDumpStrategyName { return true } return false @@ -808,7 +809,7 @@ func (ctl *EventController) handleNotifyDecidedStrategy() (string, common.RespCo hwlog.RunLog.Infof("finish wait plat rankTable ready, jobId=%s, pgName=%s, err=%v", ctl.jobInfo.JobId, ctl.jobInfo.PgName, err) if err != nil { - return common.WaitRankTableReadyTimeoutEvent, common.ServerInnerError, nil + signal.ChangeStrategy = ctl.chooseForRecoverFail() } } return ctl.signalEnqueue(signal) diff --git a/component/clusterd/pkg/interface/grpc/service/fault_recover_platform_plugin.go b/component/clusterd/pkg/interface/grpc/service/fault_recover_platform_plugin.go index 43f5a75fd..d23037bf4 100644 --- a/component/clusterd/pkg/interface/grpc/service/fault_recover_platform_plugin.go +++ b/component/clusterd/pkg/interface/grpc/service/fault_recover_platform_plugin.go @@ -139,27 +139,30 @@ func WaitProcessResultFault(name, namespace string) ([]*pb.FaultRank, error) { return common.RemoveSliceDuplicateFaults(append(resultRanks, confirmRanks...)), err } -func rankTableReady(name, namespace string) bool { +func rankTableReady(name, namespace string) string { pg, err := kube.RetryGetPodGroup(name, namespace, constant.GetPodGroupTimes) if err != nil { hwlog.RunLog.Errorf("failed to get pg when check rankTableReady, err:%s,name:%s", err, name) - return false + return "" } if pg.Annotations == nil { pg.Annotations = make(map[string]string) } ready, ok := pg.Annotations[constant.RankTableReadyKey] if !ok { - return false + return "" } - return ready == strconv.FormatBool(true) + return ready } // WaitRankTableReady block process until RankTableReady is true func WaitRankTableReady(name, namespace string) error { startTime := time.Now().Unix() ready := rankTableReady(name, namespace) - for !ready { + for ready != strconv.FormatBool(true) { + if ready != "" { + return fmt.Errorf("check %s RankTableReady, RankTable ready failed", name) + } time.Sleep(constant.CheckPeriod * time.Second) timeUse := time.Now().Unix() - startTime if timeUse > constant.ProcessControlTimeout { -- Gitee