From 9b64006fcc8759324654e0e8b01e603cfb46b8a6 Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Mon, 7 Jul 2025 16:42:29 +0800 Subject: [PATCH 1/2] fix --- .../recover/fault_recover_service.go | 105 +++++++++--------- 1 file changed, 52 insertions(+), 53 deletions(-) diff --git a/component/clusterd/pkg/application/recover/fault_recover_service.go b/component/clusterd/pkg/application/recover/fault_recover_service.go index 4612812e4..4f35f05b8 100644 --- a/component/clusterd/pkg/application/recover/fault_recover_service.go +++ b/component/clusterd/pkg/application/recover/fault_recover_service.go @@ -16,7 +16,6 @@ import ( "clusterd/pkg/application/faultmanager" "clusterd/pkg/common/constant" "clusterd/pkg/domain/common" - "clusterd/pkg/domain/faultdomain" "clusterd/pkg/domain/job" "clusterd/pkg/domain/podgroup" "clusterd/pkg/interface/grpc/recover" @@ -81,28 +80,28 @@ func (s *FaultRecoverService) notifyFaultInfoForJob(faultInfo constant.JobFaultI return } var grpcFormatFaults []*pb.FaultRank = nil - for _, info := range faultInfo.FaultList { - if info.PodUid == "" || info.PodRank == "" { - hwlog.RunLog.Warnf("invalid pod info, podId=%s, podRank=%s", - info.PodUid, info.PodRank) - continue - } - faultPod := make(map[string]string) - faultPod[info.PodRank] = info.PodUid - controller.mergeFaultPod(faultPod) - hwlog.RunLog.Debugf("mergeFaultPod: %v", faultPod) - fault := &pb.FaultRank{ - RankId: info.RankId, - } - fault.FaultType = constant.NormalFaultType - if info.DoStepRetry && faultdomain.IsUceFault(info.FaultCode) { - fault.FaultType = constant.UceFaultType - } - if info.DoStepRetry && faultdomain.IsHcclRetryFault(info.FaultCode) { - fault.FaultType = constant.HcclFaultType - } - grpcFormatFaults = append(grpcFormatFaults, fault) - } + //for _, info := range faultInfo.FaultList { + // if info.PodUid == "" || info.PodRank == "" { + // hwlog.RunLog.Warnf("invalid pod info, podId=%s, podRank=%s", + // info.PodUid, info.PodRank) + // continue + // } + // faultPod := make(map[string]string) + // faultPod[info.PodRank] = info.PodUid + // controller.mergeFaultPod(faultPod) + // hwlog.RunLog.Debugf("mergeFaultPod: %v", faultPod) + // fault := &pb.FaultRank{ + // RankId: info.RankId, + // } + // fault.FaultType = constant.NormalFaultType + // if info.DoStepRetry && faultdomain.IsUceFault(info.FaultCode) { + // fault.FaultType = constant.UceFaultType + // } + // if info.DoStepRetry && faultdomain.IsHcclRetryFault(info.FaultCode) { + // fault.FaultType = constant.HcclFaultType + // } + // grpcFormatFaults = append(grpcFormatFaults, fault) + //} hwlog.RunLog.Infof("jobId=%s, fault center fault info change format to grpcFormat, faults=%s", controller.jobInfo.JobId, common.Faults2String(grpcFormatFaults)) controller.saveCacheFault(grpcFormatFaults) @@ -392,36 +391,36 @@ func (s *FaultRecoverService) ReportProcessFault(ctx context.Context, requestInfo := fmt.Sprintf("jobId=%s, faultRanks={%s}", request.JobId, common.Faults2String(request.FaultRanks)) hwlog.RunLog.Infof("receive ReportProcessFault, info={%s}", requestInfo) - controller, exist := s.getController(request.JobId) - if !exist { - hwlog.RunLog.Errorf("jobId=%s not registed", request.JobId) - return &pb.Status{ - Code: int32(common.UnRegistry), - Info: fmt.Sprintf("jobId=%s not registed", request.JobId), - }, nil - } - controller.saveCacheFault(request.FaultRanks) - var err error - faultReason := getFaultReason(request.FaultRanks) - faultPod, err := common.LabelFaultPod(request.JobId, - common.Faults2Ranks(request.FaultRanks), controller.GetFaultPod(), faultReason) - controller.mergeFaultPod(faultPod) - if err != nil { - hwlog.RunLog.Errorf("failed to label soft fault label, err:%v, jobId=%s", - err, request.JobId) - } - if !common.IsRetryFault(request.FaultRanks) { - // when config only support dump strategy, in order to be able to dump directly, set healthState to UnHealthy - controller.healthState = constant.UnHealthyState - controller.restartFaultProcess = common.CanRestartFaultProcess(request.JobId, nil) - controller.addEvent(common.FaultOccurEvent) - } else { - if faultmanager.GlobalFaultProcessCenter != nil { - giveSoftFault2FaultCenter(request.JobId, request.FaultRanks) - } else { - hwlog.RunLog.Warnf("global fault center is nil") - } - } + //controller, exist := s.getController(request.JobId) + //if !exist { + // hwlog.RunLog.Errorf("jobId=%s not registed", request.JobId) + // return &pb.Status{ + // Code: int32(common.UnRegistry), + // Info: fmt.Sprintf("jobId=%s not registed", request.JobId), + // }, nil + //} + //controller.saveCacheFault(request.FaultRanks) + //var err error + //faultReason := getFaultReason(request.FaultRanks) + //faultPod, err := common.LabelFaultPod(request.JobId, + // common.Faults2Ranks(request.FaultRanks), controller.GetFaultPod(), faultReason) + //controller.mergeFaultPod(faultPod) + //if err != nil { + // hwlog.RunLog.Errorf("failed to label soft fault label, err:%v, jobId=%s", + // err, request.JobId) + //} + //if !common.IsRetryFault(request.FaultRanks) { + // // when config only support dump strategy, in order to be able to dump directly, set healthState to UnHealthy + // controller.healthState = constant.UnHealthyState + // controller.restartFaultProcess = common.CanRestartFaultProcess(request.JobId, nil) + // controller.addEvent(common.FaultOccurEvent) + //} else { + // if faultmanager.GlobalFaultProcessCenter != nil { + // giveSoftFault2FaultCenter(request.JobId, request.FaultRanks) + // } else { + // hwlog.RunLog.Warnf("global fault center is nil") + // } + //} return &pb.Status{ Code: int32(common.OK), Info: "receive ReportProcessFault", -- Gitee From 771005560b8c74af17183257ef7dcaef98be7b05 Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Mon, 7 Jul 2025 19:48:52 +0800 Subject: [PATCH 2/2] fix --- component/clusterd/pkg/application/recover/controller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/component/clusterd/pkg/application/recover/controller.go b/component/clusterd/pkg/application/recover/controller.go index c1989c9b8..722e6805b 100644 --- a/component/clusterd/pkg/application/recover/controller.go +++ b/component/clusterd/pkg/application/recover/controller.go @@ -600,6 +600,7 @@ func (ctl *EventController) handleNotifyWaitFaultFlushing() (string, common.Resp return common.NotifyFinishEvent, common.OperateConfigMapError, nil } hwlog.RunLog.Infof("write configmap FaultFlushing success, %s", cm.Data[constant.ResetInfoCMDataKey]) + time.Sleep(time.Second * 30) return common.NotifyFinishEvent, common.OK, nil } -- Gitee