From 8a27dfbf31f20347c3822c454d9bb75db953f33e Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Tue, 15 Apr 2025 09:35:39 +0800 Subject: [PATCH 01/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20=E5=AE=9A=E4=B9=89AdvanceFaultCo?= =?UTF-8?q?nfigmap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/clusterd/go.mod | 2 +- .../clusterd/pkg/common/constant/type.go | 3 +++ .../pkg/domain/faultdomain/fault_utils.go | 22 +++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/component/clusterd/go.mod b/component/clusterd/go.mod index 76e7dc4b1..eba3d3abe 100644 --- a/component/clusterd/go.mod +++ b/component/clusterd/go.mod @@ -7,6 +7,7 @@ require ( github.com/agiledragon/gomonkey/v2 v2.8.0 github.com/fsnotify/fsnotify v1.6.0 github.com/golang/protobuf v1.5.3 + github.com/pkg/errors v0.9.1 github.com/smartystreets/goconvey v1.7.2 github.com/stretchr/testify v1.8.0 golang.org/x/time v0.3.0 @@ -41,7 +42,6 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/smartystreets/assertions v1.2.0 // indirect github.com/spf13/pflag v1.0.5 // indirect diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index 6cd3cb85e..e3a716c4b 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -464,3 +464,6 @@ type FaultNum struct { NodeFaultNum int `json:"-"` PubFaultNum int `json:"publicFaultNum"` } + +type AdvanceFaultConfigmap[T ConfigMapInterface] interface { +} diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 41992b4d7..2c601610e 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -57,6 +57,28 @@ func nodeNameToCmName(nodeName string) string { return constant.DeviceInfoPrefix + nodeName } +func GetAdvanceFaultCm[T constant.ConfigMapInterface]( + cmInfos map[string]T) map[string]constant.AdvanceFaultConfigmap[T] { + result := make(map[string]constant.AdvanceFaultConfigmap[T]) + for _, info := range cmInfos { + result[CmNameToNodeName(info.GetCmName())] = GetAdvanceFaultForNode(info) + } + return result +} + +func GetAdvanceFaultForNode(cmForNode constant.ConfigMapInterface) constant.AdvanceFaultConfigmap[constant.ConfigMapInterface] { + switch cmForNode.(type) { + case *constant.DeviceInfo: + return GetAdvanceDeviceCm(cmForNode.(*constant.DeviceInfo)) + case *constant.NodeInfo: + return cmForNode + case *constant.SwitchInfo: + return cmForNode + } + hwlog.RunLog.Errorf("cmForNode type is not support.") + return nil +} + // GetAdvanceDeviceCmForNodeMap get advance device cm for node map func GetAdvanceDeviceCmForNodeMap( deviceInfoCms map[string]*constant.DeviceInfo) map[string]constant.AdvanceDeviceFaultCm { -- Gitee From 7393aa63806d9f7ca77e01f6d35d1cb03e6ff6a0 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Wed, 16 Apr 2025 16:33:05 +0800 Subject: [PATCH 02/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20cmprocess=E6=95=B4=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cmprocess/base_fault_center.go | 12 +- .../publicfault/pub_fault_processor.go | 65 +------ .../cmprocess/uce/uce_fault_processor.go | 20 ++- .../uce_accompany_fault_processor.go | 20 +-- .../clusterd/pkg/common/constant/const.go | 12 ++ .../clusterd/pkg/common/constant/type.go | 33 +++- .../pkg/domain/faultdomain/fault_utils.go | 170 ++++++++++++++---- 7 files changed, 215 insertions(+), 117 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go index bfabccc84..e092ae093 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go @@ -10,6 +10,7 @@ import ( "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" + "clusterd/pkg/domain/faultdomain" "clusterd/pkg/domain/faultdomain/cmmanager" ) @@ -46,17 +47,16 @@ func (baseCenter *baseFaultCenter[T]) Process() { } baseCenter.lastProcessTime = currentTime updateOriginalCm := baseCenter.updateOriginalCm() - baseCenter.setProcessingCm(baseCenter.getOriginalCm()) + advanceCm := faultdomain.GetAdvanceFaultCm(baseCenter.getOriginalCm()) for _, processor := range baseCenter.processorList { - processingCm := baseCenter.getProcessingCm() info := constant.OneConfigmapContent[T]{ - AllConfigmap: processingCm, + AllConfigmap: advanceCm, UpdateConfigmap: updateOriginalCm, } - processingCm = processor.Process(info).(constant.OneConfigmapContent[T]).AllConfigmap - baseCenter.setProcessingCm(processingCm) + advanceCm = processor.Process(info).(constant.OneConfigmapContent[T]).AllConfigmap } - if baseCenter.setProcessedCm(baseCenter.getProcessingCm()) { + originalFaultCm := faultdomain.AdvanceFaultCmToOriginalFaultCm(advanceCm) + if baseCenter.setProcessedCm(originalFaultCm) { baseCenter.notifySubscriber() } } diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go index 9e8f8156e..ea6ea0ff1 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go @@ -4,13 +4,11 @@ package publicfault import ( - "encoding/json" "strconv" "strings" "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" - "clusterd/pkg/common/util" "clusterd/pkg/domain/faultdomain" "clusterd/pkg/domain/publicfault" ) @@ -20,13 +18,13 @@ var PubFaultProcessor *pubFaultProcessor type pubFaultProcessor struct { pubFaultInfo map[string]*constant.PubFaultCache - devCMInfo *constant.DeviceInfo + devCMInfo *constant.AdvanceDeviceFaultCm } func init() { PubFaultProcessor = &pubFaultProcessor{ pubFaultInfo: make(map[string]*constant.PubFaultCache), - devCMInfo: &constant.DeviceInfo{}, + devCMInfo: &constant.AdvanceDeviceFaultCm{}, } } @@ -54,38 +52,19 @@ func (p *pubFaultProcessor) Process(info any) any { continue } p.pubFaultInfo = pubFaults - p.devCMInfo = devCMInfo + p.devCMInfo = devCMInfo.(*constant.AdvanceDeviceFaultCm) p.faultJoin(nodeName) } processContent.AllConfigmap = deviceInfos return processContent } -func (p *pubFaultProcessor) faultJoin(nodeName string) []constant.DeviceFault { - faultKey, faultList := faultdomain.GetFaultListInfo(p.devCMInfo) - var dpCMFaults []constant.DeviceFault - if err := json.Unmarshal([]byte(faultList), &dpCMFaults); err != nil { - hwlog.RunLog.Errorf("unmarshal fault list for node <%s> failed, error: %v", nodeName, err) - return nil - } - devType := faultdomain.GetDeviceType(p.devCMInfo) - - var newFaultList []constant.DeviceFault - if err := util.DeepCopy(&newFaultList, &dpCMFaults); err != nil { - hwlog.RunLog.Errorf("deep copy device cm faults failed, err: %v", err) - return nil - } - - dpNPUFaultLevelMap := make(map[string]string) - for _, dpCMFault := range newFaultList { - dpNPUFaultLevelMap[dpCMFault.NPUName] = dpCMFault.FaultLevel - } - +func (p *pubFaultProcessor) faultJoin(nodeName string) { for _, pubFaultCache := range p.pubFaultInfo { // add public fault to fault list - pubFaultCache.FaultDevNames = convertNPUIdsToName(pubFaultCache.FaultDevIds, devType) + pubFaultCache.FaultDevNames = convertNPUIdsToName(pubFaultCache.FaultDevIds, p.devCMInfo.ServerType) for _, faultDevName := range pubFaultCache.FaultDevNames { - newFaultList = append(newFaultList, constant.DeviceFault{ + fault := constant.DeviceFault{ FaultType: constant.PublicFaultType, NPUName: faultDevName, LargeModelFaultLevel: pubFaultCache.FaultLevel, @@ -97,32 +76,11 @@ func (p *pubFaultProcessor) faultJoin(nodeName string) []constant.DeviceFault { FaultTime: pubFaultCache.FaultTime, FaultLevel: pubFaultCache.FaultLevel, }}, - }) - } - - for _, pubFaultDev := range pubFaultCache.FaultDevNames { - // public fault id does not exist in dp cm - faultLevel, ok := dpNPUFaultLevelMap[pubFaultDev] - if !ok { - p.updateAvailAndUnhealthy(pubFaultCache.FaultLevel, pubFaultDev) - continue } - // public fault id existed in dp cm - seriousLevel := faultdomain.GetMostSeriousFaultLevel([]string{pubFaultCache.FaultLevel, faultLevel}) - p.updateAvailAndUnhealthy(seriousLevel, pubFaultDev) + p.devCMInfo.FaultDeviceList[nodeName] = append(p.devCMInfo.FaultDeviceList[nodeName], fault) } } - p.updateFaultList(newFaultList, faultKey) - return newFaultList -} - -func (p *pubFaultProcessor) updateFaultList(newFaultList []constant.DeviceFault, faultKey string) { - faultListData, err := json.Marshal(newFaultList) - if err != nil { - hwlog.RunLog.Errorf("marshal device fault list failed, error: %v", err) - return - } - p.devCMInfo.DeviceList[faultKey] = string(faultListData) + faultdomain.FixUnhealthyInfo(p.devCMInfo) } func convertNPUIdsToName(phyIds []int32, devType string) []string { @@ -133,10 +91,3 @@ func convertNPUIdsToName(phyIds []int32, devType string) []string { } return npuNames } - -func (p *pubFaultProcessor) updateAvailAndUnhealthy(faultLevel string, NPUName string) { - if faultLevel == constant.SeparateNPU { - faultdomain.DelDevFromAvailList(p.devCMInfo, []string{NPUName}) - faultdomain.AddDevFromUnhealthyList(p.devCMInfo, []string{NPUName}) - } -} diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go index be65346d0..f100494b9 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go @@ -31,7 +31,7 @@ type uceFaultProcessor struct { // node->DeviceName->uceDeviceInfo uceDeviceOfNode map[string]constant.UceNodeInfo jobServerInfoMap constant.JobServerInfoMap - nodeDeviceCmMap map[string]constant.AdvanceDeviceFaultCm + nodeDeviceCmMap map[string]*constant.AdvanceDeviceFaultCm } func init() { @@ -77,11 +77,14 @@ func (processor *uceFaultProcessor) Process(info any) any { hwlog.RunLog.Errorf("%v cannot convert to DeviceInfo", info) return info } - deviceInfos := processContent.AllConfigmap processor.jobServerInfoMap = job.GetJobServerInfoMap() - processor.nodeDeviceCmMap = faultdomain.GetAdvanceDeviceCmForNodeMap(deviceInfos) - hwlog.RunLog.Debugf("current deviceInfos %s", util.ObjToString(deviceInfos)) + for key, val := range processContent.AllConfigmap { + processor.nodeDeviceCmMap[key], ok = val.(*constant.AdvanceDeviceFaultCm) + if !ok { + hwlog.RunLog.Errorf("processContent.AllConfigmap's value type is not AdvanceDeviceFaultCm") + } + } hwlog.RunLog.Debugf("current nodeDeviceCmMap %s", util.ObjToString(processor.nodeDeviceCmMap)) processor.uceDeviceOfNode = processor.getUceDeviceOfNodes() @@ -94,10 +97,8 @@ func (processor *uceFaultProcessor) Process(info any) any { hwlog.RunLog.Debugf("currentTime %d", currentTime) processor.processUceFaultInfo(currentTime) - faultdomain.AdvanceDeviceCmForNodeMapToString(processor.nodeDeviceCmMap, deviceInfos) - hwlog.RunLog.Debugf("result deviceInfos %s", util.ObjToString(deviceInfos)) - processContent.AllConfigmap = deviceInfos + hwlog.RunLog.Debugf("result deviceInfos %s", util.ObjToString(processContent.AllConfigmap)) return processContent } @@ -109,7 +110,7 @@ func (processor *uceFaultProcessor) processUceFaultInfo(currentTime int64) { } func (processor *uceFaultProcessor) processEachNodeUceFaultInfo( - nodeName string, deviceInfo constant.AdvanceDeviceFaultCm, currentTime int64) constant.AdvanceDeviceFaultCm { + nodeName string, deviceInfo *constant.AdvanceDeviceFaultCm, currentTime int64) *constant.AdvanceDeviceFaultCm { for _, uceJob := range processor.uceDevicesOfUceJob { for deviceName, uceDevice := range uceJob.UceNode[nodeName].DeviceInfo { log := fmt.Sprintf("filter uce device: %s on node %s, "+ @@ -222,7 +223,8 @@ func (processor *uceFaultProcessor) getUceDevicesForUceTolerateJobs() map[string return uceJobs } -func (processor *uceFaultProcessor) getUceFaultDevices(nodeName string, deviceInfo constant.AdvanceDeviceFaultCm) constant.UceNodeInfo { +func (processor *uceFaultProcessor) getUceFaultDevices( + nodeName string, deviceInfo *constant.AdvanceDeviceFaultCm) constant.UceNodeInfo { nodeInfo := constant.UceNodeInfo{ NodeName: nodeName, DeviceInfo: make(map[string]constant.UceDeviceInfo), diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go index a4179076e..51195ff96 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go @@ -27,7 +27,7 @@ type uceAccompanyFaultProcessor struct { uceAccompanyFaultQue map[string]map[string][]constant.DeviceFault // uceFaultTime uceFaultTime map[string]map[string]int64 - deviceCmForNodeMap map[string]constant.AdvanceDeviceFaultCm + deviceCmForNodeMap map[string]*constant.AdvanceDeviceFaultCm } func init() { @@ -45,7 +45,7 @@ func (processor *uceAccompanyFaultProcessor) uceAccompanyFaultInQue() { } func (processor *uceAccompanyFaultProcessor) uceAccompanyFaultInQueForNode( - nodeName string, deviceInfo constant.AdvanceDeviceFaultCm) { + nodeName string, deviceInfo *constant.AdvanceDeviceFaultCm) { if _, ok := processor.uceAccompanyFaultQue[nodeName]; !ok { processor.uceAccompanyFaultQue[nodeName] = make(map[string][]constant.DeviceFault) } @@ -169,19 +169,19 @@ func (processor *uceAccompanyFaultProcessor) Process(info any) any { hwlog.RunLog.Errorf("%v cannot convert to DeviceInfo", info) return info } - deviceInfos := processContent.AllConfigmap - processor.deviceCmForNodeMap = faultdomain.GetAdvanceDeviceCmForNodeMap(deviceInfos) - hwlog.RunLog.Debugf("current deviceInfos: %s", util.ObjToString(deviceInfos)) - hwlog.RunLog.Debugf("current deviceCmForNodeMap: %s", util.ObjToString(processor.deviceCmForNodeMap)) + for key, val := range processContent.AllConfigmap { + processor.deviceCmForNodeMap[key], ok = val.(*constant.AdvanceDeviceFaultCm) + if !ok { + hwlog.RunLog.Errorf("processContent.AllConfigmap's value type is not AdvanceDeviceFaultCm") + } + } + + hwlog.RunLog.Debugf("current deviceInfos: %s", util.ObjToString(processContent.AllConfigmap)) processor.uceAccompanyFaultInQue() hwlog.RunLog.Debugf("current uceAccompanyFaultQue: %s", util.ObjToString(processor.uceAccompanyFaultQue)) currentTime := time.Now().UnixMilli() processor.filterFaultInfos(currentTime) - faultdomain.AdvanceDeviceCmForNodeMapToString(processor.deviceCmForNodeMap, deviceInfos) - - hwlog.RunLog.Debugf("uceAccompanyFaultProcessor result: %s", util.ObjToString(deviceInfos)) - processContent.AllConfigmap = deviceInfos return processContent } diff --git a/component/clusterd/pkg/common/constant/const.go b/component/clusterd/pkg/common/constant/const.go index e67f0e2f1..b2bbd079e 100644 --- a/component/clusterd/pkg/common/constant/const.go +++ b/component/clusterd/pkg/common/constant/const.go @@ -179,6 +179,18 @@ const ( SubHealthFault = "SubHealthFault" ) +// About cm keys +const ( + // CmRecoveringSuffix Recovering Suffix + CmRecoveringSuffix = "-Recovering" + // CmCardUnhealthySuffix CardUnhealthy Suffix + CmCardUnhealthySuffix = "-CardUnhealthy" + // CmCardNetworkUnhealthySuffix NetworkUnhealthy Suffix + CmCardNetworkUnhealthySuffix = "-CardNetworkUnhealthy" + // CmFaultListSuffix FaultList Suffix + CmFaultListSuffix = "-Fault" +) + // support device type const ( Ascend910 = "Ascend910" diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index e3a716c4b..c4f6c2073 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -4,6 +4,7 @@ package constant import ( + "ascend-common/api" "ascend-common/common-utils/hwlog" ) @@ -220,11 +221,38 @@ type AdvanceDeviceFaultCm struct { SuperPodID int32 ServerIndex int32 FaultDeviceList map[string][]DeviceFault + AvailableDevices []string + Recovering []string CardUnHealthy []string NetworkUnhealthy []string UpdateTime int64 } +// GetCmName return cm name +func (cm *AdvanceDeviceFaultCm) GetCmName() string { + return cm.CmName +} + +// GetRecoveringKey return cm RecoveringKey +func (cm *AdvanceDeviceFaultCm) GetRecoveringKey() string { + return api.ResourceNamePrefix + cm.ServerType + CmRecoveringSuffix +} + +// GetCardUnHealthyKey return cm CardUnHealthyKey +func (cm *AdvanceDeviceFaultCm) GetCardUnHealthyKey() string { + return api.ResourceNamePrefix + cm.ServerType + CmCardUnhealthySuffix +} + +// GetNetworkUnhealthyKey return cm NetworkUnhealthyKey +func (cm *AdvanceDeviceFaultCm) GetNetworkUnhealthyKey() string { + return api.ResourceNamePrefix + cm.ServerType + CmCardNetworkUnhealthySuffix +} + +// GetFaultDeviceListKey return cm FaultDeviceListKey +func (cm *AdvanceDeviceFaultCm) GetFaultDeviceListKey() string { + return api.ResourceNamePrefix + cm.ServerType + CmFaultListSuffix +} + // InformerCmItem informer configmap item of queue or buffer type InformerCmItem[T ConfigMapInterface] struct { IsAdd bool @@ -233,7 +261,7 @@ type InformerCmItem[T ConfigMapInterface] struct { // OneConfigmapContent contains one kind of configmap content type OneConfigmapContent[T ConfigMapInterface] struct { - AllConfigmap map[string]T + AllConfigmap map[string]AdvanceFaultConfigmap[T] UpdateConfigmap []InformerCmItem[T] } @@ -465,5 +493,8 @@ type FaultNum struct { PubFaultNum int `json:"publicFaultNum"` } +// AdvanceFaultConfigmap more usable faultConfigmap type AdvanceFaultConfigmap[T ConfigMapInterface] interface { + // GetCmName return cm name + GetCmName() string } diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 2c601610e..9c9e023bc 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -66,14 +66,14 @@ func GetAdvanceFaultCm[T constant.ConfigMapInterface]( return result } -func GetAdvanceFaultForNode(cmForNode constant.ConfigMapInterface) constant.AdvanceFaultConfigmap[constant.ConfigMapInterface] { - switch cmForNode.(type) { +func GetAdvanceFaultForNode[T constant.ConfigMapInterface](cmForNode T) constant.AdvanceFaultConfigmap[T] { + switch cm := any(cmForNode).(type) { case *constant.DeviceInfo: - return GetAdvanceDeviceCm(cmForNode.(*constant.DeviceInfo)) + return GetAdvanceDeviceCm(cm) case *constant.NodeInfo: - return cmForNode + return cm case *constant.SwitchInfo: - return cmForNode + return cm } hwlog.RunLog.Errorf("cmForNode type is not support.") return nil @@ -81,8 +81,8 @@ func GetAdvanceFaultForNode(cmForNode constant.ConfigMapInterface) constant.Adva // GetAdvanceDeviceCmForNodeMap get advance device cm for node map func GetAdvanceDeviceCmForNodeMap( - deviceInfoCms map[string]*constant.DeviceInfo) map[string]constant.AdvanceDeviceFaultCm { - advanceDeviceCmForNodeMap := make(map[string]constant.AdvanceDeviceFaultCm) + deviceInfoCms map[string]*constant.DeviceInfo) map[string]*constant.AdvanceDeviceFaultCm { + advanceDeviceCmForNodeMap := make(map[string]*constant.AdvanceDeviceFaultCm) for _, deviceInfo := range deviceInfoCms { advanceDeviceCmForNodeMap[CmNameToNodeName(deviceInfo.CmName)] = GetAdvanceDeviceCm(deviceInfo) } @@ -90,8 +90,8 @@ func GetAdvanceDeviceCmForNodeMap( } // GetAdvanceDeviceCm deviceName->faults -func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) constant.AdvanceDeviceFaultCm { - advanceDeviceCm := constant.AdvanceDeviceFaultCm{ +func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) *constant.AdvanceDeviceFaultCm { + advanceDeviceCm := &constant.AdvanceDeviceFaultCm{ CmName: devInfo.CmName, SuperPodID: devInfo.SuperPodID, ServerIndex: devInfo.ServerIndex, @@ -137,9 +137,34 @@ func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) constant.AdvanceDeviceFaul } advanceDeviceCm.CardUnHealthy = cardList } + advanceDeviceCm.AvailableDevices = getAvailableDevices(devInfo) + advanceDeviceCm.Recovering = getRecoveringDevList(devInfo) return advanceDeviceCm } +func getAvailableDevices(devInfo *constant.DeviceInfo) []string { + var cardList []string + _, devs := GetAvailDevListInfo(devInfo) + if len(devs) == 0 { + cardList = make([]string, 0) + } else { + cardList = strings.Split(devs, ",") + } + return cardList +} + +func getRecoveringDevList(devInfo *constant.DeviceInfo) []string { + var cardList []string + if recoveringList, ok := devInfo.DeviceList[GetRecoveringKey(devInfo)]; ok { + if len(recoveringList) == 0 { + cardList = make([]string, 0) + } else { + cardList = strings.Split(recoveringList, ",") + } + } + return cardList +} + // GetDeviceType get device type from device info func GetDeviceType(devInfo *constant.DeviceInfo) string { for key, _ := range devInfo.DeviceList { @@ -278,32 +303,62 @@ func AddFaultIntoFaultMap(faultMap map[string][]constant.DeviceFault, return faultMap } -// AdvanceDeviceCmForNodeMapToString convert advance device cm to original format -func AdvanceDeviceCmForNodeMapToString( - advanceDeviceCm map[string]constant.AdvanceDeviceFaultCm, orgDeviceCm map[string]*constant.DeviceInfo) { - for nodeName, advanceCm := range advanceDeviceCm { - advanceCm = mergeCodeAndRemoveUnhealthy(advanceCm) - cmName := nodeNameToCmName(nodeName) - deviceInfo, found := orgDeviceCm[cmName] - if !found { - continue - } - faultListKey := GetFaultListKey(deviceInfo) - if faultListKey != "" { - orgDeviceCm[cmName].DeviceList[faultListKey] = - util.ObjToString(faultMapToFaultList(advanceCm.FaultDeviceList)) - } +func AdvanceFaultCmToOriginalFaultCm[T constant.ConfigMapInterface]( + advanceFaultCm map[string]constant.AdvanceFaultConfigmap[T]) map[string]T { + result := make(map[string]T) + for _, advanceCmForNode := range advanceFaultCm { + result[advanceCmForNode.GetCmName()] = AdvanceFaultCmToOriginalCmForNode(advanceCmForNode).(T) + } + return result +} - networkUnhealthyKey := GetNetworkUnhealthyKey(deviceInfo) - if networkUnhealthyKey != "" { - orgDeviceCm[cmName].DeviceList[networkUnhealthyKey] = strings.Join(advanceCm.NetworkUnhealthy, ",") - } +func AdvanceFaultCmToOriginalCmForNode[T constant.ConfigMapInterface]( + advanceCmForNode constant.AdvanceFaultConfigmap[T]) constant.ConfigMapInterface { + switch cm := advanceCmForNode.(type) { + case *constant.AdvanceDeviceFaultCm: + return AdvanceDeviceFaultCmToOriginalCmForNode(cm) + case *constant.SwitchInfo: + return cm + case *constant.NodeInfo: + return cm + } + hwlog.RunLog.Errorf("AdvanceFaultCmToOriginalCmForNode don't support this type.") + return nil +} - cardUnhealthyKey := GetCardUnhealthyKey(deviceInfo) - if cardUnhealthyKey != "" { - orgDeviceCm[cmName].DeviceList[cardUnhealthyKey] = strings.Join(advanceCm.CardUnHealthy, ",") - } +// AdvanceDeviceFaultCmToOriginalCmForNode convert advance device cm to original format +func AdvanceDeviceFaultCmToOriginalCmForNode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) *constant.DeviceInfo { + orgDeviceCm := &constant.DeviceInfo{ + DeviceInfoNoName: constant.DeviceInfoNoName{}, + CmName: advanceDeviceCm.CmName, + SuperPodID: advanceDeviceCm.SuperPodID, + ServerIndex: advanceDeviceCm.ServerIndex, + } + + FixUnhealthyInfo(advanceDeviceCm) + mergeCode(advanceDeviceCm) + + orgDeviceCm.DeviceList[advanceDeviceCm.GetFaultDeviceListKey()] = + util.ObjToString(faultMapToFaultList(advanceDeviceCm.FaultDeviceList)) + + orgDeviceCm.DeviceList[advanceDeviceCm.GetNetworkUnhealthyKey()] = "" + if len(advanceDeviceCm.NetworkUnhealthy) > 0 { + orgDeviceCm.DeviceList[advanceDeviceCm.GetNetworkUnhealthyKey()] = + strings.Join(advanceDeviceCm.NetworkUnhealthy, ",") } + + orgDeviceCm.DeviceList[advanceDeviceCm.GetCardUnHealthyKey()] = "" + if len(advanceDeviceCm.CardUnHealthy) > 0 { + orgDeviceCm.DeviceList[advanceDeviceCm.GetCardUnHealthyKey()] = + strings.Join(advanceDeviceCm.CardUnHealthy, ",") + } + + orgDeviceCm.DeviceList[advanceDeviceCm.GetRecoveringKey()] = "" + if len(advanceDeviceCm.Recovering) > 0 { + orgDeviceCm.DeviceList[advanceDeviceCm.GetRecoveringKey()] = + strings.Join(advanceDeviceCm.Recovering, ",") + } + return orgDeviceCm } func faultMapToFaultList(deviceFaultMap map[string][]constant.DeviceFault) []constant.DeviceFault { @@ -335,7 +390,7 @@ func isFaultDeletable(faults []constant.DeviceFault, faultTypes []string, faultL return true } -func mergeCodeAndRemoveUnhealthy(advanceDeviceCm constant.AdvanceDeviceFaultCm) constant.AdvanceDeviceFaultCm { +func removeUnhealthy(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { for deviceName, faults := range advanceDeviceCm.FaultDeviceList { deletableFaultLevels := []string{constant.NotHandleFault, constant.SubHealthFault} if isFaultDeletable(faults, []string{constant.CardUnhealthy, constant.PublicFaultType}, deletableFaultLevels) { @@ -346,6 +401,11 @@ func mergeCodeAndRemoveUnhealthy(advanceDeviceCm constant.AdvanceDeviceFaultCm) advanceDeviceCm.NetworkUnhealthy = util.DeleteStringSliceItem(advanceDeviceCm.NetworkUnhealthy, deviceName) hwlog.RunLog.Debugf("remove device %s from NetworkUnhealthy", deviceName) } + } +} + +func mergeCode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { + for deviceName, faults := range advanceDeviceCm.FaultDeviceList { if len(faults) == 0 { continue } @@ -356,7 +416,39 @@ func mergeCodeAndRemoveUnhealthy(advanceDeviceCm constant.AdvanceDeviceFaultCm) } advanceDeviceCm.FaultDeviceList[deviceName] = mergedFaults } - return advanceDeviceCm +} + +func addUnhealthy(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { + for deviceName, faults := range advanceDeviceCm.FaultDeviceList { + shouldAddInCardUnhealthy := false + shouldAddInCardNetworkUnhealthy := false + for _, fault := range faults { + if fault.FaultType == constant.CardUnhealthy || fault.FaultType == constant.PublicFaultType { + if fault.FaultLevel != constant.NormalNPU && fault.FaultLevel != constant.NotHandleFault { + shouldAddInCardUnhealthy = true + } + } + if fault.FaultType == constant.CardNetworkUnhealthy { + if fault.FaultLevel != constant.NormalNetwork && fault.FaultLevel != constant.NotHandleFault { + shouldAddInCardNetworkUnhealthy = true + } + } + } + if shouldAddInCardNetworkUnhealthy { + advanceDeviceCm.NetworkUnhealthy = append(advanceDeviceCm.NetworkUnhealthy, deviceName) + } + if shouldAddInCardUnhealthy { + advanceDeviceCm.CardUnHealthy = append(advanceDeviceCm.CardUnHealthy, deviceName) + } + if shouldAddInCardUnhealthy || shouldAddInCardNetworkUnhealthy { + advanceDeviceCm.AvailableDevices = util.DeleteStringSliceItem(advanceDeviceCm.AvailableDevices, deviceName) + } + } +} + +func FixUnhealthyInfo(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { + removeUnhealthy(advanceDeviceCm) + addUnhealthy(advanceDeviceCm) } // GetFaultListKey get FaultList key in DeviceInfo @@ -389,6 +481,16 @@ func GetCardUnhealthyKey(devInfo *constant.DeviceInfo) string { return "" } +// GetRecoveringKey get Card recovering key in DeviceInfo +func GetRecoveringKey(devInfo *constant.DeviceInfo) string { + for key, _ := range devInfo.DeviceList { + if strings.Contains(key, constant.NPUPreName) && strings.Contains(key, "-Recovering") { + return key + } + } + return "" +} + // GetFaultListInfo get fault list info func GetFaultListInfo(devCMInfo *constant.DeviceInfo) (string, string) { for faultKey, faultInfo := range devCMInfo.DeviceList { -- Gitee From afbf24194c0b7c7e01a5b8f125d5efff2718d0c8 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Wed, 16 Apr 2025 17:19:41 +0800 Subject: [PATCH 03/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20fault=20utils=E6=95=B4=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/domain/faultdomain/fault_utils.go | 179 +++++------------- 1 file changed, 52 insertions(+), 127 deletions(-) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 9c9e023bc..428e070e3 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -7,7 +7,6 @@ import ( "encoding/json" "fmt" "reflect" - "sort" "strings" "time" @@ -53,10 +52,7 @@ func CmNameToNodeName(cmName string) string { return strings.TrimPrefix(cmName, constant.DeviceInfoPrefix) } -func nodeNameToCmName(nodeName string) string { - return constant.DeviceInfoPrefix + nodeName -} - +// GetAdvanceFaultCm return more usable fault cm func GetAdvanceFaultCm[T constant.ConfigMapInterface]( cmInfos map[string]T) map[string]constant.AdvanceFaultConfigmap[T] { result := make(map[string]constant.AdvanceFaultConfigmap[T]) @@ -66,6 +62,7 @@ func GetAdvanceFaultCm[T constant.ConfigMapInterface]( return result } +// GetAdvanceFaultForNode return more usable fault cm for one node func GetAdvanceFaultForNode[T constant.ConfigMapInterface](cmForNode T) constant.AdvanceFaultConfigmap[T] { switch cm := any(cmForNode).(type) { case *constant.DeviceInfo: @@ -89,7 +86,7 @@ func GetAdvanceDeviceCmForNodeMap( return advanceDeviceCmForNodeMap } -// GetAdvanceDeviceCm deviceName->faults +// GetAdvanceDeviceCm return more usable device cm func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) *constant.AdvanceDeviceFaultCm { advanceDeviceCm := &constant.AdvanceDeviceFaultCm{ CmName: devInfo.CmName, @@ -98,13 +95,23 @@ func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) *constant.AdvanceDeviceFau UpdateTime: devInfo.UpdateTime, ServerType: GetDeviceType(devInfo), } - if faultList, ok := devInfo.DeviceList[GetFaultListKey(devInfo)]; ok { + advanceDeviceCm.FaultDeviceList = getFaultListInfo(devInfo) + advanceDeviceCm.NetworkUnhealthy = getNetworkUnhealthyCardList(devInfo) + advanceDeviceCm.CardUnHealthy = getCardUnHealthy(devInfo) + advanceDeviceCm.AvailableDevices = getAvailableDevices(devInfo) + advanceDeviceCm.Recovering = getRecoveringDevList(devInfo) + return advanceDeviceCm +} + +func getFaultListInfo(devInfo *constant.DeviceInfo) map[string][]constant.DeviceFault { + _, faultList := getFaultListString(devInfo) + if len(faultList) > 0 { var devicesFault []constant.DeviceFault err := json.Unmarshal([]byte(faultList), &devicesFault) if err != nil { hwlog.RunLog.Errorf("get fault list for node %v failed. "+ "Json unmarshall exception: %v", devInfo.CmName, err) - return advanceDeviceCm + return make(map[string][]constant.DeviceFault) } deviceFaultMap := make(map[string][]constant.DeviceFault) for _, deviceFault := range devicesFault { @@ -117,52 +124,43 @@ func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) *constant.AdvanceDeviceFau deviceFaults := splitDeviceFault(deviceFault, CmNameToNodeName(devInfo.CmName)) deviceFaultMap[deviceFault.NPUName] = append(deviceFaultMap[deviceFault.NPUName], deviceFaults...) } - advanceDeviceCm.FaultDeviceList = deviceFaultMap + return deviceFaultMap } else { hwlog.RunLog.Infof("get fault list for node %v failed. fault list does not exist", devInfo.CmName) + return make(map[string][]constant.DeviceFault) } - if networkUnhealthyCardList, ok := devInfo.DeviceList[GetNetworkUnhealthyKey(devInfo)]; ok { - cardList := strings.Split(networkUnhealthyCardList, ",") - advanceDeviceCm.NetworkUnhealthy = cardList - } else { - hwlog.RunLog.Infof("get NetworkUnhealthy list for node %v failed. fault list does not exist", - devInfo.CmName) +} + +func getCardUnHealthy(devInfo *constant.DeviceInfo) []string { + _, info := getCardUnhealthyString(devInfo) + if len(info) == 0 { + return make([]string, 0) } - if cardUnhealthyCardList, ok := devInfo.DeviceList[GetCardUnhealthyKey(devInfo)]; ok { - var cardList []string - if len(cardUnhealthyCardList) == 0 { - cardList = make([]string, 0) - } else { - cardList = strings.Split(cardUnhealthyCardList, ",") - } - advanceDeviceCm.CardUnHealthy = cardList + return strings.Split(info, ",") +} + +func getNetworkUnhealthyCardList(devInfo *constant.DeviceInfo) []string { + _, info := getNetworkUnhealthyString(devInfo) + if len(info) == 0 { + return make([]string, 0) } - advanceDeviceCm.AvailableDevices = getAvailableDevices(devInfo) - advanceDeviceCm.Recovering = getRecoveringDevList(devInfo) - return advanceDeviceCm + return strings.Split(info, ",") } func getAvailableDevices(devInfo *constant.DeviceInfo) []string { - var cardList []string - _, devs := GetAvailDevListInfo(devInfo) - if len(devs) == 0 { - cardList = make([]string, 0) - } else { - cardList = strings.Split(devs, ",") + _, info := getAvailDevListString(devInfo) + if len(info) == 0 { + return make([]string, 0) } - return cardList + return strings.Split(info, ",") } func getRecoveringDevList(devInfo *constant.DeviceInfo) []string { - var cardList []string - if recoveringList, ok := devInfo.DeviceList[GetRecoveringKey(devInfo)]; ok { - if len(recoveringList) == 0 { - cardList = make([]string, 0) - } else { - cardList = strings.Split(recoveringList, ",") - } + _, info := getRecoveringString(devInfo) + if len(info) == 0 { + return make([]string, 0) } - return cardList + return strings.Split(info, ",") } // GetDeviceType get device type from device info @@ -451,58 +449,27 @@ func FixUnhealthyInfo(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { addUnhealthy(advanceDeviceCm) } -// GetFaultListKey get FaultList key in DeviceInfo -func GetFaultListKey(devInfo *constant.DeviceInfo) string { - for key, _ := range devInfo.DeviceList { - if strings.Contains(key, constant.NPUPreName) && strings.Contains(key, "-Fault") { - return key - } - } - return "" -} - -// GetNetworkUnhealthyKey get networkUnhealthy key in DeviceInfo -func GetNetworkUnhealthyKey(devInfo *constant.DeviceInfo) string { - for key, _ := range devInfo.DeviceList { - if strings.Contains(key, constant.NPUPreName) && strings.Contains(key, "-NetworkUnhealthy") { - return key - } - } - return "" +func getNetworkUnhealthyString(devInfo *constant.DeviceInfo) (string, string) { + key := api.ResourceNamePrefix + GetDeviceType(devInfo) + constant.CmCardNetworkUnhealthySuffix + return key, devInfo.DeviceList[key] } -// GetCardUnhealthyKey get CardUnhealthy key in DeviceInfo -func GetCardUnhealthyKey(devInfo *constant.DeviceInfo) string { - for key, _ := range devInfo.DeviceList { - if strings.Contains(key, constant.NPUPreName) && strings.Contains(key, "-Unhealthy") { - return key - } - } - return "" +func getCardUnhealthyString(devInfo *constant.DeviceInfo) (string, string) { + key := api.ResourceNamePrefix + GetDeviceType(devInfo) + constant.CmCardUnhealthySuffix + return key, devInfo.DeviceList[key] } -// GetRecoveringKey get Card recovering key in DeviceInfo -func GetRecoveringKey(devInfo *constant.DeviceInfo) string { - for key, _ := range devInfo.DeviceList { - if strings.Contains(key, constant.NPUPreName) && strings.Contains(key, "-Recovering") { - return key - } - } - return "" +func getRecoveringString(devInfo *constant.DeviceInfo) (string, string) { + key := api.ResourceNamePrefix + GetDeviceType(devInfo) + constant.CmRecoveringSuffix + return key, devInfo.DeviceList[key] } -// GetFaultListInfo get fault list info -func GetFaultListInfo(devCMInfo *constant.DeviceInfo) (string, string) { - for faultKey, faultInfo := range devCMInfo.DeviceList { - if strings.Contains(faultKey, constant.NPUPreName) && strings.Contains(faultKey, "-Fault") { - return faultKey, faultInfo - } - } - return "", "" +func getFaultListString(devInfo *constant.DeviceInfo) (string, string) { + key := api.ResourceNamePrefix + GetDeviceType(devInfo) + constant.CmFaultListSuffix + return key, devInfo.DeviceList[key] } -// GetAvailDevListInfo get available device list info -func GetAvailDevListInfo(devCMInfo *constant.DeviceInfo) (string, string) { +func getAvailDevListString(devCMInfo *constant.DeviceInfo) (string, string) { availKey := api.ResourceNamePrefix + GetDeviceType(devCMInfo) availDevList, ok := devCMInfo.DeviceList[availKey] if !ok { @@ -511,48 +478,6 @@ func GetAvailDevListInfo(devCMInfo *constant.DeviceInfo) (string, string) { return availKey, availDevList } -// DelDevFromAvailList delete device from available device list -func DelDevFromAvailList(devCMInfo *constant.DeviceInfo, npuNames []string) { - availKey, availList := GetAvailDevListInfo(devCMInfo) - if len(availList) == 0 { - return - } - splitList := strings.Split(availList, ",") - for _, npuName := range npuNames { - splitList = util.DeleteStringSliceItem(splitList, npuName) - } - devCMInfo.DeviceList[availKey] = strings.Join(splitList, ",") - return -} - -// GetUnhealthyListInfo get unhealthy list info -func GetUnhealthyListInfo(devCMInfo *constant.DeviceInfo) (string, []string) { - for unHealthyKey, unHealthyCards := range devCMInfo.DeviceList { - if strings.Contains(unHealthyKey, constant.NPUPreName) && strings.Contains(unHealthyKey, "-Unhealthy") { - var cardList []string - if len(unHealthyCards) == 0 { - cardList = make([]string, 0) - } else { - cardList = strings.Split(unHealthyCards, ",") - } - return unHealthyKey, cardList - } - } - return "", []string{} -} - -// AddDevFromUnhealthyList add device from unhealthy list -func AddDevFromUnhealthyList(devCMInfo *constant.DeviceInfo, npuNames []string) { - unHealthyKey, unHealthyList := GetUnhealthyListInfo(devCMInfo) - for _, npuName := range npuNames { - if !util.IsSliceContain(npuName, unHealthyList) { - unHealthyList = append(unHealthyList, npuName) - } - } - sort.Strings(unHealthyList) - devCMInfo.DeviceList[unHealthyKey] = strings.Join(unHealthyList, ",") -} - // IsUceFault check faultCode is uce func IsUceFault(faultCode string) bool { if strings.Contains(faultCode, constant.UceFaultCode) { -- Gitee From b02393cbdd3bc57edfaa754cde18dccf55406e0e Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Fri, 18 Apr 2025 16:21:16 +0800 Subject: [PATCH 04/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20cmprocess=20=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=95=B4=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../publicfault/pub_fault_processor.go | 6 +-- .../publicfault/pub_fault_processor_test.go | 44 ++++++++++++++----- .../cmprocess/uce/uce_fault_processor_test.go | 17 ++++--- .../uce_accompany_fault_processor.go | 6 ++- .../uce_accompany_fault_processor_test.go | 16 +++---- .../faultrank/job_fault_rank_processor.go | 4 +- .../job_fault_rank_processor_test.go | 6 +-- .../relation_fault_process_test.go | 10 ++--- .../relationfault/relation_fault_processor.go | 2 +- .../pkg/domain/faultdomain/fault_utils.go | 12 +++-- .../domain/faultdomain/fault_utils_test.go | 16 ------- .../resource/pub_fault_processor_test.yaml | 8 ++-- 12 files changed, 82 insertions(+), 65 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go index ea6ea0ff1..dec09d1f8 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go @@ -53,13 +53,13 @@ func (p *pubFaultProcessor) Process(info any) any { } p.pubFaultInfo = pubFaults p.devCMInfo = devCMInfo.(*constant.AdvanceDeviceFaultCm) - p.faultJoin(nodeName) + p.faultJoin() } processContent.AllConfigmap = deviceInfos return processContent } -func (p *pubFaultProcessor) faultJoin(nodeName string) { +func (p *pubFaultProcessor) faultJoin() { for _, pubFaultCache := range p.pubFaultInfo { // add public fault to fault list pubFaultCache.FaultDevNames = convertNPUIdsToName(pubFaultCache.FaultDevIds, p.devCMInfo.ServerType) @@ -77,7 +77,7 @@ func (p *pubFaultProcessor) faultJoin(nodeName string) { FaultLevel: pubFaultCache.FaultLevel, }}, } - p.devCMInfo.FaultDeviceList[nodeName] = append(p.devCMInfo.FaultDeviceList[nodeName], fault) + p.devCMInfo.FaultDeviceList[faultDevName] = append(p.devCMInfo.FaultDeviceList[faultDevName], fault) } } faultdomain.FixUnhealthyInfo(p.devCMInfo) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go index 318e02597..c3f7d52aa 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go @@ -4,11 +4,14 @@ package publicfault import ( + "sort" "testing" "github.com/smartystreets/goconvey/convey" + "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" + "clusterd/pkg/common/util" "clusterd/pkg/domain/faultdomain" "clusterd/pkg/domain/publicfault" ) @@ -50,7 +53,7 @@ func TestProcessor(t *testing.T) { func testNilCache() { resetFaultCache() ori := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: oriDevInfo1, + AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1), UpdateConfigmap: nil, } res := PubFaultProcessor.Process(ori) @@ -68,7 +71,7 @@ func testNodeNameInvalid() { resetFaultCache() publicfault.PubFaultCache.AddPubFaultToCache(&testCacheData, testNodeName3, faultKey1) content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: oriDevInfo1, + AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1), UpdateConfigmap: nil, } exp := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) @@ -79,13 +82,15 @@ func testDiff() { resetFaultCache() publicfault.PubFaultCache.AddPubFaultToCache(&testCacheData, testNodeName1, faultKey1) content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: oriDevInfo1, + AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1), UpdateConfigmap: nil, } resContent := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) - ori := faultdomain.GetAdvanceDeviceCmForNodeMap(resContent.AllConfigmap) - exp := faultdomain.GetAdvanceDeviceCmForNodeMap(expDeviceInfo1) - convey.So(ori, convey.ShouldResemble, exp) + sortDeviceFaultList(resContent.AllConfigmap) + want := faultdomain.GetAdvanceFaultCm(expDeviceInfo1) + sortDeviceFaultList(want) + result := resContent.AllConfigmap + convey.So(result, convey.ShouldResemble, want) } func testCommon() { @@ -94,13 +99,17 @@ func testCommon() { testCacheData.FaultDevIds = []int32{0, card5} publicfault.PubFaultCache.AddPubFaultToCache(&testCacheData, testNodeName2, faultKey2) content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: oriDevInfo2, + AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo2), UpdateConfigmap: nil, } resContent := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) - res := faultdomain.GetAdvanceDeviceCmForNodeMap(resContent.AllConfigmap) - exp := faultdomain.GetAdvanceDeviceCmForNodeMap(expDeviceInfo2) - convey.So(res, convey.ShouldResemble, exp) + hwlog.RunLog.Infof(util.ObjToString(resContent.AllConfigmap)) + hwlog.RunLog.Infof(util.ObjToString(faultdomain.GetAdvanceFaultCm(expDeviceInfo2))) + sortDeviceFaultList(resContent.AllConfigmap) + result := resContent.AllConfigmap + want := faultdomain.GetAdvanceFaultCm(expDeviceInfo2) + sortDeviceFaultList(want) + convey.So(result, convey.ShouldResemble, want) } func resetFaultCache() { @@ -108,3 +117,18 @@ func resetFaultCache() { delete(publicfault.PubFaultCache.GetPubFault(), nodeName) } } + +func sortDeviceFaultList(advanceFaultCm map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo]) { + for _, deviceCm := range advanceFaultCm { + advanceDeviceCm := deviceCm.(*constant.AdvanceDeviceFaultCm) + for _, fault := range advanceDeviceCm.FaultDeviceList { + sort.Slice(fault, func(i, j int) bool { + return util.MakeDataHash(fault[i]) < util.MakeDataHash(fault[j]) + }) + } + sort.Strings(advanceDeviceCm.CardUnHealthy) + sort.Strings(advanceDeviceCm.NetworkUnhealthy) + sort.Strings(advanceDeviceCm.Recovering) + sort.Strings(advanceDeviceCm.AvailableDevices) + } +} diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go index 3a29a4140..078ba5506 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go @@ -329,11 +329,10 @@ func TestUceFaultProcessorProcessUceFaultInfo(t *testing.T) { UceProcessor.uceDevicesOfUceJob = UceProcessor.getUceDevicesForUceTolerateJobs() currentTime := 109 * time.Second.Milliseconds() UceProcessor.processUceFaultInfo(currentTime) - faultdomain.AdvanceDeviceCmForNodeMapToString(UceProcessor.nodeDeviceCmMap, cmDeviceInfos) - result := faultdomain.GetAdvanceDeviceCmForNodeMap(cmDeviceInfos) + result := UceProcessor.nodeDeviceCmMap want := faultdomain.GetAdvanceDeviceCmForNodeMap(expectProcessedDeviceInfos) if !reflect.DeepEqual(result, want) { - t.Errorf("processUceFaultInfo() = %v, want %v", + t.Errorf("result:\n%v\n\nwant:\n%v", util.ObjToString(result), util.ObjToString(want)) } }) @@ -354,11 +353,10 @@ func TestUceFaultProcessorScenario1(t *testing.T) { UceProcessor.uceDevicesOfUceJob = UceProcessor.getUceDevicesForUceTolerateJobs() currentTime := 100 * time.Second.Milliseconds() UceProcessor.processUceFaultInfo(currentTime) - faultdomain.AdvanceDeviceCmForNodeMapToString(UceProcessor.nodeDeviceCmMap, cmDeviceInfos) - result := faultdomain.GetAdvanceDeviceCmForNodeMap(cmDeviceInfos) + result := UceProcessor.nodeDeviceCmMap want := faultdomain.GetAdvanceDeviceCmForNodeMap(expectProcessedDeviceInfos) if !reflect.DeepEqual(result, want) { - t.Errorf("processUceFaultInfo() = %v, want %v", + t.Errorf("processUceFaultInfo() = %v, \n\nwant %v", util.ObjToString(result), util.ObjToString(want)) } }) @@ -371,8 +369,9 @@ func TestUceFaultProcessorScenario2(t *testing.T) { if testFileErr != nil { t.Errorf("init data failed. %v", testFileErr) } + content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: cmDeviceInfos, + AllConfigmap: faultdomain.GetAdvanceFaultCm(cmDeviceInfos), UpdateConfigmap: []constant.InformerCmItem[*constant.DeviceInfo]{ { IsAdd: false, @@ -397,8 +396,8 @@ func TestUceFaultProcessorScenario2(t *testing.T) { }() resultContent := UceProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) - result := faultdomain.GetAdvanceDeviceCmForNodeMap(resultContent.AllConfigmap) - want := faultdomain.GetAdvanceDeviceCmForNodeMap(expectProcessedDeviceInfos) + result := resultContent.AllConfigmap + want := faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos) if !reflect.DeepEqual(result, want) { t.Errorf("processUceFaultInfo() = %v, want %v", util.ObjToString(result), util.ObjToString(want)) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go index 51195ff96..86ff5f92b 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go @@ -33,6 +33,7 @@ type uceAccompanyFaultProcessor struct { func init() { UceAccompanyProcessor = &uceAccompanyFaultProcessor{ DiagnosisAccompanyTimeout: constant.DiagnosisAccompanyTimeout, + deviceCmForNodeMap: make(map[string]*constant.AdvanceDeviceFaultCm), uceAccompanyFaultQue: make(map[string]map[string][]constant.DeviceFault), uceFaultTime: make(map[string]map[string]int64), } @@ -101,7 +102,10 @@ func (processor *uceAccompanyFaultProcessor) inQue(nodeName, deviceName string, func (processor *uceAccompanyFaultProcessor) filterFaultInfos(currentTime int64) { for nodeName, nodeFaults := range processor.uceAccompanyFaultQue { - faultMap := processor.deviceCmForNodeMap[nodeName] + faultMap, found := processor.deviceCmForNodeMap[nodeName] + if !found { + continue + } for deviceName, deviceFaultQue := range nodeFaults { newQue, newFaultMap := processor.filterFaultDevice(faultMap.FaultDeviceList, currentTime, nodeName, deviceName, deviceFaultQue) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go index 951406d5e..1fe083d09 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go @@ -39,8 +39,7 @@ func TestUceAccompanyFaultProcessorProcess(t *testing.T) { UceAccompanyProcessor.deviceCmForNodeMap = faultdomain.GetAdvanceDeviceCmForNodeMap(cmDeviceInfos) UceAccompanyProcessor.uceAccompanyFaultInQue() UceAccompanyProcessor.filterFaultInfos(CurrentTime) - faultdomain.AdvanceDeviceCmForNodeMapToString(UceAccompanyProcessor.deviceCmForNodeMap, cmDeviceInfos) - if !reflect.DeepEqual(faultdomain.GetAdvanceDeviceCmForNodeMap(cmDeviceInfos), + if !reflect.DeepEqual(UceAccompanyProcessor.deviceCmForNodeMap, faultdomain.GetAdvanceDeviceCmForNodeMap(expectProcessedDeviceInfos)) { t.Errorf("result = %v, want %v", util.ObjToString(cmDeviceInfos), util.ObjToString(expectProcessedDeviceInfos)) @@ -61,7 +60,7 @@ func TestUceAccompanyFaultProcessorProcessE2E(t *testing.T) { t.Errorf("init data failed. %v", testFileErr) } content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: cmDeviceInfos, + AllConfigmap: faultdomain.GetAdvanceFaultCm(cmDeviceInfos), UpdateConfigmap: []constant.InformerCmItem[*constant.DeviceInfo]{{}}, } mockTime := time.Time{} @@ -77,10 +76,11 @@ func TestUceAccompanyFaultProcessorProcessE2E(t *testing.T) { }() resultContent := UceAccompanyProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) - if !reflect.DeepEqual(faultdomain.GetAdvanceDeviceCmForNodeMap(resultContent.AllConfigmap), - faultdomain.GetAdvanceDeviceCmForNodeMap(expectProcessedDeviceInfos)) { - t.Errorf("result = %v, want %v", - util.ObjToString(cmDeviceInfos), util.ObjToString(expectProcessedDeviceInfos)) + if !reflect.DeepEqual(resultContent.AllConfigmap, + faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos)) { + t.Errorf("result:\n%v\nwant:\n%v", + util.ObjToString(resultContent.AllConfigmap), + util.ObjToString(faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos))) } if len(UceAccompanyProcessor.uceAccompanyFaultQue["node1"]["Ascend910-1"]) != 1 && @@ -111,7 +111,7 @@ func TestUceAccompanyFaultProcessorProcessForAddFault(t *testing.T) { }, }, } - UceAccompanyProcessor.deviceCmForNodeMap = make(map[string]constant.AdvanceDeviceFaultCm) + UceAccompanyProcessor.deviceCmForNodeMap = map[string]*constant.AdvanceDeviceFaultCm{"node1": {}} UceAccompanyProcessor.filterFaultInfos(CurrentTime) if len(UceAccompanyProcessor.deviceCmForNodeMap[nodeName].FaultDeviceList[deviceName]) != 1 { t.Error("TestUceAccompanyFaultProcessorProcessForAddFault fail") diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go index e40d83df3..b53293b54 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go @@ -74,7 +74,7 @@ func (processor *jobRankFaultInfoProcessor) setJobFaultRankInfos(faultInfos map[ } func (processor *jobRankFaultInfoProcessor) findFaultRankForJob( - nodeDeviceInfoMap map[string]constant.AdvanceDeviceFaultCm, + nodeDeviceInfoMap map[string]*constant.AdvanceDeviceFaultCm, nodeName string, serverList map[string]constant.ServerHccl, jobId string) []constant.FaultRank { advanceDeviceInfo := nodeDeviceInfoMap[nodeName] devicesOfJobOnNode, ok := serverList[nodeName] @@ -190,7 +190,7 @@ func (processor *jobRankFaultInfoProcessor) Process(info any) any { func (processor *jobRankFaultInfoProcessor) findNodeDeviceAndSwitchFault( serverList map[string]constant.ServerHccl, nodeInfos map[string]*constant.NodeInfo, - switchInfos map[string]*constant.SwitchInfo, deviceCmForNodeMap map[string]constant.AdvanceDeviceFaultCm, + switchInfos map[string]*constant.SwitchInfo, deviceCmForNodeMap map[string]*constant.AdvanceDeviceFaultCm, jobId string) ([]constant.FaultRank, []string) { faultList := make([]constant.FaultRank, 0) nodeStatusList := make([]string, 0) diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go index 222c63797..8da116a13 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go @@ -172,7 +172,7 @@ func TestFindFaultRankForJob(t *testing.T) { func testNoDevicesOnNode(processor *jobRankFaultInfoProcessor) { convey.Convey("When no devices on node", func() { - nodeDeviceInfoMap := map[string]constant.AdvanceDeviceFaultCm{ + nodeDeviceInfoMap := map[string]*constant.AdvanceDeviceFaultCm{ "node1": { ServerType: "server-type", FaultDeviceList: map[string][]constant.DeviceFault{}, @@ -191,7 +191,7 @@ func testNoDevicesOnNode(processor *jobRankFaultInfoProcessor) { func testUceInManagementPlane(processor *jobRankFaultInfoProcessor) { convey.Convey("When UCE fault in management plane", func() { - nodeDeviceInfoMap := map[string]constant.AdvanceDeviceFaultCm{ + nodeDeviceInfoMap := map[string]*constant.AdvanceDeviceFaultCm{ "node1": { ServerType: "server-type", FaultDeviceList: map[string][]constant.DeviceFault{ @@ -224,7 +224,7 @@ func testUceInManagementPlane(processor *jobRankFaultInfoProcessor) { func testUceInBusinessPlane(processor *jobRankFaultInfoProcessor) { convey.Convey("When UCE fault in business plane", func() { - nodeDeviceInfoMap := map[string]constant.AdvanceDeviceFaultCm{ + nodeDeviceInfoMap := map[string]*constant.AdvanceDeviceFaultCm{ "node1": { ServerType: "server-type", FaultDeviceList: map[string][]constant.DeviceFault{}, diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go index be30f7e2d..22cade271 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go @@ -480,7 +480,7 @@ func testSuccess(fJob *FaultJob) { func testInitByDeviceFault(fJob *FaultJob) { convey.Convey("When initializing by device fault", func() { cardName := "server-type-device1" - nodeFaultInfo := constant.AdvanceDeviceFaultCm{ + nodeFaultInfo := &constant.AdvanceDeviceFaultCm{ ServerType: "server-type", FaultDeviceList: map[string][]constant.DeviceFault{ cardName: { @@ -737,8 +737,8 @@ func TestInitFaultJobs(t *testing.T) { func testEmptyServerList(processor *relationFaultProcessor) { convey.Convey("When server list is empty", func() { patches := gomonkey.ApplyFunc(faultdomain.GetAdvanceDeviceCmForNodeMap, - func(deviceInfoCm map[string]*constant.DeviceInfo) map[string]constant.AdvanceDeviceFaultCm { - return map[string]constant.AdvanceDeviceFaultCm{"node1": {SuperPodID: 1}} + func(deviceInfoCm map[string]*constant.DeviceInfo) map[string]*constant.AdvanceDeviceFaultCm { + return map[string]*constant.AdvanceDeviceFaultCm{"node1": {SuperPodID: 1}} }) defer patches.Reset() @@ -759,8 +759,8 @@ func testEmptyServerList(processor *relationFaultProcessor) { func testInitFaultJob(processor *relationFaultProcessor) { convey.Convey("When initializing fault job", func() { patches := gomonkey.ApplyFunc(faultdomain.GetAdvanceDeviceCmForNodeMap, - func(deviceInfoCm map[string]*constant.DeviceInfo) map[string]constant.AdvanceDeviceFaultCm { - return map[string]constant.AdvanceDeviceFaultCm{"node1": {SuperPodID: 1}} + func(deviceInfoCm map[string]*constant.DeviceInfo) map[string]*constant.AdvanceDeviceFaultCm { + return map[string]*constant.AdvanceDeviceFaultCm{"node1": {SuperPodID: 1}} }) defer patches.Reset() diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go index 71206d401..b0d55f379 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go @@ -320,7 +320,7 @@ func (fJob *FaultJob) addFaultStrategyForTimeOutCode(fault *constant.FaultInfo) } } -func (fJob *FaultJob) initByDeviceFault(nodeFaultInfo constant.AdvanceDeviceFaultCm, serverList constant.ServerHccl) { +func (fJob *FaultJob) initByDeviceFault(nodeFaultInfo *constant.AdvanceDeviceFaultCm, serverList constant.ServerHccl) { if fJob.SeparateNodes.Has(serverList.ServerName) { return } diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 428e070e3..02452e5cb 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -273,7 +273,11 @@ func DeleteFaultFromFaultMap(faultMap map[string][]constant.DeviceFault, } newDeviceFaults = append(newDeviceFaults, fault) } - faultMap[delFault.NPUName] = newDeviceFaults + if len(newDeviceFaults) == 0 { + delete(faultMap, delFault.NPUName) + } else { + faultMap[delFault.NPUName] = newDeviceFaults + } return faultMap } @@ -433,10 +437,12 @@ func addUnhealthy(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { } } if shouldAddInCardNetworkUnhealthy { - advanceDeviceCm.NetworkUnhealthy = append(advanceDeviceCm.NetworkUnhealthy, deviceName) + advanceDeviceCm.NetworkUnhealthy = + sets.NewString(advanceDeviceCm.NetworkUnhealthy...).Insert(deviceName).List() } if shouldAddInCardUnhealthy { - advanceDeviceCm.CardUnHealthy = append(advanceDeviceCm.CardUnHealthy, deviceName) + advanceDeviceCm.CardUnHealthy = + sets.NewString(advanceDeviceCm.CardUnHealthy...).Insert(deviceName).List() } if shouldAddInCardUnhealthy || shouldAddInCardNetworkUnhealthy { advanceDeviceCm.AvailableDevices = util.DeleteStringSliceItem(advanceDeviceCm.AvailableDevices, deviceName) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index e79d86b28..ae8658464 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -416,22 +416,6 @@ func TestFaultCodeJudge(t *testing.T) { }) } -// TestAdvanceDeviceCmForNodeMapToString should return string-format CM from AdvanceDeviceCm -func TestAdvanceDeviceCmForNodeMapToString(t *testing.T) { - deviceInfoCms := map[string]*constant.DeviceInfo{ - cmName: originalDeviceCm, - } - t.Run("TestAdvanceDeviceCmForNodeMapToString", func(t *testing.T) { - advanceMap := GetAdvanceDeviceCmForNodeMap(deviceInfoCms) - orgDeviceCm := make(map[string]*constant.DeviceInfo) - util.DeepCopy(&orgDeviceCm, deviceInfoCms) - AdvanceDeviceCmForNodeMapToString(advanceMap, orgDeviceCm) - if !reflect.DeepEqual(GetAdvanceDeviceCmForNodeMap(orgDeviceCm), GetAdvanceDeviceCmForNodeMap(deviceInfoCms)) { - t.Error("TestAdvanceDeviceCmForNodeMapToString fail") - } - }) -} - // TestAddFaultAndDeleteFaultMap should add or delete fault right func TestAddFaultAndDeleteFaultMap(t *testing.T) { addFault := constant.DeviceFault{ diff --git a/component/clusterd/testdata/resource/pub_fault_processor_test.yaml b/component/clusterd/testdata/resource/pub_fault_processor_test.yaml index c8783d560..05c212444 100644 --- a/component/clusterd/testdata/resource/pub_fault_processor_test.yaml +++ b/component/clusterd/testdata/resource/pub_fault_processor_test.yaml @@ -9,7 +9,7 @@ mindx-dl-deviceinfo-node1: "large_model_fault_level": "PreSeparateNPU", "fault_level": "PreSeparateNPU", "fault_handling": "PreSeparateNPU", "fault_time_and_level_map": { "4C1F8608": { "fault_time": 1234567890, "fault_level": "PreSeparateNPU" } } } ] - huawei.com/Ascend910-Unhealthy: Ascend910-5 + huawei.com/Ascend910-CardUnhealthy: Ascend910-5 --- # device info cm after public fault processor. public fault: unhealthy card is Ascend910-0, Ascend910-1 mindx-dl-deviceinfo-node1: @@ -28,7 +28,7 @@ mindx-dl-deviceinfo-node1: "large_model_fault_level": "SeparateNPU", "fault_level": "SeparateNPU", "fault_handling": "SeparateNPU", "fault_time_and_level_map": { "010001001": { "fault_time": 1739866717, "fault_level": "SeparateNPU" } } } ] - huawei.com/Ascend910-Unhealthy: Ascend910-0,Ascend910-1,Ascend910-5 + huawei.com/Ascend910-CardUnhealthy: Ascend910-0,Ascend910-1,Ascend910-5 --- # device info cm: unhealthy card is Ascend910-5 mindx-dl-deviceinfo-node2: @@ -41,7 +41,7 @@ mindx-dl-deviceinfo-node2: "large_model_fault_level": "PreSeparateNPU", "fault_level": "PreSeparateNPU", "fault_handling": "PreSeparateNPU", "fault_time_and_level_map": { "4C1F8608": { "fault_time": 1234567890, "fault_level": "PreSeparateNPU" } } } ] - huawei.com/Ascend910-Unhealthy: Ascend910-5 + huawei.com/Ascend910-CardUnhealthy: Ascend910-5 --- # device info cm after public fault processor. public fault: unhealthy card is Ascend910-0, Ascend910-5 mindx-dl-deviceinfo-node2: @@ -60,4 +60,4 @@ mindx-dl-deviceinfo-node2: "large_model_fault_level": "SeparateNPU", "fault_level": "SeparateNPU", "fault_handling": "SeparateNPU", "fault_time_and_level_map": { "010001001": { "fault_time": 1739866717, "fault_level": "SeparateNPU" } } } ] - huawei.com/Ascend910-Unhealthy: Ascend910-0,Ascend910-5 \ No newline at end of file + huawei.com/Ascend910-CardUnhealthy: Ascend910-0,Ascend910-5 \ No newline at end of file -- Gitee From a63198014c57d5e4a095f491f006e097294ce53c Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Fri, 18 Apr 2025 21:16:10 +0800 Subject: [PATCH 05/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20jobprocess=20=E6=95=B4=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jobprocess/fault_job_center.go | 7 ++--- .../faultrank/job_fault_rank_processor.go | 27 ++++++++----------- .../job_fault_rank_processor_test.go | 9 ++++--- .../relation_fault_process_test.go | 17 ++++++------ .../relationfault/relation_fault_processor.go | 19 ++++++++----- .../clusterd/pkg/common/constant/type.go | 6 ++--- 6 files changed, 44 insertions(+), 41 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go index 1bd6d380a..464d5563b 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go @@ -10,6 +10,7 @@ import ( "clusterd/pkg/application/faultmanager/jobprocess/faultrank" "clusterd/pkg/application/faultmanager/jobprocess/relationfault" "clusterd/pkg/common/constant" + "clusterd/pkg/domain/faultdomain" ) // FaultJobCenter process fault about job @@ -37,9 +38,9 @@ func (fJobCenter *faultJobProcessCenter) Process() { } fJobCenter.lastProcessTime = currentTime content := constant.AllConfigmapContent{ - DeviceCm: cmprocess.DeviceCenter.GetProcessedCm(), - SwitchCm: cmprocess.SwitchCenter.GetProcessedCm(), - NodeCm: cmprocess.NodeCenter.GetProcessedCm(), + DeviceCm: faultdomain.GetAdvanceFaultCm(cmprocess.DeviceCenter.GetProcessedCm()), + SwitchCm: faultdomain.GetAdvanceFaultCm(cmprocess.SwitchCenter.GetProcessedCm()), + NodeCm: faultdomain.GetAdvanceFaultCm(cmprocess.NodeCenter.GetProcessedCm()), } for _, processor := range fJobCenter.processorList { processor.Process(content) diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go index b53293b54..111fcad1d 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go @@ -74,9 +74,8 @@ func (processor *jobRankFaultInfoProcessor) setJobFaultRankInfos(faultInfos map[ } func (processor *jobRankFaultInfoProcessor) findFaultRankForJob( - nodeDeviceInfoMap map[string]*constant.AdvanceDeviceFaultCm, + advanceDeviceInfo *constant.AdvanceDeviceFaultCm, nodeName string, serverList map[string]constant.ServerHccl, jobId string) []constant.FaultRank { - advanceDeviceInfo := nodeDeviceInfoMap[nodeName] devicesOfJobOnNode, ok := serverList[nodeName] faultRankList := make([]constant.FaultRank, 0) if !ok || len(devicesOfJobOnNode.DeviceList) == 0 { @@ -155,13 +154,7 @@ func (processor *jobRankFaultInfoProcessor) Process(info any) any { hwlog.RunLog.Error("convert info to AllConfigmapContent failed") return info } - deviceInfos := allConfigmap.DeviceCm - deviceCmForNodeMap := faultdomain.GetAdvanceDeviceCmForNodeMap(deviceInfos) - hwlog.RunLog.Debugf("deviceInfos: %#v", deviceInfos) - nodeInfos := allConfigmap.NodeCm - hwlog.RunLog.Debugf("nodeInfos: %#v", nodeInfos) - switchInfos := allConfigmap.SwitchCm - hwlog.RunLog.Debugf("switchInfos: %#v", switchInfos) + hwlog.RunLog.Debugf("allConfigmap info: %#v", util.ObjToString(allConfigmap)) jobFaultInfos := make(map[string]constant.JobFaultInfo) jobServerInfoMap := job.GetJobServerInfoMap() @@ -173,7 +166,7 @@ func (processor *jobRankFaultInfoProcessor) Process(info any) any { } hwlog.RunLog.Debugf("serverList: %d", len(serverList)) faultList, nodeStatusList := processor.findNodeDeviceAndSwitchFault(serverList, - nodeInfos, switchInfos, deviceCmForNodeMap, jobId) + allConfigmap.NodeCm, allConfigmap.SwitchCm, allConfigmap.DeviceCm, jobId) jobFaultInfo.FaultList = faultList if len(jobFaultInfo.FaultList) > 0 { hwlog.RunLog.Debugf("jobFaultInfo: %#v", jobFaultInfo) @@ -188,15 +181,16 @@ func (processor *jobRankFaultInfoProcessor) Process(info any) any { return nil } -func (processor *jobRankFaultInfoProcessor) findNodeDeviceAndSwitchFault( - serverList map[string]constant.ServerHccl, nodeInfos map[string]*constant.NodeInfo, - switchInfos map[string]*constant.SwitchInfo, deviceCmForNodeMap map[string]*constant.AdvanceDeviceFaultCm, +func (processor *jobRankFaultInfoProcessor) findNodeDeviceAndSwitchFault(serverList map[string]constant.ServerHccl, + nodeInfos map[string]constant.AdvanceFaultConfigmap[*constant.NodeInfo], + switchInfos map[string]constant.AdvanceFaultConfigmap[*constant.SwitchInfo], + deviceCmForNodeMap map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo], jobId string) ([]constant.FaultRank, []string) { faultList := make([]constant.FaultRank, 0) nodeStatusList := make([]string, 0) for nodeName, server := range serverList { hwlog.RunLog.Debugf("nodeName: %s, server: %#v", nodeName, server) - switchInfo, ok := switchInfos[constant.SwitchInfoPrefix+nodeName] + switchInfo, ok := switchInfos[constant.SwitchInfoPrefix+nodeName].(*constant.SwitchInfo) if ok { nodeStatusList = append(nodeStatusList, switchInfo.NodeStatus) } @@ -205,7 +199,7 @@ func (processor *jobRankFaultInfoProcessor) findNodeDeviceAndSwitchFault( faultList = append(faultList, serverHcclToFaultRank(server, jobId)...) continue } - nodeInfo, ok := nodeInfos[constant.NodeInfoPrefix+nodeName] + nodeInfo, ok := nodeInfos[constant.NodeInfoPrefix+nodeName].(*constant.NodeInfo) if ok && nodeInfo.NodeStatus == constant.UnHealthyState { hwlog.RunLog.Debugf("node %s is unhealthy", nodeName) faultList = append(faultList, serverHcclToFaultRank(server, jobId)...) @@ -217,7 +211,8 @@ func (processor *jobRankFaultInfoProcessor) findNodeDeviceAndSwitchFault( faultList = append(faultList, serverHcclToFaultRank(server, jobId)...) continue } - faultRankList := processor.findFaultRankForJob(deviceCmForNodeMap, nodeName, serverList, jobId) + advanceDeviceInfo := deviceCmForNodeMap[nodeName].(*constant.AdvanceDeviceFaultCm) + faultRankList := processor.findFaultRankForJob(advanceDeviceInfo, nodeName, serverList, jobId) faultList = append(faultList, faultRankList...) } return faultList, nodeStatusList diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go index 8da116a13..9b8b66723 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go @@ -184,7 +184,8 @@ func testNoDevicesOnNode(processor *jobRankFaultInfoProcessor) { }, } - faultRanks := processor.findFaultRankForJob(nodeDeviceInfoMap, "node1", serverList, "job1") + faultRanks := processor.findFaultRankForJob( + nodeDeviceInfoMap["node1"], "node1", serverList, "job1") convey.So(faultRanks, convey.ShouldBeEmpty) }) } @@ -215,7 +216,8 @@ func testUceInManagementPlane(processor *jobRankFaultInfoProcessor) { }) defer patches.Reset() - faultRanks := processor.findFaultRankForJob(nodeDeviceInfoMap, "node1", serverList, "job1") + faultRanks := processor.findFaultRankForJob( + nodeDeviceInfoMap["node1"], "node1", serverList, "job1") convey.So(faultRanks, convey.ShouldHaveLength, 1) convey.So(faultRanks[0].FaultCode, convey.ShouldEqual, constant.UceFaultCode) convey.So(faultRanks[0].DoStepRetry, convey.ShouldBeTrue) @@ -249,7 +251,8 @@ func testUceInBusinessPlane(processor *jobRankFaultInfoProcessor) { return true }) - faultRanks := processor.findFaultRankForJob(nodeDeviceInfoMap, "node1", serverList, "job1") + faultRanks := processor.findFaultRankForJob( + nodeDeviceInfoMap["node1"], "node1", serverList, "job1") convey.So(faultRanks, convey.ShouldHaveLength, 1) convey.So(faultRanks[0].FaultCode, convey.ShouldEqual, constant.UceFaultCode) convey.So(faultRanks[0].DoStepRetry, convey.ShouldBeTrue) diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go index 22cade271..704820a33 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go @@ -329,9 +329,9 @@ func testInvalidInfoType(processor *relationFaultProcessor) { func testValidInfoType(processor *relationFaultProcessor) { convey.Convey("When info type is valid", func() { content := constant.AllConfigmapContent{ - DeviceCm: map[string]*constant.DeviceInfo{}, - SwitchCm: map[string]*constant.SwitchInfo{}, - NodeCm: map[string]*constant.NodeInfo{}, + DeviceCm: map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo]{}, + SwitchCm: map[string]constant.AdvanceFaultConfigmap[*constant.SwitchInfo]{}, + NodeCm: map[string]constant.AdvanceFaultConfigmap[*constant.NodeInfo]{}, } patches := gomonkey.ApplyMethod(processor, "InitFaultJobs", func(_ *relationFaultProcessor) {}) @@ -724,8 +724,7 @@ func testValidConfig() { func TestInitFaultJobs(t *testing.T) { convey.Convey("Test InitFaultJobs", t, func() { processor := &relationFaultProcessor{ - deviceInfoCm: map[string]*constant.DeviceInfo{}, - switchInfoCm: map[string]*constant.SwitchInfo{}, + switchInfoCm: map[string]constant.AdvanceFaultConfigmap[*constant.SwitchInfo]{}, faultJobs: make(map[string]*FaultJob), } @@ -758,11 +757,11 @@ func testEmptyServerList(processor *relationFaultProcessor) { func testInitFaultJob(processor *relationFaultProcessor) { convey.Convey("When initializing fault job", func() { - patches := gomonkey.ApplyFunc(faultdomain.GetAdvanceDeviceCmForNodeMap, - func(deviceInfoCm map[string]*constant.DeviceInfo) map[string]*constant.AdvanceDeviceFaultCm { - return map[string]*constant.AdvanceDeviceFaultCm{"node1": {SuperPodID: 1}} - }) + patches := gomonkey.NewPatches() defer patches.Reset() + deviceInfo := map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo]{ + "node1": &constant.AdvanceDeviceFaultCm{SuperPodID: 1}} + processor.deviceInfoCm = deviceInfo patches.ApplyFunc(job.GetJobServerInfoMap, func() constant.JobServerInfoMap { return constant.JobServerInfoMap{ diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go index b0d55f379..0fdc4e218 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go @@ -45,9 +45,9 @@ func loadConfig() { type relationFaultProcessor struct { faultJobs map[string]*FaultJob - deviceInfoCm map[string]*constant.DeviceInfo - switchInfoCm map[string]*constant.SwitchInfo - nodeInfoCm map[string]*constant.NodeInfo + deviceInfoCm map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo] + switchInfoCm map[string]constant.AdvanceFaultConfigmap[*constant.SwitchInfo] + nodeInfoCm map[string]constant.AdvanceFaultConfigmap[*constant.NodeInfo] } // Process job network relation fault info @@ -71,7 +71,6 @@ func (processor *relationFaultProcessor) Process(info any) any { } func (processor *relationFaultProcessor) InitFaultJobs() { - deviceCmForNodeMap := faultdomain.GetAdvanceDeviceCmForNodeMap(processor.deviceInfoCm) faultJobs := make(map[string]*FaultJob) jobServerInfoMap := job.GetJobServerInfoMap() for jobId, serverLists := range jobServerInfoMap.InfoMap { @@ -85,11 +84,17 @@ func (processor *relationFaultProcessor) InitFaultJobs() { } tmpFaultJob.initFaultJobAttr() for nodeName, serverList := range serverLists { - tmpFaultJob.IsA3Job = deviceCmForNodeMap[nodeName].SuperPodID >= 0 tmpFaultJob.PodNames[serverList.ServerName] = serverList.PodID tmpFaultJob.NameSpace = serverList.PodNameSpace - tmpFaultJob.initBySwitchFault(processor.switchInfoCm[constant.SwitchInfoPrefix+nodeName], serverList) - tmpFaultJob.initByDeviceFault(deviceCmForNodeMap[nodeName], serverList) + switchInfo, ok := (processor.switchInfoCm[constant.SwitchInfoPrefix+nodeName]).(*constant.SwitchInfo) + if ok { + tmpFaultJob.initBySwitchFault(switchInfo, serverList) + } + deviceInfo, ok := (processor.deviceInfoCm[nodeName]).(*constant.AdvanceDeviceFaultCm) + if ok { + tmpFaultJob.IsA3Job = deviceInfo.SuperPodID >= 0 + tmpFaultJob.initByDeviceFault(deviceInfo, serverList) + } } faultJobs[jobId] = tmpFaultJob hwlog.RunLog.Debugf("init fault job %v", util.ObjToString(faultJobs)) diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index c4f6c2073..da18ed180 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -267,9 +267,9 @@ type OneConfigmapContent[T ConfigMapInterface] struct { // AllConfigmapContent contains all kind of configmap content type AllConfigmapContent struct { - DeviceCm map[string]*DeviceInfo - SwitchCm map[string]*SwitchInfo - NodeCm map[string]*NodeInfo + DeviceCm map[string]AdvanceFaultConfigmap[*DeviceInfo] + SwitchCm map[string]AdvanceFaultConfigmap[*SwitchInfo] + NodeCm map[string]AdvanceFaultConfigmap[*NodeInfo] } // ConfigMapInterface configmap interface -- Gitee From b8a3e4547b670b225c079413def3e3bc0e06ceb4 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Sat, 19 Apr 2025 00:36:43 +0800 Subject: [PATCH 06/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20=E7=A7=BB=E9=99=A4AdvanceFaultCo?= =?UTF-8?q?nfigmap=E6=8E=A5=E5=8F=A3=EF=BC=8Cinformer=E5=BC=80=E5=A7=8B?= =?UTF-8?q?=E5=B0=B1=E4=BD=BF=E7=94=A8AdvanceDeviceFaultCm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cmprocess/base_fault_center.go | 18 +--- .../cmprocess/device_fault_center.go | 2 +- .../publicfault/pub_fault_processor.go | 4 +- .../publicfault/pub_fault_processor_test.go | 34 ++++---- .../cmprocess/uce/uce_fault_processor.go | 9 +- .../cmprocess/uce/uce_fault_processor_test.go | 13 ++- .../uce_accompany_fault_processor.go | 10 +-- .../uce_accompany_fault_processor_test.go | 14 +-- .../faultmanager/fault_process_center.go | 2 +- .../jobprocess/fault_job_center.go | 7 +- .../faultrank/job_fault_rank_processor.go | 12 +-- .../relation_fault_process_test.go | 11 ++- .../relationfault/relation_fault_processor.go | 10 +-- .../pkg/application/resource/report.go | 7 +- .../clusterd/pkg/common/constant/type.go | 20 ++--- .../cmmanager/configmap_manager.go | 52 ++++------- .../cmmanager/configmap_manager_test.go | 16 +--- .../faultdomain/collector/cm_collector.go | 14 +-- .../collector/cm_collector_test.go | 6 +- .../pkg/domain/faultdomain/fault_utils.go | 87 ++++++++++++++++--- 20 files changed, 184 insertions(+), 164 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go index e092ae093..1021295b4 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go @@ -10,7 +10,6 @@ import ( "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" - "clusterd/pkg/domain/faultdomain" "clusterd/pkg/domain/faultdomain/cmmanager" ) @@ -47,16 +46,15 @@ func (baseCenter *baseFaultCenter[T]) Process() { } baseCenter.lastProcessTime = currentTime updateOriginalCm := baseCenter.updateOriginalCm() - advanceCm := faultdomain.GetAdvanceFaultCm(baseCenter.getOriginalCm()) + processingCm := baseCenter.getOriginalCm() for _, processor := range baseCenter.processorList { info := constant.OneConfigmapContent[T]{ - AllConfigmap: advanceCm, + AllConfigmap: processingCm, UpdateConfigmap: updateOriginalCm, } - advanceCm = processor.Process(info).(constant.OneConfigmapContent[T]).AllConfigmap + processingCm = processor.Process(info).(constant.OneConfigmapContent[T]).AllConfigmap } - originalFaultCm := faultdomain.AdvanceFaultCmToOriginalFaultCm(advanceCm) - if baseCenter.setProcessedCm(originalFaultCm) { + if baseCenter.setProcessedCm(processingCm) { baseCenter.notifySubscriber() } } @@ -96,14 +94,6 @@ func (baseCenter *baseFaultCenter[T]) getOriginalCm() map[string]T { return baseCenter.cmManager.GetOriginalCm().Data } -func (baseCenter *baseFaultCenter[T]) setProcessingCm(cm map[string]T) { - baseCenter.cmManager.SetProcessingCm(cmmanager.ConfigMap[T]{Data: cm}) -} - -func (baseCenter *baseFaultCenter[T]) getProcessingCm() map[string]T { - return baseCenter.cmManager.GetProcessingCm().Data -} - func (baseCenter *baseFaultCenter[T]) setProcessedCm(cm map[string]T) bool { return baseCenter.cmManager.SetProcessedCm(cmmanager.ConfigMap[T]{Data: cm}) } diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/device_fault_center.go b/component/clusterd/pkg/application/faultmanager/cmprocess/device_fault_center.go index bcbd0dc91..e1ba0259c 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/device_fault_center.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/device_fault_center.go @@ -16,7 +16,7 @@ var DeviceCenter *deviceFaultProcessCenter // deviceFaultProcessCenter type deviceFaultProcessCenter struct { - baseFaultCenter[*constant.DeviceInfo] + baseFaultCenter[*constant.AdvanceDeviceFaultCm] } func init() { diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go index dec09d1f8..212e205bf 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go @@ -33,7 +33,7 @@ func (p *pubFaultProcessor) Process(info any) any { if publicfault.PubFaultCache == nil || len(publicfault.PubFaultCache.GetPubFault()) == 0 { return info } - processContent, ok := info.(constant.OneConfigmapContent[*constant.DeviceInfo]) + processContent, ok := info.(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) if !ok { hwlog.RunLog.Error("input is not DeviceInfo type", info) return info @@ -52,7 +52,7 @@ func (p *pubFaultProcessor) Process(info any) any { continue } p.pubFaultInfo = pubFaults - p.devCMInfo = devCMInfo.(*constant.AdvanceDeviceFaultCm) + p.devCMInfo = devCMInfo p.faultJoin() } processContent.AllConfigmap = deviceInfos diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go index c3f7d52aa..3d35dceb0 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go @@ -52,8 +52,8 @@ func TestProcessor(t *testing.T) { func testNilCache() { resetFaultCache() - ori := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1), + ori := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ + AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1, make(map[string]*constant.AdvanceDeviceFaultCm)), UpdateConfigmap: nil, } res := PubFaultProcessor.Process(ori) @@ -70,24 +70,24 @@ func testInputInvalid() { func testNodeNameInvalid() { resetFaultCache() publicfault.PubFaultCache.AddPubFaultToCache(&testCacheData, testNodeName3, faultKey1) - content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1), + content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ + AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1, make(map[string]*constant.AdvanceDeviceFaultCm)), UpdateConfigmap: nil, } - exp := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) + exp := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) convey.So(content, convey.ShouldResemble, exp) } func testDiff() { resetFaultCache() publicfault.PubFaultCache.AddPubFaultToCache(&testCacheData, testNodeName1, faultKey1) - content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1), + content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ + AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1, make(map[string]*constant.AdvanceDeviceFaultCm)), UpdateConfigmap: nil, } - resContent := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) + resContent := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) sortDeviceFaultList(resContent.AllConfigmap) - want := faultdomain.GetAdvanceFaultCm(expDeviceInfo1) + want := faultdomain.GetAdvanceFaultCm(expDeviceInfo1, make(map[string]*constant.AdvanceDeviceFaultCm)) sortDeviceFaultList(want) result := resContent.AllConfigmap convey.So(result, convey.ShouldResemble, want) @@ -98,16 +98,17 @@ func testCommon() { const card5 = 5 testCacheData.FaultDevIds = []int32{0, card5} publicfault.PubFaultCache.AddPubFaultToCache(&testCacheData, testNodeName2, faultKey2) - content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo2), + content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ + AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo2, make(map[string]*constant.AdvanceDeviceFaultCm)), UpdateConfigmap: nil, } - resContent := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) + resContent := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) hwlog.RunLog.Infof(util.ObjToString(resContent.AllConfigmap)) - hwlog.RunLog.Infof(util.ObjToString(faultdomain.GetAdvanceFaultCm(expDeviceInfo2))) + hwlog.RunLog.Infof( + util.ObjToString(faultdomain.GetAdvanceFaultCm(expDeviceInfo2, make(map[string]*constant.AdvanceDeviceFaultCm)))) sortDeviceFaultList(resContent.AllConfigmap) result := resContent.AllConfigmap - want := faultdomain.GetAdvanceFaultCm(expDeviceInfo2) + want := faultdomain.GetAdvanceFaultCm(expDeviceInfo2, make(map[string]*constant.AdvanceDeviceFaultCm)) sortDeviceFaultList(want) convey.So(result, convey.ShouldResemble, want) } @@ -118,9 +119,8 @@ func resetFaultCache() { } } -func sortDeviceFaultList(advanceFaultCm map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo]) { - for _, deviceCm := range advanceFaultCm { - advanceDeviceCm := deviceCm.(*constant.AdvanceDeviceFaultCm) +func sortDeviceFaultList(advanceFaultCm map[string]*constant.AdvanceDeviceFaultCm) { + for _, advanceDeviceCm := range advanceFaultCm { for _, fault := range advanceDeviceCm.FaultDeviceList { sort.Slice(fault, func(i, j int) bool { return util.MakeDataHash(fault[i]) < util.MakeDataHash(fault[j]) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go index f100494b9..6b75194a9 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go @@ -72,19 +72,14 @@ func (processor *uceFaultProcessor) initUceDeviceFromNodeAndReportInfo(jobId str // Process uce fault func (processor *uceFaultProcessor) Process(info any) any { - processContent, ok := info.(constant.OneConfigmapContent[*constant.DeviceInfo]) + processContent, ok := info.(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) if !ok { hwlog.RunLog.Errorf("%v cannot convert to DeviceInfo", info) return info } processor.jobServerInfoMap = job.GetJobServerInfoMap() - for key, val := range processContent.AllConfigmap { - processor.nodeDeviceCmMap[key], ok = val.(*constant.AdvanceDeviceFaultCm) - if !ok { - hwlog.RunLog.Errorf("processContent.AllConfigmap's value type is not AdvanceDeviceFaultCm") - } - } + processor.nodeDeviceCmMap = processContent.AllConfigmap hwlog.RunLog.Debugf("current nodeDeviceCmMap %s", util.ObjToString(processor.nodeDeviceCmMap)) processor.uceDeviceOfNode = processor.getUceDeviceOfNodes() diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go index 078ba5506..50d9753f1 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go @@ -364,15 +364,14 @@ func TestUceFaultProcessorScenario1(t *testing.T) { func TestUceFaultProcessorScenario2(t *testing.T) { t.Run("TestUceFaultProcessorScenario2", func(t *testing.T) { - cmDeviceInfos, expectProcessedDeviceInfos, jobServerInfoMap, reportInfos, testFileErr := + cmDeviceInfos, expProcessedDeviceInfos, jobServerInfoMap, reportInfos, testFileErr := readObjectFromUceScenarioTestYaml() if testFileErr != nil { t.Errorf("init data failed. %v", testFileErr) } - - content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(cmDeviceInfos), - UpdateConfigmap: []constant.InformerCmItem[*constant.DeviceInfo]{ + content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ + AllConfigmap: faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)), + UpdateConfigmap: []constant.InformerCmItem[*constant.AdvanceDeviceFaultCm]{ { IsAdd: false, Data: nil}, @@ -395,9 +394,9 @@ func TestUceFaultProcessorScenario2(t *testing.T) { mockUnixMilli.Reset() }() - resultContent := UceProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) + resultContent := UceProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) result := resultContent.AllConfigmap - want := faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos) + want := faultdomain.GetAdvanceFaultCm(expProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) if !reflect.DeepEqual(result, want) { t.Errorf("processUceFaultInfo() = %v, want %v", util.ObjToString(result), util.ObjToString(want)) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go index 86ff5f92b..878056d02 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go @@ -168,18 +168,12 @@ func (processor *uceAccompanyFaultProcessor) isCurrentExceedDiagnosisTimeout( // Process uce accompany fault func (processor *uceAccompanyFaultProcessor) Process(info any) any { - processContent, ok := info.(constant.OneConfigmapContent[*constant.DeviceInfo]) + processContent, ok := info.(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) if !ok { hwlog.RunLog.Errorf("%v cannot convert to DeviceInfo", info) return info } - for key, val := range processContent.AllConfigmap { - processor.deviceCmForNodeMap[key], ok = val.(*constant.AdvanceDeviceFaultCm) - if !ok { - hwlog.RunLog.Errorf("processContent.AllConfigmap's value type is not AdvanceDeviceFaultCm") - } - } - + processor.deviceCmForNodeMap = processContent.AllConfigmap hwlog.RunLog.Debugf("current deviceInfos: %s", util.ObjToString(processContent.AllConfigmap)) processor.uceAccompanyFaultInQue() diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go index 1fe083d09..aed74dc59 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go @@ -59,9 +59,9 @@ func TestUceAccompanyFaultProcessorProcessE2E(t *testing.T) { if testFileErr != nil { t.Errorf("init data failed. %v", testFileErr) } - content := constant.OneConfigmapContent[*constant.DeviceInfo]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(cmDeviceInfos), - UpdateConfigmap: []constant.InformerCmItem[*constant.DeviceInfo]{{}}, + content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ + AllConfigmap: faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)), + UpdateConfigmap: []constant.InformerCmItem[*constant.AdvanceDeviceFaultCm]{{}}, } mockTime := time.Time{} mockUnixMilli := gomonkey.ApplyPrivateMethod(mockTime, "UnixMilli", func() int64 { @@ -74,13 +74,13 @@ func TestUceAccompanyFaultProcessorProcessE2E(t *testing.T) { mockNow.Reset() mockUnixMilli.Reset() }() - resultContent := UceAccompanyProcessor.Process(content).(constant.OneConfigmapContent[*constant.DeviceInfo]) + resultContent := UceAccompanyProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) - if !reflect.DeepEqual(resultContent.AllConfigmap, - faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos)) { + exp := faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + if !reflect.DeepEqual(resultContent.AllConfigmap, exp) { t.Errorf("result:\n%v\nwant:\n%v", util.ObjToString(resultContent.AllConfigmap), - util.ObjToString(faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos))) + util.ObjToString(exp)) } if len(UceAccompanyProcessor.uceAccompanyFaultQue["node1"]["Ascend910-1"]) != 1 && diff --git a/component/clusterd/pkg/application/faultmanager/fault_process_center.go b/component/clusterd/pkg/application/faultmanager/fault_process_center.go index 80e8d48ee..ed021b81f 100644 --- a/component/clusterd/pkg/application/faultmanager/fault_process_center.go +++ b/component/clusterd/pkg/application/faultmanager/fault_process_center.go @@ -104,7 +104,7 @@ func QueryJobsFaultInfo(faultLevel string) map[string]constant.JobFaultInfo { } // QueryDeviceInfoToReport query device info to report -func QueryDeviceInfoToReport() map[string]*constant.DeviceInfo { +func QueryDeviceInfoToReport() map[string]*constant.AdvanceDeviceFaultCm { infos := cmprocess.DeviceCenter.GetProcessedCm() for _, info := range infos { info.UpdateTime = time.Now().Unix() diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go index 464d5563b..1bd6d380a 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go @@ -10,7 +10,6 @@ import ( "clusterd/pkg/application/faultmanager/jobprocess/faultrank" "clusterd/pkg/application/faultmanager/jobprocess/relationfault" "clusterd/pkg/common/constant" - "clusterd/pkg/domain/faultdomain" ) // FaultJobCenter process fault about job @@ -38,9 +37,9 @@ func (fJobCenter *faultJobProcessCenter) Process() { } fJobCenter.lastProcessTime = currentTime content := constant.AllConfigmapContent{ - DeviceCm: faultdomain.GetAdvanceFaultCm(cmprocess.DeviceCenter.GetProcessedCm()), - SwitchCm: faultdomain.GetAdvanceFaultCm(cmprocess.SwitchCenter.GetProcessedCm()), - NodeCm: faultdomain.GetAdvanceFaultCm(cmprocess.NodeCenter.GetProcessedCm()), + DeviceCm: cmprocess.DeviceCenter.GetProcessedCm(), + SwitchCm: cmprocess.SwitchCenter.GetProcessedCm(), + NodeCm: cmprocess.NodeCenter.GetProcessedCm(), } for _, processor := range fJobCenter.processorList { processor.Process(content) diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go index 111fcad1d..541ab1d13 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go @@ -182,15 +182,15 @@ func (processor *jobRankFaultInfoProcessor) Process(info any) any { } func (processor *jobRankFaultInfoProcessor) findNodeDeviceAndSwitchFault(serverList map[string]constant.ServerHccl, - nodeInfos map[string]constant.AdvanceFaultConfigmap[*constant.NodeInfo], - switchInfos map[string]constant.AdvanceFaultConfigmap[*constant.SwitchInfo], - deviceCmForNodeMap map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo], + nodeInfos map[string]*constant.NodeInfo, + switchInfos map[string]*constant.SwitchInfo, + deviceCmForNodeMap map[string]*constant.AdvanceDeviceFaultCm, jobId string) ([]constant.FaultRank, []string) { faultList := make([]constant.FaultRank, 0) nodeStatusList := make([]string, 0) for nodeName, server := range serverList { hwlog.RunLog.Debugf("nodeName: %s, server: %#v", nodeName, server) - switchInfo, ok := switchInfos[constant.SwitchInfoPrefix+nodeName].(*constant.SwitchInfo) + switchInfo, ok := switchInfos[constant.SwitchInfoPrefix+nodeName] if ok { nodeStatusList = append(nodeStatusList, switchInfo.NodeStatus) } @@ -199,7 +199,7 @@ func (processor *jobRankFaultInfoProcessor) findNodeDeviceAndSwitchFault(serverL faultList = append(faultList, serverHcclToFaultRank(server, jobId)...) continue } - nodeInfo, ok := nodeInfos[constant.NodeInfoPrefix+nodeName].(*constant.NodeInfo) + nodeInfo, ok := nodeInfos[constant.NodeInfoPrefix+nodeName] if ok && nodeInfo.NodeStatus == constant.UnHealthyState { hwlog.RunLog.Debugf("node %s is unhealthy", nodeName) faultList = append(faultList, serverHcclToFaultRank(server, jobId)...) @@ -211,7 +211,7 @@ func (processor *jobRankFaultInfoProcessor) findNodeDeviceAndSwitchFault(serverL faultList = append(faultList, serverHcclToFaultRank(server, jobId)...) continue } - advanceDeviceInfo := deviceCmForNodeMap[nodeName].(*constant.AdvanceDeviceFaultCm) + advanceDeviceInfo := deviceCmForNodeMap[nodeName] faultRankList := processor.findFaultRankForJob(advanceDeviceInfo, nodeName, serverList, jobId) faultList = append(faultList, faultRankList...) } diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go index 704820a33..8546f984c 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go @@ -329,9 +329,9 @@ func testInvalidInfoType(processor *relationFaultProcessor) { func testValidInfoType(processor *relationFaultProcessor) { convey.Convey("When info type is valid", func() { content := constant.AllConfigmapContent{ - DeviceCm: map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo]{}, - SwitchCm: map[string]constant.AdvanceFaultConfigmap[*constant.SwitchInfo]{}, - NodeCm: map[string]constant.AdvanceFaultConfigmap[*constant.NodeInfo]{}, + DeviceCm: map[string]*constant.AdvanceDeviceFaultCm{}, + SwitchCm: map[string]*constant.SwitchInfo{}, + NodeCm: map[string]*constant.NodeInfo{}, } patches := gomonkey.ApplyMethod(processor, "InitFaultJobs", func(_ *relationFaultProcessor) {}) @@ -724,7 +724,7 @@ func testValidConfig() { func TestInitFaultJobs(t *testing.T) { convey.Convey("Test InitFaultJobs", t, func() { processor := &relationFaultProcessor{ - switchInfoCm: map[string]constant.AdvanceFaultConfigmap[*constant.SwitchInfo]{}, + switchInfoCm: map[string]*constant.SwitchInfo{}, faultJobs: make(map[string]*FaultJob), } @@ -759,8 +759,7 @@ func testInitFaultJob(processor *relationFaultProcessor) { convey.Convey("When initializing fault job", func() { patches := gomonkey.NewPatches() defer patches.Reset() - deviceInfo := map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo]{ - "node1": &constant.AdvanceDeviceFaultCm{SuperPodID: 1}} + deviceInfo := map[string]*constant.AdvanceDeviceFaultCm{"node1": {SuperPodID: 1}} processor.deviceInfoCm = deviceInfo patches.ApplyFunc(job.GetJobServerInfoMap, func() constant.JobServerInfoMap { diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go index 0fdc4e218..d34eff099 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go @@ -45,9 +45,9 @@ func loadConfig() { type relationFaultProcessor struct { faultJobs map[string]*FaultJob - deviceInfoCm map[string]constant.AdvanceFaultConfigmap[*constant.DeviceInfo] - switchInfoCm map[string]constant.AdvanceFaultConfigmap[*constant.SwitchInfo] - nodeInfoCm map[string]constant.AdvanceFaultConfigmap[*constant.NodeInfo] + deviceInfoCm map[string]*constant.AdvanceDeviceFaultCm + switchInfoCm map[string]*constant.SwitchInfo + nodeInfoCm map[string]*constant.NodeInfo } // Process job network relation fault info @@ -86,11 +86,11 @@ func (processor *relationFaultProcessor) InitFaultJobs() { for nodeName, serverList := range serverLists { tmpFaultJob.PodNames[serverList.ServerName] = serverList.PodID tmpFaultJob.NameSpace = serverList.PodNameSpace - switchInfo, ok := (processor.switchInfoCm[constant.SwitchInfoPrefix+nodeName]).(*constant.SwitchInfo) + switchInfo, ok := processor.switchInfoCm[constant.SwitchInfoPrefix+nodeName] if ok { tmpFaultJob.initBySwitchFault(switchInfo, serverList) } - deviceInfo, ok := (processor.deviceInfoCm[nodeName]).(*constant.AdvanceDeviceFaultCm) + deviceInfo, ok := processor.deviceInfoCm[nodeName] if ok { tmpFaultJob.IsA3Job = deviceInfo.SuperPodID >= 0 tmpFaultJob.initByDeviceFault(deviceInfo, serverList) diff --git a/component/clusterd/pkg/application/resource/report.go b/component/clusterd/pkg/application/resource/report.go index 9c2d1317b..a1db2bb66 100644 --- a/component/clusterd/pkg/application/resource/report.go +++ b/component/clusterd/pkg/application/resource/report.go @@ -18,6 +18,7 @@ import ( "clusterd/pkg/application/faultmanager" "clusterd/pkg/common/constant" "clusterd/pkg/domain/device" + "clusterd/pkg/domain/faultdomain" "clusterd/pkg/domain/node" "clusterd/pkg/domain/switchinfo" "clusterd/pkg/interface/kube" @@ -65,7 +66,8 @@ func Report(ctx context.Context) { }) switch whichToReport { case constant.DeviceProcessType: - deviceArr := device.GetSafeData(faultmanager.QueryDeviceInfoToReport()) + deviceArr := device.GetSafeData(faultdomain.AdvanceFaultCmToOriginalFaultCm( + faultmanager.QueryDeviceInfoToReport(), make(map[string]*constant.DeviceInfo))) updateDeviceInfoCm(deviceArr) case constant.NodeProcessType: nodeArr := node.GetSafeData(faultmanager.QueryNodeInfoToReport()) @@ -74,7 +76,8 @@ func Report(ctx context.Context) { switchArr := switchinfo.GetSafeData(faultmanager.QuerySwitchInfoToReport()) updateSwitchInfoCm(switchArr) case constant.AllProcessType: - deviceArr := device.GetSafeData(faultmanager.QueryDeviceInfoToReport()) + deviceArr := device.GetSafeData(faultdomain.AdvanceFaultCmToOriginalFaultCm( + faultmanager.QueryDeviceInfoToReport(), make(map[string]*constant.DeviceInfo))) nodeArr := node.GetSafeData(faultmanager.QueryNodeInfoToReport()) switchArr := switchinfo.GetSafeData(faultmanager.QuerySwitchInfoToReport()) updateAllCm(deviceArr, nodeArr, switchArr) diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index da18ed180..665a2a28a 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -4,6 +4,8 @@ package constant import ( + "reflect" + "ascend-common/api" "ascend-common/common-utils/hwlog" ) @@ -228,6 +230,10 @@ type AdvanceDeviceFaultCm struct { UpdateTime int64 } +func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { + return reflect.DeepEqual(cm, another.(*AdvanceDeviceFaultCm)) +} + // GetCmName return cm name func (cm *AdvanceDeviceFaultCm) GetCmName() string { return cm.CmName @@ -261,15 +267,15 @@ type InformerCmItem[T ConfigMapInterface] struct { // OneConfigmapContent contains one kind of configmap content type OneConfigmapContent[T ConfigMapInterface] struct { - AllConfigmap map[string]AdvanceFaultConfigmap[T] + AllConfigmap map[string]T UpdateConfigmap []InformerCmItem[T] } // AllConfigmapContent contains all kind of configmap content type AllConfigmapContent struct { - DeviceCm map[string]AdvanceFaultConfigmap[*DeviceInfo] - SwitchCm map[string]AdvanceFaultConfigmap[*SwitchInfo] - NodeCm map[string]AdvanceFaultConfigmap[*NodeInfo] + DeviceCm map[string]*AdvanceDeviceFaultCm + SwitchCm map[string]*SwitchInfo + NodeCm map[string]*NodeInfo } // ConfigMapInterface configmap interface @@ -492,9 +498,3 @@ type FaultNum struct { NodeFaultNum int `json:"-"` PubFaultNum int `json:"publicFaultNum"` } - -// AdvanceFaultConfigmap more usable faultConfigmap -type AdvanceFaultConfigmap[T ConfigMapInterface] interface { - // GetCmName return cm name - GetCmName() string -} diff --git a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go index cc1f02e3c..795a71293 100644 --- a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go +++ b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go @@ -16,39 +16,35 @@ type ConfigMap[T constant.ConfigMapInterface] struct { Data map[string]T } -var DeviceCenterCmManager *FaultCenterCmManager[*constant.DeviceInfo] +var DeviceCenterCmManager *FaultCenterCmManager[*constant.AdvanceDeviceFaultCm] var SwitchCenterCmManager *FaultCenterCmManager[*constant.SwitchInfo] var NodeCenterCmManager *FaultCenterCmManager[*constant.NodeInfo] type FaultCenterCmManager[T constant.ConfigMapInterface] struct { - mutex sync.RWMutex - cmBuffer *collector.ConfigmapCollectBuffer[T] - originalCm ConfigMap[T] - processingCm ConfigMap[T] - processedCm ConfigMap[T] + mutex sync.RWMutex + cmBuffer *collector.ConfigmapCollectBuffer[T] + originalCm ConfigMap[T] + processedCm ConfigMap[T] } func init() { - DeviceCenterCmManager = &FaultCenterCmManager[*constant.DeviceInfo]{ - mutex: sync.RWMutex{}, - originalCm: ConfigMap[*constant.DeviceInfo]{Data: make(map[string]*constant.DeviceInfo)}, - processingCm: ConfigMap[*constant.DeviceInfo]{Data: make(map[string]*constant.DeviceInfo)}, - processedCm: ConfigMap[*constant.DeviceInfo]{Data: make(map[string]*constant.DeviceInfo)}, - cmBuffer: collector.DeviceCmCollectBuffer, + DeviceCenterCmManager = &FaultCenterCmManager[*constant.AdvanceDeviceFaultCm]{ + mutex: sync.RWMutex{}, + originalCm: ConfigMap[*constant.AdvanceDeviceFaultCm]{Data: make(map[string]*constant.AdvanceDeviceFaultCm)}, + processedCm: ConfigMap[*constant.AdvanceDeviceFaultCm]{Data: make(map[string]*constant.AdvanceDeviceFaultCm)}, + cmBuffer: collector.DeviceCmCollectBuffer, } SwitchCenterCmManager = &FaultCenterCmManager[*constant.SwitchInfo]{ - mutex: sync.RWMutex{}, - originalCm: ConfigMap[*constant.SwitchInfo]{Data: make(map[string]*constant.SwitchInfo)}, - processingCm: ConfigMap[*constant.SwitchInfo]{Data: make(map[string]*constant.SwitchInfo)}, - processedCm: ConfigMap[*constant.SwitchInfo]{Data: make(map[string]*constant.SwitchInfo)}, - cmBuffer: collector.SwitchCmCollectBuffer, + mutex: sync.RWMutex{}, + originalCm: ConfigMap[*constant.SwitchInfo]{Data: make(map[string]*constant.SwitchInfo)}, + processedCm: ConfigMap[*constant.SwitchInfo]{Data: make(map[string]*constant.SwitchInfo)}, + cmBuffer: collector.SwitchCmCollectBuffer, } NodeCenterCmManager = &FaultCenterCmManager[*constant.NodeInfo]{ - mutex: sync.RWMutex{}, - originalCm: ConfigMap[*constant.NodeInfo]{Data: make(map[string]*constant.NodeInfo)}, - processingCm: ConfigMap[*constant.NodeInfo]{Data: make(map[string]*constant.NodeInfo)}, - processedCm: ConfigMap[*constant.NodeInfo]{Data: make(map[string]*constant.NodeInfo)}, - cmBuffer: collector.NodeCmCollectBuffer, + mutex: sync.RWMutex{}, + originalCm: ConfigMap[*constant.NodeInfo]{Data: make(map[string]*constant.NodeInfo)}, + processedCm: ConfigMap[*constant.NodeInfo]{Data: make(map[string]*constant.NodeInfo)}, + cmBuffer: collector.NodeCmCollectBuffer, } } @@ -58,18 +54,6 @@ func (manager *FaultCenterCmManager[T]) GetOriginalCm() ConfigMap[T] { return manager.originalCm.deepCopy() } -func (manager *FaultCenterCmManager[T]) SetProcessingCm(cm ConfigMap[T]) { - manager.mutex.Lock() - defer manager.mutex.Unlock() - manager.processingCm = cm.deepCopy() -} - -func (manager *FaultCenterCmManager[T]) GetProcessingCm() ConfigMap[T] { - manager.mutex.RLock() - defer manager.mutex.RUnlock() - return manager.processingCm.deepCopy() -} - func (manager *FaultCenterCmManager[T]) SetProcessedCm(cm ConfigMap[T]) bool { manager.mutex.Lock() defer manager.mutex.Unlock() diff --git a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go index a6cd5cbe4..4977c35db 100644 --- a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go +++ b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go @@ -35,24 +35,16 @@ func TestFaultCenterCmManagerSetAndGetConfigmap(t *testing.T) { t.Errorf("TestFaultCenterCmManagerSetAndGetDeviceInfoCm failed") } faultManager := FaultCenterCmManager[*constant.DeviceInfo]{ - mutex: sync.RWMutex{}, - originalCm: ConfigMap[*constant.DeviceInfo]{}, - processingCm: ConfigMap[*constant.DeviceInfo]{}, - processedCm: ConfigMap[*constant.DeviceInfo]{}, + mutex: sync.RWMutex{}, + originalCm: ConfigMap[*constant.DeviceInfo]{}, + processedCm: ConfigMap[*constant.DeviceInfo]{}, } faultManager.updateOriginalCm(cm1, true) faultManager.updateOriginalCm(cm2, true) if !reflect.DeepEqual(deviceCM, faultManager.GetOriginalCm()) { t.Errorf("TestFaultCenterCmManagerSetAndGetDeviceInfoCm failed") } - faultManager.SetProcessingCm(faultManager.GetOriginalCm()) - if !reflect.DeepEqual(deviceCM, faultManager.GetProcessingCm()) { - t.Errorf("TestFaultCenterCmManagerSetAndGetDeviceInfoCm failed") - } - faultManager.SetProcessedCm(faultManager.GetProcessingCm()) - if !reflect.DeepEqual(deviceCM, faultManager.GetProcessedCm()) { - t.Errorf("TestFaultCenterCmManagerSetAndGetDeviceInfoCm failed") - } + faultManager.updateOriginalCm(cm1, false) faultManager.updateOriginalCm(cm2, false) if len(faultManager.GetOriginalCm().Data) != 0 { diff --git a/component/clusterd/pkg/domain/faultdomain/collector/cm_collector.go b/component/clusterd/pkg/domain/faultdomain/collector/cm_collector.go index 7ffe01740..8ebe5f4db 100644 --- a/component/clusterd/pkg/domain/faultdomain/collector/cm_collector.go +++ b/component/clusterd/pkg/domain/faultdomain/collector/cm_collector.go @@ -8,9 +8,10 @@ import ( "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" + "clusterd/pkg/domain/faultdomain" ) -var DeviceCmCollectBuffer *ConfigmapCollectBuffer[*constant.DeviceInfo] +var DeviceCmCollectBuffer *ConfigmapCollectBuffer[*constant.AdvanceDeviceFaultCm] var NodeCmCollectBuffer *ConfigmapCollectBuffer[*constant.NodeInfo] var SwitchCmCollectBuffer *ConfigmapCollectBuffer[*constant.SwitchInfo] @@ -21,10 +22,10 @@ type ConfigmapCollectBuffer[T constant.ConfigMapInterface] struct { } func init() { - DeviceCmCollectBuffer = &ConfigmapCollectBuffer[*constant.DeviceInfo]{ + DeviceCmCollectBuffer = &ConfigmapCollectBuffer[*constant.AdvanceDeviceFaultCm]{ mutex: sync.Mutex{}, - buffer: make(map[string]*[]constant.InformerCmItem[*constant.DeviceInfo]), - lastItem: make(map[string]constant.InformerCmItem[*constant.DeviceInfo]), + buffer: make(map[string]*[]constant.InformerCmItem[*constant.AdvanceDeviceFaultCm]), + lastItem: make(map[string]constant.InformerCmItem[*constant.AdvanceDeviceFaultCm]), } NodeCmCollectBuffer = &ConfigmapCollectBuffer[*constant.NodeInfo]{ mutex: sync.Mutex{}, @@ -86,7 +87,10 @@ func informerItemEqual[T constant.ConfigMapInterface](lastItem, newItem constant func informInfoUpdate(newInfo any, whichToInformer int, isAdd bool) { switch whichToInformer { case constant.DeviceProcessType: - DeviceCmCollectBuffer.Push(newInfo.(*constant.DeviceInfo), isAdd) + advanceFaultForNode := + faultdomain.GetAdvanceFaultForNode(newInfo.(*constant.DeviceInfo)).(*constant.AdvanceDeviceFaultCm) + faultdomain.SortDataForAdvanceDeviceInfo(advanceFaultForNode) + DeviceCmCollectBuffer.Push(advanceFaultForNode, isAdd) case constant.NodeProcessType: NodeCmCollectBuffer.Push(newInfo.(*constant.NodeInfo), isAdd) case constant.SwitchProcessType: diff --git a/component/clusterd/pkg/domain/faultdomain/collector/cm_collector_test.go b/component/clusterd/pkg/domain/faultdomain/collector/cm_collector_test.go index 17bf8d7d2..2824d1e97 100644 --- a/component/clusterd/pkg/domain/faultdomain/collector/cm_collector_test.go +++ b/component/clusterd/pkg/domain/faultdomain/collector/cm_collector_test.go @@ -17,10 +17,10 @@ const ( ) func resetDeviceCmCollector() { - DeviceCmCollectBuffer = &ConfigmapCollectBuffer[*constant.DeviceInfo]{ + DeviceCmCollectBuffer = &ConfigmapCollectBuffer[*constant.AdvanceDeviceFaultCm]{ mutex: sync.Mutex{}, - buffer: make(map[string]*[]constant.InformerCmItem[*constant.DeviceInfo]), - lastItem: make(map[string]constant.InformerCmItem[*constant.DeviceInfo]), + buffer: make(map[string]*[]constant.InformerCmItem[*constant.AdvanceDeviceFaultCm]), + lastItem: make(map[string]constant.InformerCmItem[*constant.AdvanceDeviceFaultCm]), } } func TestCmInfoCollector(t *testing.T) { diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 02452e5cb..e497c651f 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -7,12 +7,13 @@ import ( "encoding/json" "fmt" "reflect" + "slices" + "sort" "strings" "time" "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/utils/strings/slices" "ascend-common/api" "ascend-common/common-utils/hwlog" @@ -53,17 +54,16 @@ func CmNameToNodeName(cmName string) string { } // GetAdvanceFaultCm return more usable fault cm -func GetAdvanceFaultCm[T constant.ConfigMapInterface]( - cmInfos map[string]T) map[string]constant.AdvanceFaultConfigmap[T] { - result := make(map[string]constant.AdvanceFaultConfigmap[T]) +func GetAdvanceFaultCm[T, U constant.ConfigMapInterface]( + cmInfos map[string]T, result map[string]U) map[string]U { for _, info := range cmInfos { - result[CmNameToNodeName(info.GetCmName())] = GetAdvanceFaultForNode(info) + result[CmNameToNodeName(info.GetCmName())] = GetAdvanceFaultForNode(info).(U) } return result } // GetAdvanceFaultForNode return more usable fault cm for one node -func GetAdvanceFaultForNode[T constant.ConfigMapInterface](cmForNode T) constant.AdvanceFaultConfigmap[T] { +func GetAdvanceFaultForNode[T constant.ConfigMapInterface](cmForNode T) constant.ConfigMapInterface { switch cm := any(cmForNode).(type) { case *constant.DeviceInfo: return GetAdvanceDeviceCm(cm) @@ -71,6 +71,8 @@ func GetAdvanceFaultForNode[T constant.ConfigMapInterface](cmForNode T) constant return cm case *constant.SwitchInfo: return cm + case *constant.AdvanceDeviceFaultCm: + return cm } hwlog.RunLog.Errorf("cmForNode type is not support.") return nil @@ -305,18 +307,17 @@ func AddFaultIntoFaultMap(faultMap map[string][]constant.DeviceFault, return faultMap } -func AdvanceFaultCmToOriginalFaultCm[T constant.ConfigMapInterface]( - advanceFaultCm map[string]constant.AdvanceFaultConfigmap[T]) map[string]T { - result := make(map[string]T) +func AdvanceFaultCmToOriginalFaultCm[T, U constant.ConfigMapInterface]( + advanceFaultCm map[string]T, orgFaultCm map[string]U) map[string]U { for _, advanceCmForNode := range advanceFaultCm { - result[advanceCmForNode.GetCmName()] = AdvanceFaultCmToOriginalCmForNode(advanceCmForNode).(T) + orgFaultCm[advanceCmForNode.GetCmName()] = AdvanceFaultCmToOriginalCmForNode(advanceCmForNode).(U) } - return result + return orgFaultCm } func AdvanceFaultCmToOriginalCmForNode[T constant.ConfigMapInterface]( - advanceCmForNode constant.AdvanceFaultConfigmap[T]) constant.ConfigMapInterface { - switch cm := advanceCmForNode.(type) { + advanceCmForNode T) constant.ConfigMapInterface { + switch cm := any(advanceCmForNode).(type) { case *constant.AdvanceDeviceFaultCm: return AdvanceDeviceFaultCmToOriginalCmForNode(cm) case *constant.SwitchInfo: @@ -593,3 +594,63 @@ func ValidBusinessRecoverTime(recoverTime int64) bool { func ValidBusinessUceReportInfo(info *constant.ReportInfo) bool { return ValidBusinessRecoverTime(info.RecoverTime) } + +func SortDataForAdvanceDeviceInfo(deviceInfo *constant.AdvanceDeviceFaultCm) { + sort.Strings(deviceInfo.AvailableDevices) + sort.Strings(deviceInfo.CardUnHealthy) + sort.Strings(deviceInfo.NetworkUnhealthy) + sort.Strings(deviceInfo.Recovering) + for _, faultList := range deviceInfo.FaultDeviceList { + slices.SortFunc(faultList, compareDeviceFault) + } +} + +func compareDeviceFault(a, b constant.DeviceFault) int { + if res := strings.Compare(a.FaultType, b.FaultType); res != 0 { + return res + } + if res := strings.Compare(a.NPUName, b.NPUName); res != 0 { + return res + } + if res := strings.Compare(a.LargeModelFaultLevel, b.LargeModelFaultLevel); res != 0 { + return res + } + if res := strings.Compare(a.FaultLevel, b.FaultLevel); res != 0 { + return res + } + if res := strings.Compare(a.FaultHandling, b.FaultHandling); res != 0 { + return res + } + if res := strings.Compare(a.FaultCode, b.FaultCode); res != 0 { + return res + } + keysA := getSortedKeys(a.FaultTimeAndLevelMap) + keysB := getSortedKeys(b.FaultTimeAndLevelMap) + for i := 0; i < len(keysA); i++ { + if cmp := strings.Compare(keysA[i], keysB[i]); cmp != 0 { + return cmp + } + valA := a.FaultTimeAndLevelMap[keysA[i]] + valB := b.FaultTimeAndLevelMap[keysB[i]] + if cmp := compareFaultTimeAndLevel(valA, valB); cmp != 0 { + return cmp + } + } + return 0 +} + +func compareFaultTimeAndLevel(a, b constant.FaultTimeAndLevel) int { + if res := a.FaultTime - b.FaultTime; res != 0 { + return int(res) + } + return strings.Compare(a.FaultLevel, b.FaultLevel) +} + +func getSortedKeys[T any](m map[string]T) []string { + keys := make([]string, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} -- Gitee From a6e9e97a3c607e0bcb3cc417487ae29baacaf7bf Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Sat, 19 Apr 2025 00:52:10 +0800 Subject: [PATCH 07/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20IsSame=20bug=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cmprocess/uce/uce_fault_processor_test.go | 13 +++++++------ .../uce_accompany_fault_processor_test.go | 4 ++-- .../relationfault/relation_fault_process_test.go | 8 +++----- component/clusterd/pkg/common/constant/type.go | 11 ++++++++++- .../clusterd/pkg/domain/faultdomain/fault_utils.go | 12 +----------- .../pkg/domain/faultdomain/fault_utils_test.go | 2 +- 6 files changed, 24 insertions(+), 26 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go index 50d9753f1..d5351af68 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go @@ -289,7 +289,7 @@ func TestUceFaultProcessorGetUceDeviceOfNodes(t *testing.T) { t.Errorf("init data failed. %v", testFileErr) } - UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceDeviceCmForNodeMap(cmDeviceInfos) + UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) deviceOfNodes := UceProcessor.getUceDeviceOfNodes() if !reflect.DeepEqual(deviceOfNodes, uceNodesInfos) { t.Errorf("getUceDeviceOfNodes() = %v, want %v", @@ -306,7 +306,7 @@ func TestUceFaultProcessorGetUceDevicesForUceTolerateJobs(t *testing.T) { } UceProcessor.jobServerInfoMap = jobServerInfoMap - UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceDeviceCmForNodeMap(cmDeviceInfos) + UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) UceProcessor.uceDeviceOfNode = UceProcessor.getUceDeviceOfNodes() UceProcessor.uceDevicesOfUceJob = UceProcessor.getUceDevicesForUceTolerateJobs() if !reflect.DeepEqual(UceProcessor.uceDevicesOfUceJob, expectUceJobsInfo) { @@ -324,13 +324,14 @@ func TestUceFaultProcessorProcessUceFaultInfo(t *testing.T) { } UceProcessor.jobServerInfoMap = jobServerInfoMap - UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceDeviceCmForNodeMap(cmDeviceInfos) + UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) UceProcessor.uceDeviceOfNode = UceProcessor.getUceDeviceOfNodes() UceProcessor.uceDevicesOfUceJob = UceProcessor.getUceDevicesForUceTolerateJobs() currentTime := 109 * time.Second.Milliseconds() UceProcessor.processUceFaultInfo(currentTime) result := UceProcessor.nodeDeviceCmMap - want := faultdomain.GetAdvanceDeviceCmForNodeMap(expectProcessedDeviceInfos) + want := faultdomain.GetAdvanceFaultCm( + expectProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) if !reflect.DeepEqual(result, want) { t.Errorf("result:\n%v\n\nwant:\n%v", util.ObjToString(result), util.ObjToString(want)) @@ -348,13 +349,13 @@ func TestUceFaultProcessorScenario1(t *testing.T) { collector.ReportInfoCollector = reportInfos UceProcessor.jobServerInfoMap = jobServerInfoMap - UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceDeviceCmForNodeMap(cmDeviceInfos) + UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) UceProcessor.uceDeviceOfNode = UceProcessor.getUceDeviceOfNodes() UceProcessor.uceDevicesOfUceJob = UceProcessor.getUceDevicesForUceTolerateJobs() currentTime := 100 * time.Second.Milliseconds() UceProcessor.processUceFaultInfo(currentTime) result := UceProcessor.nodeDeviceCmMap - want := faultdomain.GetAdvanceDeviceCmForNodeMap(expectProcessedDeviceInfos) + want := faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) if !reflect.DeepEqual(result, want) { t.Errorf("processUceFaultInfo() = %v, \n\nwant %v", util.ObjToString(result), util.ObjToString(want)) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go index aed74dc59..b36b380bf 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go @@ -36,11 +36,11 @@ func TestUceAccompanyFaultProcessorProcess(t *testing.T) { if testFileErr != nil { t.Errorf("init data failed. %v", testFileErr) } - UceAccompanyProcessor.deviceCmForNodeMap = faultdomain.GetAdvanceDeviceCmForNodeMap(cmDeviceInfos) + UceAccompanyProcessor.deviceCmForNodeMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) UceAccompanyProcessor.uceAccompanyFaultInQue() UceAccompanyProcessor.filterFaultInfos(CurrentTime) if !reflect.DeepEqual(UceAccompanyProcessor.deviceCmForNodeMap, - faultdomain.GetAdvanceDeviceCmForNodeMap(expectProcessedDeviceInfos)) { + faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm))) { t.Errorf("result = %v, want %v", util.ObjToString(cmDeviceInfos), util.ObjToString(expectProcessedDeviceInfos)) } diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go index 8546f984c..3d06b5767 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go @@ -16,7 +16,6 @@ import ( "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" "clusterd/pkg/common/util" - "clusterd/pkg/domain/faultdomain" "clusterd/pkg/domain/job" "clusterd/pkg/interface/kube" ) @@ -735,11 +734,10 @@ func TestInitFaultJobs(t *testing.T) { func testEmptyServerList(processor *relationFaultProcessor) { convey.Convey("When server list is empty", func() { - patches := gomonkey.ApplyFunc(faultdomain.GetAdvanceDeviceCmForNodeMap, - func(deviceInfoCm map[string]*constant.DeviceInfo) map[string]*constant.AdvanceDeviceFaultCm { - return map[string]*constant.AdvanceDeviceFaultCm{"node1": {SuperPodID: 1}} - }) + patches := gomonkey.NewPatches() defer patches.Reset() + deviceInfo := map[string]*constant.AdvanceDeviceFaultCm{"node1": {SuperPodID: 1}} + processor.deviceInfoCm = deviceInfo patches.ApplyFunc(job.GetJobServerInfoMap, func() constant.JobServerInfoMap { return constant.JobServerInfoMap{ diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index 665a2a28a..818059240 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -230,8 +230,17 @@ type AdvanceDeviceFaultCm struct { UpdateTime int64 } +// IsSame compare two AdvanceDeviceFaultCm, do not care UpdateTime func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { - return reflect.DeepEqual(cm, another.(*AdvanceDeviceFaultCm)) + thisUpdateTime := cm.UpdateTime + thatCm := another.(*AdvanceDeviceFaultCm) + thatUpdateTime := thatCm.UpdateTime + cm.UpdateTime = 0 + thatCm.UpdateTime = 0 + isSame := reflect.DeepEqual(cm, thatCm) + cm.UpdateTime = thisUpdateTime + thatCm.UpdateTime = thatUpdateTime + return isSame } // GetCmName return cm name diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index e497c651f..a56247487 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -53,7 +53,7 @@ func CmNameToNodeName(cmName string) string { return strings.TrimPrefix(cmName, constant.DeviceInfoPrefix) } -// GetAdvanceFaultCm return more usable fault cm +// GetAdvanceFaultCm return more usable fault cm, ONLY FOR TESTCASE func GetAdvanceFaultCm[T, U constant.ConfigMapInterface]( cmInfos map[string]T, result map[string]U) map[string]U { for _, info := range cmInfos { @@ -78,16 +78,6 @@ func GetAdvanceFaultForNode[T constant.ConfigMapInterface](cmForNode T) constant return nil } -// GetAdvanceDeviceCmForNodeMap get advance device cm for node map -func GetAdvanceDeviceCmForNodeMap( - deviceInfoCms map[string]*constant.DeviceInfo) map[string]*constant.AdvanceDeviceFaultCm { - advanceDeviceCmForNodeMap := make(map[string]*constant.AdvanceDeviceFaultCm) - for _, deviceInfo := range deviceInfoCms { - advanceDeviceCmForNodeMap[CmNameToNodeName(deviceInfo.CmName)] = GetAdvanceDeviceCm(deviceInfo) - } - return advanceDeviceCmForNodeMap -} - // GetAdvanceDeviceCm return more usable device cm func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) *constant.AdvanceDeviceFaultCm { advanceDeviceCm := &constant.AdvanceDeviceFaultCm{ diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index ae8658464..4290e31a1 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -446,7 +446,7 @@ func TestGetAdvanceDeviceCmForNodeMap(t *testing.T) { cmName: originalDeviceCm, } t.Run("TestGetAdvanceDeviceConfigmap", func(t *testing.T) { - got := GetAdvanceDeviceCmForNodeMap(deviceInfoCms) + got := GetAdvanceFaultCm(deviceInfoCms, make(map[string]*constant.AdvanceDeviceFaultCm)) if len(got[nodeName].FaultDeviceList[deviceName]) != originalDeviceFaultCodeCnt { t.Error("TestGetAdvanceDeviceConfigmap fail") } -- Gitee From d3eed1e14a2f25cfe9c96afd0a58bcfb22be4a3d Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Tue, 22 Apr 2025 15:46:11 +0800 Subject: [PATCH 08/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20=E7=B1=BB=E5=9E=8B=E6=8E=A8?= =?UTF-8?q?=E5=88=B0=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../publicfault/pub_fault_processor_test.go | 14 +++++++------- .../cmprocess/uce/uce_fault_processor_test.go | 17 ++++++++--------- .../uce_accompany_fault_processor_test.go | 8 ++++---- .../clusterd/pkg/application/resource/report.go | 8 ++++---- .../pkg/domain/faultdomain/fault_utils.go | 10 ++++++---- .../pkg/domain/faultdomain/fault_utils_test.go | 2 +- 6 files changed, 30 insertions(+), 29 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go index 3d35dceb0..d15895fba 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go @@ -53,7 +53,7 @@ func TestProcessor(t *testing.T) { func testNilCache() { resetFaultCache() ori := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1, make(map[string]*constant.AdvanceDeviceFaultCm)), + AllConfigmap: faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](oriDevInfo1), UpdateConfigmap: nil, } res := PubFaultProcessor.Process(ori) @@ -71,7 +71,7 @@ func testNodeNameInvalid() { resetFaultCache() publicfault.PubFaultCache.AddPubFaultToCache(&testCacheData, testNodeName3, faultKey1) content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1, make(map[string]*constant.AdvanceDeviceFaultCm)), + AllConfigmap: faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](oriDevInfo1), UpdateConfigmap: nil, } exp := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) @@ -82,12 +82,12 @@ func testDiff() { resetFaultCache() publicfault.PubFaultCache.AddPubFaultToCache(&testCacheData, testNodeName1, faultKey1) content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo1, make(map[string]*constant.AdvanceDeviceFaultCm)), + AllConfigmap: faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](oriDevInfo1), UpdateConfigmap: nil, } resContent := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) sortDeviceFaultList(resContent.AllConfigmap) - want := faultdomain.GetAdvanceFaultCm(expDeviceInfo1, make(map[string]*constant.AdvanceDeviceFaultCm)) + want := faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expDeviceInfo1) sortDeviceFaultList(want) result := resContent.AllConfigmap convey.So(result, convey.ShouldResemble, want) @@ -99,16 +99,16 @@ func testCommon() { testCacheData.FaultDevIds = []int32{0, card5} publicfault.PubFaultCache.AddPubFaultToCache(&testCacheData, testNodeName2, faultKey2) content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(oriDevInfo2, make(map[string]*constant.AdvanceDeviceFaultCm)), + AllConfigmap: faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](oriDevInfo2), UpdateConfigmap: nil, } resContent := PubFaultProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) hwlog.RunLog.Infof(util.ObjToString(resContent.AllConfigmap)) hwlog.RunLog.Infof( - util.ObjToString(faultdomain.GetAdvanceFaultCm(expDeviceInfo2, make(map[string]*constant.AdvanceDeviceFaultCm)))) + util.ObjToString(faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expDeviceInfo2))) sortDeviceFaultList(resContent.AllConfigmap) result := resContent.AllConfigmap - want := faultdomain.GetAdvanceFaultCm(expDeviceInfo2, make(map[string]*constant.AdvanceDeviceFaultCm)) + want := faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expDeviceInfo2) sortDeviceFaultList(want) convey.So(result, convey.ShouldResemble, want) } diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go index d5351af68..07dfebca0 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go @@ -289,7 +289,7 @@ func TestUceFaultProcessorGetUceDeviceOfNodes(t *testing.T) { t.Errorf("init data failed. %v", testFileErr) } - UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos) deviceOfNodes := UceProcessor.getUceDeviceOfNodes() if !reflect.DeepEqual(deviceOfNodes, uceNodesInfos) { t.Errorf("getUceDeviceOfNodes() = %v, want %v", @@ -306,7 +306,7 @@ func TestUceFaultProcessorGetUceDevicesForUceTolerateJobs(t *testing.T) { } UceProcessor.jobServerInfoMap = jobServerInfoMap - UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos) UceProcessor.uceDeviceOfNode = UceProcessor.getUceDeviceOfNodes() UceProcessor.uceDevicesOfUceJob = UceProcessor.getUceDevicesForUceTolerateJobs() if !reflect.DeepEqual(UceProcessor.uceDevicesOfUceJob, expectUceJobsInfo) { @@ -324,14 +324,13 @@ func TestUceFaultProcessorProcessUceFaultInfo(t *testing.T) { } UceProcessor.jobServerInfoMap = jobServerInfoMap - UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos) UceProcessor.uceDeviceOfNode = UceProcessor.getUceDeviceOfNodes() UceProcessor.uceDevicesOfUceJob = UceProcessor.getUceDevicesForUceTolerateJobs() currentTime := 109 * time.Second.Milliseconds() UceProcessor.processUceFaultInfo(currentTime) result := UceProcessor.nodeDeviceCmMap - want := faultdomain.GetAdvanceFaultCm( - expectProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + want := faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expectProcessedDeviceInfos) if !reflect.DeepEqual(result, want) { t.Errorf("result:\n%v\n\nwant:\n%v", util.ObjToString(result), util.ObjToString(want)) @@ -349,13 +348,13 @@ func TestUceFaultProcessorScenario1(t *testing.T) { collector.ReportInfoCollector = reportInfos UceProcessor.jobServerInfoMap = jobServerInfoMap - UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + UceProcessor.nodeDeviceCmMap = faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos) UceProcessor.uceDeviceOfNode = UceProcessor.getUceDeviceOfNodes() UceProcessor.uceDevicesOfUceJob = UceProcessor.getUceDevicesForUceTolerateJobs() currentTime := 100 * time.Second.Milliseconds() UceProcessor.processUceFaultInfo(currentTime) result := UceProcessor.nodeDeviceCmMap - want := faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + want := faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expectProcessedDeviceInfos) if !reflect.DeepEqual(result, want) { t.Errorf("processUceFaultInfo() = %v, \n\nwant %v", util.ObjToString(result), util.ObjToString(want)) @@ -371,7 +370,7 @@ func TestUceFaultProcessorScenario2(t *testing.T) { t.Errorf("init data failed. %v", testFileErr) } content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)), + AllConfigmap: faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos), UpdateConfigmap: []constant.InformerCmItem[*constant.AdvanceDeviceFaultCm]{ { IsAdd: false, @@ -397,7 +396,7 @@ func TestUceFaultProcessorScenario2(t *testing.T) { resultContent := UceProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) result := resultContent.AllConfigmap - want := faultdomain.GetAdvanceFaultCm(expProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + want := faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expProcessedDeviceInfos) if !reflect.DeepEqual(result, want) { t.Errorf("processUceFaultInfo() = %v, want %v", util.ObjToString(result), util.ObjToString(want)) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go index b36b380bf..4d2a927d6 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go @@ -36,11 +36,11 @@ func TestUceAccompanyFaultProcessorProcess(t *testing.T) { if testFileErr != nil { t.Errorf("init data failed. %v", testFileErr) } - UceAccompanyProcessor.deviceCmForNodeMap = faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + UceAccompanyProcessor.deviceCmForNodeMap = faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos) UceAccompanyProcessor.uceAccompanyFaultInQue() UceAccompanyProcessor.filterFaultInfos(CurrentTime) if !reflect.DeepEqual(UceAccompanyProcessor.deviceCmForNodeMap, - faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm))) { + faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expectProcessedDeviceInfos)) { t.Errorf("result = %v, want %v", util.ObjToString(cmDeviceInfos), util.ObjToString(expectProcessedDeviceInfos)) } @@ -60,7 +60,7 @@ func TestUceAccompanyFaultProcessorProcessE2E(t *testing.T) { t.Errorf("init data failed. %v", testFileErr) } content := constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]{ - AllConfigmap: faultdomain.GetAdvanceFaultCm(cmDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)), + AllConfigmap: faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos), UpdateConfigmap: []constant.InformerCmItem[*constant.AdvanceDeviceFaultCm]{{}}, } mockTime := time.Time{} @@ -76,7 +76,7 @@ func TestUceAccompanyFaultProcessorProcessE2E(t *testing.T) { }() resultContent := UceAccompanyProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) - exp := faultdomain.GetAdvanceFaultCm(expectProcessedDeviceInfos, make(map[string]*constant.AdvanceDeviceFaultCm)) + exp := faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expectProcessedDeviceInfos) if !reflect.DeepEqual(resultContent.AllConfigmap, exp) { t.Errorf("result:\n%v\nwant:\n%v", util.ObjToString(resultContent.AllConfigmap), diff --git a/component/clusterd/pkg/application/resource/report.go b/component/clusterd/pkg/application/resource/report.go index a1db2bb66..8dedc21b5 100644 --- a/component/clusterd/pkg/application/resource/report.go +++ b/component/clusterd/pkg/application/resource/report.go @@ -66,8 +66,8 @@ func Report(ctx context.Context) { }) switch whichToReport { case constant.DeviceProcessType: - deviceArr := device.GetSafeData(faultdomain.AdvanceFaultCmToOriginalFaultCm( - faultmanager.QueryDeviceInfoToReport(), make(map[string]*constant.DeviceInfo))) + deviceArr := device.GetSafeData(faultdomain.AdvanceFaultCmToOriginalFaultCm[*constant.DeviceInfo]( + faultmanager.QueryDeviceInfoToReport())) updateDeviceInfoCm(deviceArr) case constant.NodeProcessType: nodeArr := node.GetSafeData(faultmanager.QueryNodeInfoToReport()) @@ -76,8 +76,8 @@ func Report(ctx context.Context) { switchArr := switchinfo.GetSafeData(faultmanager.QuerySwitchInfoToReport()) updateSwitchInfoCm(switchArr) case constant.AllProcessType: - deviceArr := device.GetSafeData(faultdomain.AdvanceFaultCmToOriginalFaultCm( - faultmanager.QueryDeviceInfoToReport(), make(map[string]*constant.DeviceInfo))) + deviceArr := device.GetSafeData(faultdomain.AdvanceFaultCmToOriginalFaultCm[*constant.DeviceInfo]( + faultmanager.QueryDeviceInfoToReport())) nodeArr := node.GetSafeData(faultmanager.QueryNodeInfoToReport()) switchArr := switchinfo.GetSafeData(faultmanager.QuerySwitchInfoToReport()) updateAllCm(deviceArr, nodeArr, switchArr) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index a56247487..7359faca2 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -54,8 +54,9 @@ func CmNameToNodeName(cmName string) string { } // GetAdvanceFaultCm return more usable fault cm, ONLY FOR TESTCASE -func GetAdvanceFaultCm[T, U constant.ConfigMapInterface]( - cmInfos map[string]T, result map[string]U) map[string]U { +func GetAdvanceFaultCm[U, T constant.ConfigMapInterface]( + cmInfos map[string]T) map[string]U { + result := make(map[string]U) for _, info := range cmInfos { result[CmNameToNodeName(info.GetCmName())] = GetAdvanceFaultForNode(info).(U) } @@ -297,8 +298,9 @@ func AddFaultIntoFaultMap(faultMap map[string][]constant.DeviceFault, return faultMap } -func AdvanceFaultCmToOriginalFaultCm[T, U constant.ConfigMapInterface]( - advanceFaultCm map[string]T, orgFaultCm map[string]U) map[string]U { +func AdvanceFaultCmToOriginalFaultCm[U, T constant.ConfigMapInterface]( + advanceFaultCm map[string]T) map[string]U { + orgFaultCm := make(map[string]U) for _, advanceCmForNode := range advanceFaultCm { orgFaultCm[advanceCmForNode.GetCmName()] = AdvanceFaultCmToOriginalCmForNode(advanceCmForNode).(U) } diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index 4290e31a1..aaa7faa5f 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -446,7 +446,7 @@ func TestGetAdvanceDeviceCmForNodeMap(t *testing.T) { cmName: originalDeviceCm, } t.Run("TestGetAdvanceDeviceConfigmap", func(t *testing.T) { - got := GetAdvanceFaultCm(deviceInfoCms, make(map[string]*constant.AdvanceDeviceFaultCm)) + got := GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](deviceInfoCms) if len(got[nodeName].FaultDeviceList[deviceName]) != originalDeviceFaultCodeCnt { t.Error("TestGetAdvanceDeviceConfigmap fail") } -- Gitee From 867af7e5e5f6c8b12f05a42c860b4e5e8ee58721 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Tue, 22 Apr 2025 17:00:41 +0800 Subject: [PATCH 09/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20=E6=B5=8B=E8=AF=95=E7=94=A8?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cmmanager/configmap_manager_test.go | 70 +++++ .../pkg/domain/faultdomain/fault_utils.go | 17 +- .../domain/faultdomain/fault_utils_test.go | 268 ++++++++++++++++++ 3 files changed, 347 insertions(+), 8 deletions(-) diff --git a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go index 4977c35db..16c681d0f 100644 --- a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go +++ b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go @@ -4,6 +4,10 @@ package cmmanager import ( + "clusterd/pkg/common/util" + "clusterd/pkg/domain/faultdomain/collector" + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" "reflect" "sync" "testing" @@ -52,3 +56,69 @@ func TestFaultCenterCmManagerSetAndGetConfigmap(t *testing.T) { } }) } + +func TestConfigMapEqual(t *testing.T) { + patches := gomonkey.NewPatches() + defer patches.Reset() + + convey.Convey("Test ConfigMap.equal()", t, func() { + cm1 := ConfigMap[*constant.AdvanceDeviceFaultCm]{ + Data: map[string]*constant.AdvanceDeviceFaultCm{ + "node": { + FaultDeviceList: nil, + AvailableDevices: []string{"2", "1"}, + Recovering: []string{"4", "3"}, + CardUnHealthy: []string{"6", "5"}, + NetworkUnhealthy: []string{"6", "5"}, + UpdateTime: 10, + }, + }, + } + cm2 := new(ConfigMap[*constant.AdvanceDeviceFaultCm]) + util.DeepCopy(cm2, cm1) + + convey.Convey("should return true for equal config maps", func() { + convey.So(cm1.equal(*cm2), convey.ShouldBeTrue) + }) + }) +} + +func TestUpdateBatchOriginalCm(t *testing.T) { + patches := gomonkey.NewPatches() + defer patches.Reset() + + convey.Convey("Test UpdateBatchOriginalCm()", t, func() { + manager := FaultCenterCmManager[*constant.AdvanceDeviceFaultCm]{ + mutex: sync.RWMutex{}, + cmBuffer: &collector.ConfigmapCollectBuffer[*constant.AdvanceDeviceFaultCm]{}, + originalCm: ConfigMap[*constant.AdvanceDeviceFaultCm]{}, + processedCm: ConfigMap[*constant.AdvanceDeviceFaultCm]{}, + } + manager.UpdateBatchOriginalCm() + convey.So(manager.originalCm.Data, convey.ShouldBeEmpty) + }) +} + +func TestSetAndGetProcessedCm(t *testing.T) { + patches := gomonkey.NewPatches() + defer patches.Reset() + + convey.Convey("Test SetProcessedCm() and GetProcessedCm()", t, func() { + manager := FaultCenterCmManager[*constant.AdvanceDeviceFaultCm]{ + mutex: sync.RWMutex{}, + cmBuffer: &collector.ConfigmapCollectBuffer[*constant.AdvanceDeviceFaultCm]{}, + originalCm: ConfigMap[*constant.AdvanceDeviceFaultCm]{}, + processedCm: ConfigMap[*constant.AdvanceDeviceFaultCm]{}, + } + cm := ConfigMap[*constant.AdvanceDeviceFaultCm]{ + Data: map[string]*constant.AdvanceDeviceFaultCm{ + "node": {}, + }, + } + manager.SetProcessedCm(cm) + convey.So(manager.processedCm, convey.ShouldResemble, cm) + + processedCm := manager.GetProcessedCm() + convey.So(processedCm, convey.ShouldResemble, cm) + }) +} diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 7359faca2..28998d74a 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -298,8 +298,7 @@ func AddFaultIntoFaultMap(faultMap map[string][]constant.DeviceFault, return faultMap } -func AdvanceFaultCmToOriginalFaultCm[U, T constant.ConfigMapInterface]( - advanceFaultCm map[string]T) map[string]U { +func AdvanceFaultCmToOriginalFaultCm[U, T constant.ConfigMapInterface](advanceFaultCm map[string]T) map[string]U { orgFaultCm := make(map[string]U) for _, advanceCmForNode := range advanceFaultCm { orgFaultCm[advanceCmForNode.GetCmName()] = AdvanceFaultCmToOriginalCmForNode(advanceCmForNode).(U) @@ -307,8 +306,7 @@ func AdvanceFaultCmToOriginalFaultCm[U, T constant.ConfigMapInterface]( return orgFaultCm } -func AdvanceFaultCmToOriginalCmForNode[T constant.ConfigMapInterface]( - advanceCmForNode T) constant.ConfigMapInterface { +func AdvanceFaultCmToOriginalCmForNode[T constant.ConfigMapInterface](advanceCmForNode T) constant.ConfigMapInterface { switch cm := any(advanceCmForNode).(type) { case *constant.AdvanceDeviceFaultCm: return AdvanceDeviceFaultCmToOriginalCmForNode(cm) @@ -324,10 +322,13 @@ func AdvanceFaultCmToOriginalCmForNode[T constant.ConfigMapInterface]( // AdvanceDeviceFaultCmToOriginalCmForNode convert advance device cm to original format func AdvanceDeviceFaultCmToOriginalCmForNode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) *constant.DeviceInfo { orgDeviceCm := &constant.DeviceInfo{ - DeviceInfoNoName: constant.DeviceInfoNoName{}, - CmName: advanceDeviceCm.CmName, - SuperPodID: advanceDeviceCm.SuperPodID, - ServerIndex: advanceDeviceCm.ServerIndex, + DeviceInfoNoName: constant.DeviceInfoNoName{ + DeviceList: make(map[string]string), + UpdateTime: 0, + }, + CmName: advanceDeviceCm.CmName, + SuperPodID: advanceDeviceCm.SuperPodID, + ServerIndex: advanceDeviceCm.ServerIndex, } FixUnhealthyInfo(advanceDeviceCm) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index aaa7faa5f..e312d3117 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -10,6 +10,7 @@ import ( "time" "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" "k8s.io/api/core/v1" "ascend-common/common-utils/hwlog" @@ -554,3 +555,270 @@ func TestIsFaultDeletable(t *testing.T) { } }) } + +// TestGetAdvanceFaultForNode test get advance fault info for node +func TestGetAdvanceFaultForNode(t *testing.T) { + patches := gomonkey.NewPatches() + defer patches.Reset() + + convey.Convey("Test GetAdvanceFaultForNode", t, func() { + convey.Convey("Case 1: DeviceInfo input", func() { + deviceInfo := &constant.DeviceInfo{} + expected := &constant.AdvanceDeviceFaultCm{} + + patches.ApplyFunc(GetAdvanceDeviceCm, func(*constant.DeviceInfo) *constant.AdvanceDeviceFaultCm { + return expected + }) + + result := GetAdvanceFaultForNode(deviceInfo) + convey.So(result, convey.ShouldEqual, expected) + }) + + convey.Convey("Case 2: NodeInfo input", func() { + nodeInfo := &constant.NodeInfo{} + result := GetAdvanceFaultForNode(nodeInfo) + convey.So(result, convey.ShouldEqual, nodeInfo) + }) + + convey.Convey("Case 3: SwitchInfo input", func() { + switchInfo := &constant.SwitchInfo{} + result := GetAdvanceFaultForNode(switchInfo) + convey.So(result, convey.ShouldEqual, switchInfo) + }) + + convey.Convey("Case 4: AdvanceDeviceFaultCm input", func() { + advanceCm := &constant.AdvanceDeviceFaultCm{} + result := GetAdvanceFaultForNode(advanceCm) + convey.So(result, convey.ShouldEqual, advanceCm) + }) + }) +} + +func TestAdvanceFaultCmToOriginalFaultCm(t *testing.T) { + convey.Convey("Test AdvanceFaultCmToOriginalFaultCm", t, func() { + node1 := "node1" + node2 := "node2" + mockAdvanceCm1 := &constant.AdvanceDeviceFaultCm{ + ServerType: "", + CmName: "CmName-" + node1, + SuperPodID: 0, + ServerIndex: 0, + FaultDeviceList: make(map[string][]constant.DeviceFault), + AvailableDevices: []string{"xxx"}, + Recovering: []string{"xxx"}, + CardUnHealthy: []string{"xxx"}, + NetworkUnhealthy: []string{"xxx"}, + UpdateTime: 0, + } + mockAdvanceCm2 := new(constant.AdvanceDeviceFaultCm) + util.DeepCopy(mockAdvanceCm1, mockAdvanceCm2) + mockAdvanceCm2.CmName = "CmName-" + node2 + + convey.Convey("should convert map correctly", func() { + input := map[string]constant.ConfigMapInterface{ + node1: mockAdvanceCm1, + node2: mockAdvanceCm2, + } + + result := AdvanceFaultCmToOriginalFaultCm[*constant.DeviceInfo](input) + + convey.So(len(result), convey.ShouldEqual, 2) + }) + }) +} + +func TestGetSortedKeys(t *testing.T) { + patches := gomonkey.NewPatches() + defer patches.Reset() + + convey.Convey("Test getSortedKeys", t, func() { + convey.Convey("should return sorted keys for string-struct map", func() { + type testStruct struct{ val int } + input := map[string]testStruct{ + "zebra": {1}, + "lion": {2}, + "ape": {3}, + } + expected := []string{"ape", "lion", "zebra"} + result := getSortedKeys(input) + convey.So(result, convey.ShouldResemble, expected) + }) + + convey.Convey("should handle empty map", func() { + input := map[string]float64{} + result := getSortedKeys(input) + convey.So(result, convey.ShouldBeEmpty) + }) + }) +} + +func TestCompareFaultTimeAndLevel(t *testing.T) { + convey.Convey("Test compareFaultTimeAndLevel", t, func() { + convey.Convey("should compare by FaultTime first", func() { + a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "low"} + b := constant.FaultTimeAndLevel{FaultTime: 200, FaultLevel: "high"} + convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldBeLessThan, 0) + }) + + convey.Convey("should compare by FaultLevel when FaultTime equal", func() { + a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "low"} + b := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "high"} + convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldBeGreaterThan, 0) + }) + + convey.Convey("should return 0 when both equal", func() { + a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "medium"} + b := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "medium"} + convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldEqual, 0) + }) + }) +} + +func TestCompareDeviceFault(t *testing.T) { + convey.Convey("Test compareDeviceFault", t, func() { + baseFault := constant.DeviceFault{ + FaultType: constant.CardUnhealthy, + NPUName: "npu0", + LargeModelFaultLevel: constant.SubHealthFault, + FaultLevel: constant.SubHealthFault, + FaultHandling: constant.SubHealthFault, + FaultCode: constant.AicFaultCode, + FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ + constant.AicFaultCode: { + FaultTime: 0, + FaultLevel: constant.SubHealthFault, + }, + }, + } + + convey.Convey("should compare by FaultType first", func() { + f1 := baseFault + f2 := baseFault + convey.So(compareDeviceFault(f1, f2), convey.ShouldEqual, 0) + }) + + convey.Convey("should compare by NPUName when FaultType equal", func() { + f1 := baseFault + f2 := baseFault + f2.NPUName = "npu1" + convey.So(compareDeviceFault(f1, f2), convey.ShouldBeLessThan, 0) + }) + + convey.Convey("should compare FaultTimeAndLevelMap when all fields equal", func() { + f1 := baseFault + f1.FaultTimeAndLevelMap = map[string]constant.FaultTimeAndLevel{ + "key1": {FaultTime: 100, FaultLevel: "low"}, + } + f2 := baseFault + f2.FaultTimeAndLevelMap = map[string]constant.FaultTimeAndLevel{ + "key1": {FaultTime: 200, FaultLevel: "low"}, + } + convey.So(compareDeviceFault(f1, f2), convey.ShouldBeLessThan, 0) + }) + }) +} + +func TestSortDataForAdvanceDeviceInfo(t *testing.T) { + convey.Convey("Test SortDataForAdvanceDeviceInfo", t, func() { + // Mock dependencies + + deviceInfo := &constant.AdvanceDeviceFaultCm{ + AvailableDevices: []string{"d3", "d1", "d2"}, + CardUnHealthy: []string{"c2", "c1"}, + NetworkUnhealthy: []string{"n2", "n1"}, + Recovering: []string{"r2", "r1"}, + FaultDeviceList: map[string][]constant.DeviceFault{ + "list1": {{FaultType: "typeB"}, {FaultType: "typeA"}}, + }, + } + + expDeviceInfo := &constant.AdvanceDeviceFaultCm{ + AvailableDevices: []string{"d1", "d2", "d3"}, + CardUnHealthy: []string{"c1", "c2"}, + NetworkUnhealthy: []string{"n1", "n2"}, + Recovering: []string{"r1", "r2"}, + FaultDeviceList: map[string][]constant.DeviceFault{ + "list1": {{FaultType: "typeA"}, {FaultType: "typeB"}}, + }, + } + + SortDataForAdvanceDeviceInfo(deviceInfo) + + convey.Convey("should be equal", func() { + convey.So(deviceInfo, convey.ShouldResemble, expDeviceInfo) + }) + }) +} + +func TestAddUnhealthy(t *testing.T) { + convey.Convey("Test addUnhealthy", t, func() { + baseCm := &constant.AdvanceDeviceFaultCm{ + AvailableDevices: []string{"device1", "device2"}, + CardUnHealthy: []string{}, + NetworkUnhealthy: []string{}, + FaultDeviceList: map[string][]constant.DeviceFault{ + "device1": { + {FaultType: constant.CardUnhealthy, FaultLevel: constant.SeparateNPU}, + }, + "device2": { + {FaultType: constant.CardNetworkUnhealthy, FaultLevel: constant.SeparateNPU}, + }, + }, + } + + convey.Convey("should add to CardUnHealthy for card unhealthy fault", func() { + addUnhealthy(baseCm) + convey.So(baseCm.CardUnHealthy, convey.ShouldContain, "device1") + convey.So(baseCm.NetworkUnhealthy, convey.ShouldContain, "device2") + convey.So(baseCm.AvailableDevices, convey.ShouldBeEmpty) + + }) + }) +} + +func TestMergeCode(t *testing.T) { + patches := gomonkey.NewPatches() + defer patches.Reset() + convey.Convey("Test mergeCode", t, func() { + testDevice := "device1" + orgFaults := []constant.DeviceFault{{FaultCode: "1001"}} + mergedFaults := []constant.DeviceFault{{FaultCode: "merged"}} + advanceCm := &constant.AdvanceDeviceFaultCm{ + FaultDeviceList: map[string][]constant.DeviceFault{ + testDevice: orgFaults, + }, + } + patches.ApplyFunc(mergeDeviceFault, func([]constant.DeviceFault) ([]constant.DeviceFault, error) { + return mergedFaults, nil + }) + convey.Convey("should skip empty fault lists", func() { + mergeCode(advanceCm) + convey.So(advanceCm.FaultDeviceList[testDevice], convey.ShouldResemble, mergedFaults) + }) + }) +} + +func TestRemoveUnhealthy(t *testing.T) { + patches := gomonkey.NewPatches() + defer patches.Reset() + convey.Convey("Test removeUnhealthy", t, func() { + baseCm := &constant.AdvanceDeviceFaultCm{ + AvailableDevices: []string{}, + CardUnHealthy: []string{"device1", "device2"}, + NetworkUnhealthy: []string{"device1", "device2"}, + FaultDeviceList: map[string][]constant.DeviceFault{ + "device1": { + {FaultType: constant.CardUnhealthy, FaultLevel: constant.NotHandleFault}, + }, + "device2": { + {FaultType: constant.CardNetworkUnhealthy, FaultLevel: constant.SubHealthFault}, + }, + }, + } + convey.Convey("should remove from unhealthy", func() { + removeUnhealthy(baseCm) + convey.So(baseCm.CardUnHealthy, convey.ShouldBeEmpty) + convey.So(baseCm.NetworkUnhealthy, convey.ShouldBeEmpty) + }) + }) +} -- Gitee From dd76f2e859299de8ea9aa6151122317fb6759cc3 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Tue, 22 Apr 2025 22:44:06 +0800 Subject: [PATCH 10/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20merge=20master=20conflict=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jobprocess/faultrank/job_fault_rank_processor.go | 6 +++--- .../jobprocess/faultrank/job_fault_rank_processor_test.go | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go index 262964ae2..5d1a317d2 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go @@ -79,7 +79,7 @@ func (processor *jobRankFaultInfoProcessor) findFaultRankForJob( nodeName string, serverList map[string]constant.ServerHccl, jobId string) []constant.FaultRank { devicesOfJobOnNode, ok := serverList[nodeName] faultRankList := make([]constant.FaultRank, 0) - if !ok || len(devicesOfJobOnNode.DeviceList) == 0 { + if advanceDeviceInfo == nil || !ok || len(devicesOfJobOnNode.DeviceList) == 0 { return faultRankList } for _, deviceInfo := range devicesOfJobOnNode.DeviceList { @@ -169,7 +169,7 @@ func (processor *jobRankFaultInfoProcessor) Process(info any) any { } hwlog.RunLog.Debugf("serverList: %d", len(serverList)) faultList, nodeStatusList, faultDeviceList := processor.findNodeDeviceAndSwitchFault(serverList, - nodeInfos, switchInfos, deviceCmForNodeMap, jobId) + allConfigmap.NodeCm, allConfigmap.SwitchCm, allConfigmap.DeviceCm, jobId) jobFaultInfo.FaultList = faultList if len(jobFaultInfo.FaultList) > 0 { hwlog.RunLog.Debugf("jobFaultInfo: %#v", jobFaultInfo) @@ -187,7 +187,7 @@ func (processor *jobRankFaultInfoProcessor) Process(info any) any { func (processor *jobRankFaultInfoProcessor) findNodeDeviceAndSwitchFault( serverList map[string]constant.ServerHccl, nodeInfos map[string]*constant.NodeInfo, - switchInfos map[string]*constant.SwitchInfo, deviceCmForNodeMap map[string]constant.AdvanceDeviceFaultCm, + switchInfos map[string]*constant.SwitchInfo, deviceCmForNodeMap map[string]*constant.AdvanceDeviceFaultCm, jobId string) ([]constant.FaultRank, []string, []constant.FaultDevice) { faultList := make([]constant.FaultRank, 0) faultDeviceList := make([]constant.FaultDevice, 0) diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go index 9b8b66723..d311b5d13 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go @@ -69,7 +69,11 @@ func TestFaultProcessorImplProcess(t *testing.T) { mockKube.Reset() mockJob.Reset() }() - JobFaultRankProcessor.Process(constant.AllConfigmapContent{}) + JobFaultRankProcessor.Process(constant.AllConfigmapContent{ + DeviceCm: make(map[string]*constant.AdvanceDeviceFaultCm), + SwitchCm: make(map[string]*constant.SwitchInfo), + NodeCm: make(map[string]*constant.NodeInfo), + }) faultRankInfos := JobFaultRankProcessor.GetJobFaultRankInfos() if len(faultRankInfos[jobId].FaultList) != len(jobServerMap.InfoMap[jobId][nodeName].DeviceList) { t.Error("TestFaultProcessorImplProcess fail") -- Gitee From 368596407d0a24dbb2e98bbfa8b0b06fb67fbb5f Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Wed, 23 Apr 2025 12:40:26 +0800 Subject: [PATCH 11/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20remove=203s=20limit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../faultmanager/cmprocess/base_fault_center.go | 17 ++--------------- .../faultmanager/jobprocess/fault_job_center.go | 15 +-------------- 2 files changed, 3 insertions(+), 29 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go index 1021295b4..fca65eaee 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go @@ -4,18 +4,15 @@ package cmprocess import ( - "fmt" - "sync" - "time" - "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" "clusterd/pkg/domain/faultdomain/cmmanager" + "fmt" + "sync" ) type baseFaultCenter[T constant.ConfigMapInterface] struct { processorList []constant.FaultProcessor - lastProcessTime int64 subscribeChannelList []chan int mutex sync.Mutex processPeriod int64 @@ -26,7 +23,6 @@ type baseFaultCenter[T constant.ConfigMapInterface] struct { func newBaseFaultCenter[T constant.ConfigMapInterface](cmManager *cmmanager.FaultCenterCmManager[T], centerType int) baseFaultCenter[T] { return baseFaultCenter[T]{ processorList: make([]constant.FaultProcessor, 0), - lastProcessTime: 0, subscribeChannelList: make([]chan int, 0), mutex: sync.Mutex{}, processPeriod: constant.FaultCenterProcessPeriod, @@ -35,16 +31,7 @@ func newBaseFaultCenter[T constant.ConfigMapInterface](cmManager *cmmanager.Faul } } -func (baseCenter *baseFaultCenter[T]) isProcessLimited(currentTime int64) bool { - return baseCenter.lastProcessTime+baseCenter.processPeriod > currentTime -} - func (baseCenter *baseFaultCenter[T]) Process() { - currentTime := time.Now().UnixMilli() - if baseCenter.isProcessLimited(currentTime) { - return - } - baseCenter.lastProcessTime = currentTime updateOriginalCm := baseCenter.updateOriginalCm() processingCm := baseCenter.getOriginalCm() for _, processor := range baseCenter.processorList { diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go index 1bd6d380a..ebe879cb0 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go @@ -4,8 +4,6 @@ package jobprocess import ( - "time" - "clusterd/pkg/application/faultmanager/cmprocess" "clusterd/pkg/application/faultmanager/jobprocess/faultrank" "clusterd/pkg/application/faultmanager/jobprocess/relationfault" @@ -16,13 +14,11 @@ import ( var FaultJobCenter *faultJobProcessCenter type faultJobProcessCenter struct { - lastProcessTime int64 - processorList []constant.FaultProcessor + processorList []constant.FaultProcessor } func init() { FaultJobCenter = &faultJobProcessCenter{ - lastProcessTime: 0, processorList: []constant.FaultProcessor{ relationfault.RelationProcessor, faultrank.JobFaultRankProcessor, @@ -31,11 +27,6 @@ func init() { } func (fJobCenter *faultJobProcessCenter) Process() { - currentTime := time.Now().UnixMilli() - if fJobCenter.isProcessLimited(currentTime) { - return - } - fJobCenter.lastProcessTime = currentTime content := constant.AllConfigmapContent{ DeviceCm: cmprocess.DeviceCenter.GetProcessedCm(), SwitchCm: cmprocess.SwitchCenter.GetProcessedCm(), @@ -45,7 +36,3 @@ func (fJobCenter *faultJobProcessCenter) Process() { processor.Process(content) } } - -func (fJobCenter *faultJobProcessCenter) isProcessLimited(currentTime int64) bool { - return fJobCenter.lastProcessTime+constant.FaultCenterProcessPeriod > currentTime -} -- Gitee From 4cbcf61acaf86d7747fba629480c5c7a85665192 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Wed, 23 Apr 2025 13:03:23 +0800 Subject: [PATCH 12/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20debug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../application/faultmanager/cmprocess/base_fault_center.go | 2 -- .../faultmanager/cmprocess/base_fault_center_test.go | 6 ++++++ .../cmprocess/publicfault/pub_fault_processor.go | 1 + .../pkg/application/faultmanager/fault_process_center.go | 2 ++ component/clusterd/pkg/common/constant/constants.go | 1 - .../pkg/domain/faultdomain/cmmanager/configmap_manager.go | 2 ++ 6 files changed, 11 insertions(+), 3 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go index fca65eaee..07ea9aec6 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go @@ -15,7 +15,6 @@ type baseFaultCenter[T constant.ConfigMapInterface] struct { processorList []constant.FaultProcessor subscribeChannelList []chan int mutex sync.Mutex - processPeriod int64 cmManager *cmmanager.FaultCenterCmManager[T] centerType int } @@ -25,7 +24,6 @@ func newBaseFaultCenter[T constant.ConfigMapInterface](cmManager *cmmanager.Faul processorList: make([]constant.FaultProcessor, 0), subscribeChannelList: make([]chan int, 0), mutex: sync.Mutex{}, - processPeriod: constant.FaultCenterProcessPeriod, cmManager: cmManager, centerType: centerType, } diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center_test.go index 5477643f0..16883407b 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center_test.go @@ -6,6 +6,7 @@ package cmprocess import ( "testing" + "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" "clusterd/pkg/domain/faultdomain/cmmanager" ) @@ -16,6 +17,11 @@ func (f *fakeProcessor) Process(info any) any { return info } +func TestMain(m *testing.M) { + hwlog.InitRunLogger(&hwlog.LogConfig{OnlyToStdout: true}, nil) + m.Run() +} + func TestBaseFaultCenterProcess(t *testing.T) { t.Run("TestBaseFaultCenterProcess", func(t *testing.T) { manager := cmmanager.DeviceCenterCmManager diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go index 212e205bf..fd529097d 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go @@ -81,6 +81,7 @@ func (p *pubFaultProcessor) faultJoin() { } } faultdomain.FixUnhealthyInfo(p.devCMInfo) + faultdomain.SortDataForAdvanceDeviceInfo(p.devCMInfo) } func convertNPUIdsToName(phyIds []int32, devType string) []string { diff --git a/component/clusterd/pkg/application/faultmanager/fault_process_center.go b/component/clusterd/pkg/application/faultmanager/fault_process_center.go index ed021b81f..923bb57c5 100644 --- a/component/clusterd/pkg/application/faultmanager/fault_process_center.go +++ b/component/clusterd/pkg/application/faultmanager/fault_process_center.go @@ -32,10 +32,12 @@ type faultProcessCenter struct { } func (center *faultProcessCenter) Process() { + hwlog.RunLog.Info("begin process") cmprocess.DeviceCenter.Process() cmprocess.NodeCenter.Process() cmprocess.SwitchCenter.Process() jobprocess.FaultJobCenter.Process() + hwlog.RunLog.Info("end process") } func (center *faultProcessCenter) notifyFaultCenterProcess(whichToProcess int) { diff --git a/component/clusterd/pkg/common/constant/constants.go b/component/clusterd/pkg/common/constant/constants.go index 2500d3eb9..7d6845de0 100644 --- a/component/clusterd/pkg/common/constant/constants.go +++ b/component/clusterd/pkg/common/constant/constants.go @@ -80,7 +80,6 @@ const ( JobReportRecoverTimeout = 10 * 1000 JobReportInfoExpiredTimeout = 10 * 1000 JobReportCompleteTimeout = 30 * 1000 - FaultCenterProcessPeriod = 3 * 1000 MaxFaultCenterSubscriber = 10 UnknownFaultTime = -1 ) diff --git a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go index 795a71293..59392d654 100644 --- a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go +++ b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go @@ -57,6 +57,8 @@ func (manager *FaultCenterCmManager[T]) GetOriginalCm() ConfigMap[T] { func (manager *FaultCenterCmManager[T]) SetProcessedCm(cm ConfigMap[T]) bool { manager.mutex.Lock() defer manager.mutex.Unlock() + hwlog.RunLog.Infof("last processedCm info: %s", util.ObjToString(manager.processedCm.Data)) + hwlog.RunLog.Infof("this processedCm info: %s", util.ObjToString(cm.Data)) if manager.processedCm.equal(cm) { return false } -- Gitee From ca77b7a692089f55454ba3696a4e1d05025265ee Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Wed, 23 Apr 2025 22:23:51 +0800 Subject: [PATCH 13/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20fix=20sort=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../publicfault/pub_fault_processor.go | 8 +++- .../cmprocess/uce/uce_fault_processor.go | 6 +++ .../uce_accompany_fault_processor.go | 27 +++++++----- .../faultmanager/fault_process_center.go | 4 +- .../clusterd/pkg/common/constant/type.go | 42 ++++++++++++++----- component/clusterd/pkg/common/util/util.go | 28 +++++++++++++ .../cmmanager/configmap_manager.go | 2 - .../pkg/domain/faultdomain/fault_utils.go | 18 +++++++- .../resource/uce_fault_processor_test.yaml | 8 ++-- 9 files changed, 110 insertions(+), 33 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go index fd529097d..3b77e63cb 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go @@ -60,6 +60,7 @@ func (p *pubFaultProcessor) Process(info any) any { } func (p *pubFaultProcessor) faultJoin() { + modified := false for _, pubFaultCache := range p.pubFaultInfo { // add public fault to fault list pubFaultCache.FaultDevNames = convertNPUIdsToName(pubFaultCache.FaultDevIds, p.devCMInfo.ServerType) @@ -78,10 +79,13 @@ func (p *pubFaultProcessor) faultJoin() { }}, } p.devCMInfo.FaultDeviceList[faultDevName] = append(p.devCMInfo.FaultDeviceList[faultDevName], fault) + modified = true } } - faultdomain.FixUnhealthyInfo(p.devCMInfo) - faultdomain.SortDataForAdvanceDeviceInfo(p.devCMInfo) + if modified { + faultdomain.FixUnhealthyInfo(p.devCMInfo) + faultdomain.SortDataForAdvanceDeviceInfo(p.devCMInfo) + } } func convertNPUIdsToName(phyIds []int32, devType string) []string { diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go index 6b75194a9..ccb588b4c 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go @@ -106,6 +106,7 @@ func (processor *uceFaultProcessor) processUceFaultInfo(currentTime int64) { func (processor *uceFaultProcessor) processEachNodeUceFaultInfo( nodeName string, deviceInfo *constant.AdvanceDeviceFaultCm, currentTime int64) *constant.AdvanceDeviceFaultCm { + modified := false for _, uceJob := range processor.uceDevicesOfUceJob { for deviceName, uceDevice := range uceJob.UceNode[nodeName].DeviceInfo { log := fmt.Sprintf("filter uce device: %s on node %s, "+ @@ -117,11 +118,16 @@ func (processor *uceFaultProcessor) processEachNodeUceFaultInfo( if processor.canFilterUceDeviceFaultInfo(uceDevice, currentTime) { hwlog.RunLog.Warn("uceFaultProcessor " + log) deviceInfo.FaultDeviceList = processor.filterUceDeviceFaultInfo(deviceName, deviceInfo.FaultDeviceList) + modified = true } else { hwlog.RunLog.Warn("uceFaultProcessor cannot " + log) } } } + if modified { + faultdomain.FixUnhealthyInfo(deviceInfo) + faultdomain.SortDataForAdvanceDeviceInfo(deviceInfo) + } return deviceInfo } diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go index 878056d02..5bdc046e6 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go @@ -102,24 +102,22 @@ func (processor *uceAccompanyFaultProcessor) inQue(nodeName, deviceName string, func (processor *uceAccompanyFaultProcessor) filterFaultInfos(currentTime int64) { for nodeName, nodeFaults := range processor.uceAccompanyFaultQue { - faultMap, found := processor.deviceCmForNodeMap[nodeName] + deviceFaultCm, found := processor.deviceCmForNodeMap[nodeName] if !found { continue } for deviceName, deviceFaultQue := range nodeFaults { - newQue, newFaultMap := - processor.filterFaultDevice(faultMap.FaultDeviceList, currentTime, nodeName, deviceName, deviceFaultQue) + newQue := processor.filterFaultDevice(deviceFaultCm, currentTime, nodeName, deviceName, deviceFaultQue) nodeFaults[deviceName] = newQue - faultMap.FaultDeviceList = newFaultMap } - processor.deviceCmForNodeMap[nodeName] = faultMap } } func (processor *uceAccompanyFaultProcessor) filterFaultDevice( - faultMap map[string][]constant.DeviceFault, currentTime int64, nodeName, deviceName string, - deviceFaultQue []constant.DeviceFault) ([]constant.DeviceFault, map[string][]constant.DeviceFault) { + deviceFaultCm *constant.AdvanceDeviceFaultCm, currentTime int64, nodeName, deviceName string, + deviceFaultQue []constant.DeviceFault) []constant.DeviceFault { newDeviceFaultQue := make([]constant.DeviceFault, 0) + modified := false for _, fault := range deviceFaultQue { uceFaultTime := processor.getDeviceUceFaultTime(nodeName, deviceName) errorMsg := fmt.Sprintf("filterFaultDevice cannot find uce fault time for device %s of node %s", @@ -129,7 +127,8 @@ func (processor *uceAccompanyFaultProcessor) filterFaultDevice( if processor.isAccompaniedFaultByUce(uceFaultTime, accompanyFaultTime) { hwlog.RunLog.Warnf("filter uce accompany fault %s, fault time: %s", util.ObjToString(fault), util.ReadableMsTime(accompanyFaultTime)) - faultMap = faultdomain.DeleteFaultFromFaultMap(faultMap, fault) + deviceFaultCm.FaultDeviceList = faultdomain.DeleteFaultFromFaultMap(deviceFaultCm.FaultDeviceList, fault) + modified = true continue } // if current is not exceed diagnosis time, @@ -137,16 +136,22 @@ func (processor *uceAccompanyFaultProcessor) filterFaultDevice( if !processor.isCurrentExceedDiagnosisTimeout(currentTime, accompanyFaultTime) { hwlog.RunLog.Warnf("filter uce accompany like fault %s, fault time: %s", util.ObjToString(fault), util.ReadableMsTime(accompanyFaultTime)) - faultMap = faultdomain.DeleteFaultFromFaultMap(faultMap, fault) + deviceFaultCm.FaultDeviceList = faultdomain.DeleteFaultFromFaultMap(deviceFaultCm.FaultDeviceList, fault) + modified = true newDeviceFaultQue = append(newDeviceFaultQue, fault) continue } // cannot filter, add the aic/aiv fault into faultMap - faultMap = faultdomain.AddFaultIntoFaultMap(faultMap, fault) + deviceFaultCm.FaultDeviceList = faultdomain.AddFaultIntoFaultMap(deviceFaultCm.FaultDeviceList, fault) + modified = true hwlog.RunLog.Warnf("cannot filter uce accompany like fault %s, uce fault time: %s", util.ObjToString(fault), util.ReadableMsTime(uceFaultTime)) } - return newDeviceFaultQue, faultMap + if modified { + faultdomain.FixUnhealthyInfo(deviceFaultCm) + faultdomain.SortDataForAdvanceDeviceInfo(deviceFaultCm) + } + return newDeviceFaultQue } func (processor *uceAccompanyFaultProcessor) getDeviceUceFaultTime(nodeName, deviceName string) int64 { diff --git a/component/clusterd/pkg/application/faultmanager/fault_process_center.go b/component/clusterd/pkg/application/faultmanager/fault_process_center.go index 923bb57c5..33015cedc 100644 --- a/component/clusterd/pkg/application/faultmanager/fault_process_center.go +++ b/component/clusterd/pkg/application/faultmanager/fault_process_center.go @@ -47,12 +47,12 @@ func (center *faultProcessCenter) notifyFaultCenterProcess(whichToProcess int) { // Work faultProcessCenter work goroutine func (center *faultProcessCenter) Work(ctx context.Context) { go func() { - hwlog.RunLog.Info("faultProcessCenter start work") + hwlog.RunLog.Info("faultProcessCenter start work!") centerTicker := time.NewTicker(time.Second) for { select { case <-ctx.Done(): - hwlog.RunLog.Info("faultProcessCenter stop work") + hwlog.RunLog.Info("faultProcessCenter stop work!") return case whichToProcess := <-center.notifyProcessChan: switch whichToProcess { diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index 8837a60c7..fbc983be4 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -4,10 +4,10 @@ package constant import ( - "reflect" - "ascend-common/api" "ascend-common/common-utils/hwlog" + "clusterd/pkg/common/util" + "k8s.io/utils/strings/slices" ) // FaultTimeAndLevel of each fault code @@ -29,6 +29,16 @@ type DeviceFault struct { FaultTimeAndLevelMap map[string]FaultTimeAndLevel `json:"fault_time_and_level_map"` } +func equalDeviceFault(one, other *DeviceFault) bool { + return one.FaultType == other.FaultType && + one.NPUName == other.NPUName && + one.LargeModelFaultLevel == other.LargeModelFaultLevel && + one.FaultLevel == other.FaultLevel && + one.FaultHandling == other.FaultHandling && + one.FaultCode == other.FaultCode && + util.MapEqual(one.FaultTimeAndLevelMap, other.FaultTimeAndLevelMap) +} + // NodeInfoCM the config map struct of node info type NodeInfoCM struct { NodeInfo NodeInfoNoName @@ -233,15 +243,27 @@ type AdvanceDeviceFaultCm struct { // IsSame compare two AdvanceDeviceFaultCm, do not care UpdateTime func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { - thisUpdateTime := cm.UpdateTime thatCm := another.(*AdvanceDeviceFaultCm) - thatUpdateTime := thatCm.UpdateTime - cm.UpdateTime = 0 - thatCm.UpdateTime = 0 - isSame := reflect.DeepEqual(cm, thatCm) - cm.UpdateTime = thisUpdateTime - thatCm.UpdateTime = thatUpdateTime - return isSame + eq := func(faultListOne []DeviceFault, faultListOther []DeviceFault) bool { + if len(faultListOne) != len(faultListOther) { + return false + } + for i, fault := range faultListOne { + if !equalDeviceFault(&fault, &faultListOther[i]) { + return false + } + } + return true + } + return cm.ServerType == thatCm.ServerType && + cm.CmName == thatCm.CmName && + cm.SuperPodID == thatCm.SuperPodID && + cm.ServerIndex == thatCm.ServerIndex && + slices.Equal(cm.AvailableDevices, thatCm.AvailableDevices) && + slices.Equal(cm.Recovering, thatCm.Recovering) && + slices.Equal(cm.CardUnHealthy, thatCm.CardUnHealthy) && + slices.Equal(cm.NetworkUnhealthy, thatCm.NetworkUnhealthy) && + util.MapEqualFunc(cm.FaultDeviceList, thatCm.FaultDeviceList, eq) } // GetCmName return cm name diff --git a/component/clusterd/pkg/common/util/util.go b/component/clusterd/pkg/common/util/util.go index 72f3eb2c4..313b8c828 100644 --- a/component/clusterd/pkg/common/util/util.go +++ b/component/clusterd/pkg/common/util/util.go @@ -180,3 +180,31 @@ func RemoveDuplicates[T comparable](slice []T) []T { } return result } + +// MapEqual reports whether two maps contain the same key/value pairs. +// Values are compared using ==. +func MapEqual[M1, M2 ~map[K]V, K, V comparable](m1 M1, m2 M2) bool { + if len(m1) != len(m2) { + return false + } + for k, v1 := range m1 { + if v2, ok := m2[k]; !ok || v1 != v2 { + return false + } + } + return true +} + +// MapEqualFunc is like Equal, but compares values using eq. +// Keys are still compared with ==. +func MapEqualFunc[M1 ~map[K]V1, M2 ~map[K]V2, K comparable, V1, V2 any](m1 M1, m2 M2, eq func(V1, V2) bool) bool { + if len(m1) != len(m2) { + return false + } + for k, v1 := range m1 { + if v2, ok := m2[k]; !ok || !eq(v1, v2) { + return false + } + } + return true +} diff --git a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go index 59392d654..795a71293 100644 --- a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go +++ b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager.go @@ -57,8 +57,6 @@ func (manager *FaultCenterCmManager[T]) GetOriginalCm() ConfigMap[T] { func (manager *FaultCenterCmManager[T]) SetProcessedCm(cm ConfigMap[T]) bool { manager.mutex.Lock() defer manager.mutex.Unlock() - hwlog.RunLog.Infof("last processedCm info: %s", util.ObjToString(manager.processedCm.Data)) - hwlog.RunLog.Infof("this processedCm info: %s", util.ObjToString(cm.Data)) if manager.processedCm.equal(cm) { return false } diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 28998d74a..0297065fd 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -7,13 +7,13 @@ import ( "encoding/json" "fmt" "reflect" - "slices" "sort" "strings" "time" "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/utils/strings/slices" "ascend-common/api" "ascend-common/common-utils/hwlog" @@ -387,17 +387,25 @@ func isFaultDeletable(faults []constant.DeviceFault, faultTypes []string, faultL } func removeUnhealthy(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { + newCardUnHealthy := make([]string, 0) + newNetworkUnhealthy := make([]string, 0) for deviceName, faults := range advanceDeviceCm.FaultDeviceList { deletableFaultLevels := []string{constant.NotHandleFault, constant.SubHealthFault} if isFaultDeletable(faults, []string{constant.CardUnhealthy, constant.PublicFaultType}, deletableFaultLevels) { advanceDeviceCm.CardUnHealthy = util.DeleteStringSliceItem(advanceDeviceCm.CardUnHealthy, deviceName) hwlog.RunLog.Debugf("remove device %s from CardUnHealthy", deviceName) + } else { + newCardUnHealthy = append(newCardUnHealthy, deviceName) } if isFaultDeletable(faults, []string{constant.CardNetworkUnhealthy}, deletableFaultLevels) { advanceDeviceCm.NetworkUnhealthy = util.DeleteStringSliceItem(advanceDeviceCm.NetworkUnhealthy, deviceName) hwlog.RunLog.Debugf("remove device %s from NetworkUnhealthy", deviceName) + } else { + newNetworkUnhealthy = append(newCardUnHealthy, deviceName) } } + advanceDeviceCm.CardUnHealthy = newCardUnHealthy + advanceDeviceCm.NetworkUnhealthy = newNetworkUnhealthy } func mergeCode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { @@ -588,13 +596,19 @@ func ValidBusinessUceReportInfo(info *constant.ReportInfo) bool { return ValidBusinessRecoverTime(info.RecoverTime) } +// SortDataForAdvanceDeviceInfo sort the field of deviceInfo func SortDataForAdvanceDeviceInfo(deviceInfo *constant.AdvanceDeviceFaultCm) { sort.Strings(deviceInfo.AvailableDevices) sort.Strings(deviceInfo.CardUnHealthy) sort.Strings(deviceInfo.NetworkUnhealthy) sort.Strings(deviceInfo.Recovering) for _, faultList := range deviceInfo.FaultDeviceList { - slices.SortFunc(faultList, compareDeviceFault) + sort.Slice(faultList, func(i, j int) bool { + if compareDeviceFault(faultList[i], faultList[j]) <= 0 { + return true + } + return false + }) } } diff --git a/component/clusterd/testdata/resource/uce_fault_processor_test.yaml b/component/clusterd/testdata/resource/uce_fault_processor_test.yaml index 9ead1d0d0..52f420ac3 100644 --- a/component/clusterd/testdata/resource/uce_fault_processor_test.yaml +++ b/component/clusterd/testdata/resource/uce_fault_processor_test.yaml @@ -18,7 +18,7 @@ mindx-dl-deviceinfo-node1: "22222222": { "fault_time": 100000, "fault_level": "RestartBusiness" }, "33333333": { "fault_time": 100000, "fault_level": "RestartBusiness" } } } ] - huawei.com/Ascend910-Unhealthy: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4 + huawei.com/Ascend910-CardUnhealthy: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4 mindx-dl-deviceinfo-node2: CmName: "mindx-dl-deviceinfo-node2" DeviceList: @@ -36,7 +36,7 @@ mindx-dl-deviceinfo-node2: { "fault_type": "CardUnhealthy", "npu_name": "Ascend910-4", "fault_code": "80E01801", "fault_time_and_level_map": { "80E01801": { "fault_time": 100000, "fault_level": "RestartBusiness" } } } ] - huawei.com/Ascend910-Unhealthy: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4 + huawei.com/Ascend910-CardUnhealthy: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4 --- # the result after uce processor process mindx-dl-deviceinfo-node1: @@ -50,7 +50,7 @@ mindx-dl-deviceinfo-node1: "22222222": { "fault_time": 100000, "fault_level": "RestartBusiness" }, "33333333": { "fault_time": 100000, "fault_level": "RestartBusiness" } } } ] - huawei.com/Ascend910-Unhealthy: Ascend910-4 + huawei.com/Ascend910-CardUnhealthy: Ascend910-4 mindx-dl-deviceinfo-node2: CmName: "mindx-dl-deviceinfo-node2" DeviceList: @@ -60,7 +60,7 @@ mindx-dl-deviceinfo-node2: "large_model_fault_level":"RestartBusiness","fault_level":"RestartBusiness","fault_handling":"RestartBusiness", "fault_time_and_level_map": { "22222222": { "fault_time": 100000, "fault_level": "RestartBusiness" } } } ] - huawei.com/Ascend910-Unhealthy: Ascend910-0 + huawei.com/Ascend910-CardUnhealthy: Ascend910-0 --- # the output of processor.getUceDeviceOfNodes() node1: -- Gitee From ffe411cda5ed3f6a171a7f1e9bc3f3112da42a9b Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Thu, 24 Apr 2025 00:39:02 +0800 Subject: [PATCH 14/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20fix=20update=20time=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/clusterd/pkg/domain/faultdomain/fault_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 0297065fd..56b6eb2d0 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -324,7 +324,7 @@ func AdvanceDeviceFaultCmToOriginalCmForNode(advanceDeviceCm *constant.AdvanceDe orgDeviceCm := &constant.DeviceInfo{ DeviceInfoNoName: constant.DeviceInfoNoName{ DeviceList: make(map[string]string), - UpdateTime: 0, + UpdateTime: advanceDeviceCm.UpdateTime, }, CmName: advanceDeviceCm.CmName, SuperPodID: advanceDeviceCm.SuperPodID, -- Gitee From 9bb0e2aed988329c86491bee81674ce918941677 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Thu, 24 Apr 2025 00:57:45 +0800 Subject: [PATCH 15/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20fix=20order?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/clusterd/pkg/domain/faultdomain/fault_utils.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 56b6eb2d0..6a2117cf1 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -218,7 +218,9 @@ func splitDeviceFault(faultInfo constant.DeviceFault, nodeName string) []constan func mergeDeviceFault(notGroupDeviceFaults []constant.DeviceFault) ([]constant.DeviceFault, error) { faultsGroupByType := faultsGroupByType(notGroupDeviceFaults) result := make([]constant.DeviceFault, 0) - for _, faultsGroup := range faultsGroupByType { + faultTypes := getSortedKeys(faultsGroupByType) + for _, faultType := range faultTypes { + faultsGroup := faultsGroupByType[faultType] deviceName := faultsGroup[0].NPUName fautLevels := make([]string, 0) newTimeAndLevelMap := make(map[string]constant.FaultTimeAndLevel, len(faultsGroup)) @@ -359,8 +361,9 @@ func AdvanceDeviceFaultCmToOriginalCmForNode(advanceDeviceCm *constant.AdvanceDe func faultMapToFaultList(deviceFaultMap map[string][]constant.DeviceFault) []constant.DeviceFault { deviceFaultList := make([]constant.DeviceFault, 0) - for _, faultList := range deviceFaultMap { - deviceFaultList = append(deviceFaultList, faultList...) + deviceNames := getSortedKeys(deviceFaultMap) + for _, deviceName := range deviceNames { + deviceFaultList = append(deviceFaultList, deviceFaultMap[deviceName]...) } return deviceFaultList } -- Gitee From 42e02bb82c8ac6e59aea989539d15c011821cc1b Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Thu, 24 Apr 2025 11:45:46 +0800 Subject: [PATCH 16/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20fix=20AvailableDeviceList?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../publicfault/pub_fault_processor_test.go | 2 +- .../clusterd/pkg/common/constant/type.go | 27 ++++++---- .../cmmanager/configmap_manager_test.go | 12 ++--- .../pkg/domain/faultdomain/fault_utils.go | 12 +++-- .../domain/faultdomain/fault_utils_test.go | 50 +++++++++---------- 5 files changed, 57 insertions(+), 46 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go index d15895fba..672edaca5 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor_test.go @@ -129,6 +129,6 @@ func sortDeviceFaultList(advanceFaultCm map[string]*constant.AdvanceDeviceFaultC sort.Strings(advanceDeviceCm.CardUnHealthy) sort.Strings(advanceDeviceCm.NetworkUnhealthy) sort.Strings(advanceDeviceCm.Recovering) - sort.Strings(advanceDeviceCm.AvailableDevices) + sort.Strings(advanceDeviceCm.AvailableDeviceList) } } diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index fbc983be4..b7edd8466 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -229,16 +229,16 @@ type FaultProcessor interface { // AdvanceDeviceFaultCm more structure device info type AdvanceDeviceFaultCm struct { - ServerType string - CmName string - SuperPodID int32 - ServerIndex int32 - FaultDeviceList map[string][]DeviceFault - AvailableDevices []string - Recovering []string - CardUnHealthy []string - NetworkUnhealthy []string - UpdateTime int64 + ServerType string + CmName string + SuperPodID int32 + ServerIndex int32 + FaultDeviceList map[string][]DeviceFault + AvailableDeviceList []string + Recovering []string + CardUnHealthy []string + NetworkUnhealthy []string + UpdateTime int64 } // IsSame compare two AdvanceDeviceFaultCm, do not care UpdateTime @@ -259,7 +259,7 @@ func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { cm.CmName == thatCm.CmName && cm.SuperPodID == thatCm.SuperPodID && cm.ServerIndex == thatCm.ServerIndex && - slices.Equal(cm.AvailableDevices, thatCm.AvailableDevices) && + slices.Equal(cm.AvailableDeviceList, thatCm.AvailableDeviceList) && slices.Equal(cm.Recovering, thatCm.Recovering) && slices.Equal(cm.CardUnHealthy, thatCm.CardUnHealthy) && slices.Equal(cm.NetworkUnhealthy, thatCm.NetworkUnhealthy) && @@ -291,6 +291,11 @@ func (cm *AdvanceDeviceFaultCm) GetFaultDeviceListKey() string { return api.ResourceNamePrefix + cm.ServerType + CmFaultListSuffix } +// GetAvailableDeviceListKey return cm AvailableDeviceListKey +func (cm *AdvanceDeviceFaultCm) GetAvailableDeviceListKey() string { + return api.ResourceNamePrefix + cm.ServerType +} + // InformerCmItem informer configmap item of queue or buffer type InformerCmItem[T ConfigMapInterface] struct { IsAdd bool diff --git a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go index 16c681d0f..89c051fe4 100644 --- a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go +++ b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go @@ -65,12 +65,12 @@ func TestConfigMapEqual(t *testing.T) { cm1 := ConfigMap[*constant.AdvanceDeviceFaultCm]{ Data: map[string]*constant.AdvanceDeviceFaultCm{ "node": { - FaultDeviceList: nil, - AvailableDevices: []string{"2", "1"}, - Recovering: []string{"4", "3"}, - CardUnHealthy: []string{"6", "5"}, - NetworkUnhealthy: []string{"6", "5"}, - UpdateTime: 10, + FaultDeviceList: nil, + AvailableDeviceList: []string{"2", "1"}, + Recovering: []string{"4", "3"}, + CardUnHealthy: []string{"6", "5"}, + NetworkUnhealthy: []string{"6", "5"}, + UpdateTime: 10, }, }, } diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index 6a2117cf1..cfe92b193 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -91,7 +91,7 @@ func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) *constant.AdvanceDeviceFau advanceDeviceCm.FaultDeviceList = getFaultListInfo(devInfo) advanceDeviceCm.NetworkUnhealthy = getNetworkUnhealthyCardList(devInfo) advanceDeviceCm.CardUnHealthy = getCardUnHealthy(devInfo) - advanceDeviceCm.AvailableDevices = getAvailableDevices(devInfo) + advanceDeviceCm.AvailableDeviceList = getAvailableDevices(devInfo) advanceDeviceCm.Recovering = getRecoveringDevList(devInfo) return advanceDeviceCm } @@ -356,6 +356,12 @@ func AdvanceDeviceFaultCmToOriginalCmForNode(advanceDeviceCm *constant.AdvanceDe orgDeviceCm.DeviceList[advanceDeviceCm.GetRecoveringKey()] = strings.Join(advanceDeviceCm.Recovering, ",") } + + orgDeviceCm.DeviceList[advanceDeviceCm.GetAvailableDeviceListKey()] = "" + if len(advanceDeviceCm.AvailableDeviceList) > 0 { + orgDeviceCm.DeviceList[advanceDeviceCm.GetAvailableDeviceListKey()] = + strings.Join(advanceDeviceCm.AvailableDeviceList, ",") + } return orgDeviceCm } @@ -450,7 +456,7 @@ func addUnhealthy(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { sets.NewString(advanceDeviceCm.CardUnHealthy...).Insert(deviceName).List() } if shouldAddInCardUnhealthy || shouldAddInCardNetworkUnhealthy { - advanceDeviceCm.AvailableDevices = util.DeleteStringSliceItem(advanceDeviceCm.AvailableDevices, deviceName) + advanceDeviceCm.AvailableDeviceList = util.DeleteStringSliceItem(advanceDeviceCm.AvailableDeviceList, deviceName) } } } @@ -601,7 +607,7 @@ func ValidBusinessUceReportInfo(info *constant.ReportInfo) bool { // SortDataForAdvanceDeviceInfo sort the field of deviceInfo func SortDataForAdvanceDeviceInfo(deviceInfo *constant.AdvanceDeviceFaultCm) { - sort.Strings(deviceInfo.AvailableDevices) + sort.Strings(deviceInfo.AvailableDeviceList) sort.Strings(deviceInfo.CardUnHealthy) sort.Strings(deviceInfo.NetworkUnhealthy) sort.Strings(deviceInfo.Recovering) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index e312d3117..206285e81 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -599,16 +599,16 @@ func TestAdvanceFaultCmToOriginalFaultCm(t *testing.T) { node1 := "node1" node2 := "node2" mockAdvanceCm1 := &constant.AdvanceDeviceFaultCm{ - ServerType: "", - CmName: "CmName-" + node1, - SuperPodID: 0, - ServerIndex: 0, - FaultDeviceList: make(map[string][]constant.DeviceFault), - AvailableDevices: []string{"xxx"}, - Recovering: []string{"xxx"}, - CardUnHealthy: []string{"xxx"}, - NetworkUnhealthy: []string{"xxx"}, - UpdateTime: 0, + ServerType: "", + CmName: "CmName-" + node1, + SuperPodID: 0, + ServerIndex: 0, + FaultDeviceList: make(map[string][]constant.DeviceFault), + AvailableDeviceList: []string{"xxx"}, + Recovering: []string{"xxx"}, + CardUnHealthy: []string{"xxx"}, + NetworkUnhealthy: []string{"xxx"}, + UpdateTime: 0, } mockAdvanceCm2 := new(constant.AdvanceDeviceFaultCm) util.DeepCopy(mockAdvanceCm1, mockAdvanceCm2) @@ -723,20 +723,20 @@ func TestSortDataForAdvanceDeviceInfo(t *testing.T) { // Mock dependencies deviceInfo := &constant.AdvanceDeviceFaultCm{ - AvailableDevices: []string{"d3", "d1", "d2"}, - CardUnHealthy: []string{"c2", "c1"}, - NetworkUnhealthy: []string{"n2", "n1"}, - Recovering: []string{"r2", "r1"}, + AvailableDeviceList: []string{"d3", "d1", "d2"}, + CardUnHealthy: []string{"c2", "c1"}, + NetworkUnhealthy: []string{"n2", "n1"}, + Recovering: []string{"r2", "r1"}, FaultDeviceList: map[string][]constant.DeviceFault{ "list1": {{FaultType: "typeB"}, {FaultType: "typeA"}}, }, } expDeviceInfo := &constant.AdvanceDeviceFaultCm{ - AvailableDevices: []string{"d1", "d2", "d3"}, - CardUnHealthy: []string{"c1", "c2"}, - NetworkUnhealthy: []string{"n1", "n2"}, - Recovering: []string{"r1", "r2"}, + AvailableDeviceList: []string{"d1", "d2", "d3"}, + CardUnHealthy: []string{"c1", "c2"}, + NetworkUnhealthy: []string{"n1", "n2"}, + Recovering: []string{"r1", "r2"}, FaultDeviceList: map[string][]constant.DeviceFault{ "list1": {{FaultType: "typeA"}, {FaultType: "typeB"}}, }, @@ -753,9 +753,9 @@ func TestSortDataForAdvanceDeviceInfo(t *testing.T) { func TestAddUnhealthy(t *testing.T) { convey.Convey("Test addUnhealthy", t, func() { baseCm := &constant.AdvanceDeviceFaultCm{ - AvailableDevices: []string{"device1", "device2"}, - CardUnHealthy: []string{}, - NetworkUnhealthy: []string{}, + AvailableDeviceList: []string{"device1", "device2"}, + CardUnHealthy: []string{}, + NetworkUnhealthy: []string{}, FaultDeviceList: map[string][]constant.DeviceFault{ "device1": { {FaultType: constant.CardUnhealthy, FaultLevel: constant.SeparateNPU}, @@ -770,7 +770,7 @@ func TestAddUnhealthy(t *testing.T) { addUnhealthy(baseCm) convey.So(baseCm.CardUnHealthy, convey.ShouldContain, "device1") convey.So(baseCm.NetworkUnhealthy, convey.ShouldContain, "device2") - convey.So(baseCm.AvailableDevices, convey.ShouldBeEmpty) + convey.So(baseCm.AvailableDeviceList, convey.ShouldBeEmpty) }) }) @@ -803,9 +803,9 @@ func TestRemoveUnhealthy(t *testing.T) { defer patches.Reset() convey.Convey("Test removeUnhealthy", t, func() { baseCm := &constant.AdvanceDeviceFaultCm{ - AvailableDevices: []string{}, - CardUnHealthy: []string{"device1", "device2"}, - NetworkUnhealthy: []string{"device1", "device2"}, + AvailableDeviceList: []string{}, + CardUnHealthy: []string{"device1", "device2"}, + NetworkUnhealthy: []string{"device1", "device2"}, FaultDeviceList: map[string][]constant.DeviceFault{ "device1": { {FaultType: constant.CardUnhealthy, FaultLevel: constant.NotHandleFault}, -- Gitee From 86ca7621790380cf42dbfff5ffeb8e8688a14dff Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Thu, 24 Apr 2025 14:49:46 +0800 Subject: [PATCH 17/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20fix=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../clusterd/pkg/common/constant/const.go | 4 +- .../pkg/domain/faultdomain/fault_utils.go | 65 ++++--------------- .../domain/faultdomain/fault_utils_test.go | 53 ++++++--------- .../resource/pub_fault_processor_test.yaml | 8 +-- .../resource/uce_fault_processor_test.yaml | 8 +-- 5 files changed, 43 insertions(+), 95 deletions(-) diff --git a/component/clusterd/pkg/common/constant/const.go b/component/clusterd/pkg/common/constant/const.go index b2bbd079e..8d25e33a4 100644 --- a/component/clusterd/pkg/common/constant/const.go +++ b/component/clusterd/pkg/common/constant/const.go @@ -184,9 +184,9 @@ const ( // CmRecoveringSuffix Recovering Suffix CmRecoveringSuffix = "-Recovering" // CmCardUnhealthySuffix CardUnhealthy Suffix - CmCardUnhealthySuffix = "-CardUnhealthy" + CmCardUnhealthySuffix = "-Unhealthy" // CmCardNetworkUnhealthySuffix NetworkUnhealthy Suffix - CmCardNetworkUnhealthySuffix = "-CardNetworkUnhealthy" + CmCardNetworkUnhealthySuffix = "-NetworkUnhealthy" // CmFaultListSuffix FaultList Suffix CmFaultListSuffix = "-Fault" ) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index cfe92b193..fdc0d46b9 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -395,28 +395,6 @@ func isFaultDeletable(faults []constant.DeviceFault, faultTypes []string, faultL return true } -func removeUnhealthy(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { - newCardUnHealthy := make([]string, 0) - newNetworkUnhealthy := make([]string, 0) - for deviceName, faults := range advanceDeviceCm.FaultDeviceList { - deletableFaultLevels := []string{constant.NotHandleFault, constant.SubHealthFault} - if isFaultDeletable(faults, []string{constant.CardUnhealthy, constant.PublicFaultType}, deletableFaultLevels) { - advanceDeviceCm.CardUnHealthy = util.DeleteStringSliceItem(advanceDeviceCm.CardUnHealthy, deviceName) - hwlog.RunLog.Debugf("remove device %s from CardUnHealthy", deviceName) - } else { - newCardUnHealthy = append(newCardUnHealthy, deviceName) - } - if isFaultDeletable(faults, []string{constant.CardNetworkUnhealthy}, deletableFaultLevels) { - advanceDeviceCm.NetworkUnhealthy = util.DeleteStringSliceItem(advanceDeviceCm.NetworkUnhealthy, deviceName) - hwlog.RunLog.Debugf("remove device %s from NetworkUnhealthy", deviceName) - } else { - newNetworkUnhealthy = append(newCardUnHealthy, deviceName) - } - } - advanceDeviceCm.CardUnHealthy = newCardUnHealthy - advanceDeviceCm.NetworkUnhealthy = newNetworkUnhealthy -} - func mergeCode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { for deviceName, faults := range advanceDeviceCm.FaultDeviceList { if len(faults) == 0 { @@ -431,39 +409,24 @@ func mergeCode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { } } -func addUnhealthy(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { +func FixUnhealthyInfo(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { + newCardUnHealthy := make([]string, 0) + newNetworkUnhealthy := make([]string, 0) + availableDeviceList := advanceDeviceCm.AvailableDeviceList for deviceName, faults := range advanceDeviceCm.FaultDeviceList { - shouldAddInCardUnhealthy := false - shouldAddInCardNetworkUnhealthy := false - for _, fault := range faults { - if fault.FaultType == constant.CardUnhealthy || fault.FaultType == constant.PublicFaultType { - if fault.FaultLevel != constant.NormalNPU && fault.FaultLevel != constant.NotHandleFault { - shouldAddInCardUnhealthy = true - } - } - if fault.FaultType == constant.CardNetworkUnhealthy { - if fault.FaultLevel != constant.NormalNetwork && fault.FaultLevel != constant.NotHandleFault { - shouldAddInCardNetworkUnhealthy = true - } - } - } - if shouldAddInCardNetworkUnhealthy { - advanceDeviceCm.NetworkUnhealthy = - sets.NewString(advanceDeviceCm.NetworkUnhealthy...).Insert(deviceName).List() - } - if shouldAddInCardUnhealthy { - advanceDeviceCm.CardUnHealthy = - sets.NewString(advanceDeviceCm.CardUnHealthy...).Insert(deviceName).List() + deletableFaultLevels := []string{constant.NotHandleFault, constant.SubHealthFault} + if !isFaultDeletable(faults, []string{constant.CardUnhealthy, constant.PublicFaultType}, deletableFaultLevels) { + newCardUnHealthy = append(newCardUnHealthy, deviceName) + availableDeviceList = util.DeleteStringSliceItem(availableDeviceList, deviceName) } - if shouldAddInCardUnhealthy || shouldAddInCardNetworkUnhealthy { - advanceDeviceCm.AvailableDeviceList = util.DeleteStringSliceItem(advanceDeviceCm.AvailableDeviceList, deviceName) + if !isFaultDeletable(faults, []string{constant.CardNetworkUnhealthy}, deletableFaultLevels) { + newNetworkUnhealthy = append(newNetworkUnhealthy, deviceName) + availableDeviceList = util.DeleteStringSliceItem(availableDeviceList, deviceName) } } -} - -func FixUnhealthyInfo(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { - removeUnhealthy(advanceDeviceCm) - addUnhealthy(advanceDeviceCm) + advanceDeviceCm.CardUnHealthy = newCardUnHealthy + advanceDeviceCm.NetworkUnhealthy = newNetworkUnhealthy + advanceDeviceCm.AvailableDeviceList = availableDeviceList } func getNetworkUnhealthyString(devInfo *constant.DeviceInfo) (string, string) { diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index 206285e81..ec27255bc 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -750,32 +750,6 @@ func TestSortDataForAdvanceDeviceInfo(t *testing.T) { }) } -func TestAddUnhealthy(t *testing.T) { - convey.Convey("Test addUnhealthy", t, func() { - baseCm := &constant.AdvanceDeviceFaultCm{ - AvailableDeviceList: []string{"device1", "device2"}, - CardUnHealthy: []string{}, - NetworkUnhealthy: []string{}, - FaultDeviceList: map[string][]constant.DeviceFault{ - "device1": { - {FaultType: constant.CardUnhealthy, FaultLevel: constant.SeparateNPU}, - }, - "device2": { - {FaultType: constant.CardNetworkUnhealthy, FaultLevel: constant.SeparateNPU}, - }, - }, - } - - convey.Convey("should add to CardUnHealthy for card unhealthy fault", func() { - addUnhealthy(baseCm) - convey.So(baseCm.CardUnHealthy, convey.ShouldContain, "device1") - convey.So(baseCm.NetworkUnhealthy, convey.ShouldContain, "device2") - convey.So(baseCm.AvailableDeviceList, convey.ShouldBeEmpty) - - }) - }) -} - func TestMergeCode(t *testing.T) { patches := gomonkey.NewPatches() defer patches.Reset() @@ -798,14 +772,14 @@ func TestMergeCode(t *testing.T) { }) } -func TestRemoveUnhealthy(t *testing.T) { +func TestFixUnhealthyInfo(t *testing.T) { patches := gomonkey.NewPatches() defer patches.Reset() - convey.Convey("Test removeUnhealthy", t, func() { + convey.Convey("Test FixUnhealthyInfo", t, func() { baseCm := &constant.AdvanceDeviceFaultCm{ - AvailableDeviceList: []string{}, - CardUnHealthy: []string{"device1", "device2"}, - NetworkUnhealthy: []string{"device1", "device2"}, + AvailableDeviceList: []string{"device0", "device1", "device2", "device3", "device4", "device5", "device6"}, + CardUnHealthy: []string{"device1", "device2", "device3", "device4"}, + NetworkUnhealthy: []string{"device1", "device2", "device3"}, FaultDeviceList: map[string][]constant.DeviceFault{ "device1": { {FaultType: constant.CardUnhealthy, FaultLevel: constant.NotHandleFault}, @@ -813,12 +787,23 @@ func TestRemoveUnhealthy(t *testing.T) { "device2": { {FaultType: constant.CardNetworkUnhealthy, FaultLevel: constant.SubHealthFault}, }, + "device3": { + {FaultType: constant.PublicFaultType, FaultLevel: constant.SubHealthFault}, + }, + "device4": { + {FaultType: constant.PublicFaultType, FaultLevel: constant.SeparateNPU}, + }, + "device5": { + {FaultType: constant.CardNetworkUnhealthy, FaultLevel: constant.SeparateNPU}, + }, }, } convey.Convey("should remove from unhealthy", func() { - removeUnhealthy(baseCm) - convey.So(baseCm.CardUnHealthy, convey.ShouldBeEmpty) - convey.So(baseCm.NetworkUnhealthy, convey.ShouldBeEmpty) + FixUnhealthyInfo(baseCm) + convey.So(baseCm.CardUnHealthy, convey.ShouldResemble, []string{"device4"}) + convey.So(baseCm.NetworkUnhealthy, convey.ShouldResemble, []string{"device5"}) + convey.So(baseCm.AvailableDeviceList, convey.ShouldResemble, + []string{"device0", "device1", "device2", "device3", "device6"}) }) }) } diff --git a/component/clusterd/testdata/resource/pub_fault_processor_test.yaml b/component/clusterd/testdata/resource/pub_fault_processor_test.yaml index 05c212444..c8783d560 100644 --- a/component/clusterd/testdata/resource/pub_fault_processor_test.yaml +++ b/component/clusterd/testdata/resource/pub_fault_processor_test.yaml @@ -9,7 +9,7 @@ mindx-dl-deviceinfo-node1: "large_model_fault_level": "PreSeparateNPU", "fault_level": "PreSeparateNPU", "fault_handling": "PreSeparateNPU", "fault_time_and_level_map": { "4C1F8608": { "fault_time": 1234567890, "fault_level": "PreSeparateNPU" } } } ] - huawei.com/Ascend910-CardUnhealthy: Ascend910-5 + huawei.com/Ascend910-Unhealthy: Ascend910-5 --- # device info cm after public fault processor. public fault: unhealthy card is Ascend910-0, Ascend910-1 mindx-dl-deviceinfo-node1: @@ -28,7 +28,7 @@ mindx-dl-deviceinfo-node1: "large_model_fault_level": "SeparateNPU", "fault_level": "SeparateNPU", "fault_handling": "SeparateNPU", "fault_time_and_level_map": { "010001001": { "fault_time": 1739866717, "fault_level": "SeparateNPU" } } } ] - huawei.com/Ascend910-CardUnhealthy: Ascend910-0,Ascend910-1,Ascend910-5 + huawei.com/Ascend910-Unhealthy: Ascend910-0,Ascend910-1,Ascend910-5 --- # device info cm: unhealthy card is Ascend910-5 mindx-dl-deviceinfo-node2: @@ -41,7 +41,7 @@ mindx-dl-deviceinfo-node2: "large_model_fault_level": "PreSeparateNPU", "fault_level": "PreSeparateNPU", "fault_handling": "PreSeparateNPU", "fault_time_and_level_map": { "4C1F8608": { "fault_time": 1234567890, "fault_level": "PreSeparateNPU" } } } ] - huawei.com/Ascend910-CardUnhealthy: Ascend910-5 + huawei.com/Ascend910-Unhealthy: Ascend910-5 --- # device info cm after public fault processor. public fault: unhealthy card is Ascend910-0, Ascend910-5 mindx-dl-deviceinfo-node2: @@ -60,4 +60,4 @@ mindx-dl-deviceinfo-node2: "large_model_fault_level": "SeparateNPU", "fault_level": "SeparateNPU", "fault_handling": "SeparateNPU", "fault_time_and_level_map": { "010001001": { "fault_time": 1739866717, "fault_level": "SeparateNPU" } } } ] - huawei.com/Ascend910-CardUnhealthy: Ascend910-0,Ascend910-5 \ No newline at end of file + huawei.com/Ascend910-Unhealthy: Ascend910-0,Ascend910-5 \ No newline at end of file diff --git a/component/clusterd/testdata/resource/uce_fault_processor_test.yaml b/component/clusterd/testdata/resource/uce_fault_processor_test.yaml index 52f420ac3..9ead1d0d0 100644 --- a/component/clusterd/testdata/resource/uce_fault_processor_test.yaml +++ b/component/clusterd/testdata/resource/uce_fault_processor_test.yaml @@ -18,7 +18,7 @@ mindx-dl-deviceinfo-node1: "22222222": { "fault_time": 100000, "fault_level": "RestartBusiness" }, "33333333": { "fault_time": 100000, "fault_level": "RestartBusiness" } } } ] - huawei.com/Ascend910-CardUnhealthy: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4 + huawei.com/Ascend910-Unhealthy: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4 mindx-dl-deviceinfo-node2: CmName: "mindx-dl-deviceinfo-node2" DeviceList: @@ -36,7 +36,7 @@ mindx-dl-deviceinfo-node2: { "fault_type": "CardUnhealthy", "npu_name": "Ascend910-4", "fault_code": "80E01801", "fault_time_and_level_map": { "80E01801": { "fault_time": 100000, "fault_level": "RestartBusiness" } } } ] - huawei.com/Ascend910-CardUnhealthy: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4 + huawei.com/Ascend910-Unhealthy: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4 --- # the result after uce processor process mindx-dl-deviceinfo-node1: @@ -50,7 +50,7 @@ mindx-dl-deviceinfo-node1: "22222222": { "fault_time": 100000, "fault_level": "RestartBusiness" }, "33333333": { "fault_time": 100000, "fault_level": "RestartBusiness" } } } ] - huawei.com/Ascend910-CardUnhealthy: Ascend910-4 + huawei.com/Ascend910-Unhealthy: Ascend910-4 mindx-dl-deviceinfo-node2: CmName: "mindx-dl-deviceinfo-node2" DeviceList: @@ -60,7 +60,7 @@ mindx-dl-deviceinfo-node2: "large_model_fault_level":"RestartBusiness","fault_level":"RestartBusiness","fault_handling":"RestartBusiness", "fault_time_and_level_map": { "22222222": { "fault_time": 100000, "fault_level": "RestartBusiness" } } } ] - huawei.com/Ascend910-CardUnhealthy: Ascend910-0 + huawei.com/Ascend910-Unhealthy: Ascend910-0 --- # the output of processor.getUceDeviceOfNodes() node1: -- Gitee From 5e63b6bf2fc20166daf7971152e10f5a76a3f220 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Fri, 25 Apr 2025 11:38:15 +0800 Subject: [PATCH 18/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20notify=20mindie=20and=20grpc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/application/fault/grpc_service.go | 7 +++ .../application/fault/publish_fault_plugin.go | 16 +------ .../cmprocess/base_fault_center.go | 5 +- .../faultmanager/fault_process_center.go | 7 ++- .../jobprocess/fault_job_center.go | 48 ++++++++++++++++++- .../recover/fault_recover_service.go | 22 ++++----- .../recover/fault_recover_service_test.go | 21 +++----- .../clusterd/pkg/common/constant/type.go | 3 +- .../cmmanager/configmap_manager_test.go | 9 ++-- 9 files changed, 88 insertions(+), 50 deletions(-) diff --git a/component/clusterd/pkg/application/fault/grpc_service.go b/component/clusterd/pkg/application/fault/grpc_service.go index 3ab8695e7..e37c46290 100644 --- a/component/clusterd/pkg/application/fault/grpc_service.go +++ b/component/clusterd/pkg/application/fault/grpc_service.go @@ -18,10 +18,12 @@ package fault import ( "context" "fmt" + "reflect" "sync" "ascend-common/common-utils/hwlog" "clusterd/pkg/application/config" + "clusterd/pkg/application/faultmanager" "clusterd/pkg/common/constant" "clusterd/pkg/domain/common" "clusterd/pkg/domain/job" @@ -34,6 +36,7 @@ type FaultServer struct { faultPublisher map[string]*config.ConfigPublisher[*fault.FaultMsgSignal] lock sync.RWMutex fault.UnimplementedFaultServer + subFaultCh chan map[string]constant.JobFaultInfo } // NewFaultServer create a fault server @@ -42,6 +45,10 @@ func NewFaultServer(ctx context.Context) *FaultServer { serviceCtx: ctx, faultPublisher: make(map[string]*config.ConfigPublisher[*fault.FaultMsgSignal]), lock: sync.RWMutex{}, + subFaultCh: make(chan map[string]constant.JobFaultInfo), + } + if err := faultmanager.RegisterForJobFaultRank(server.subFaultCh, reflect.TypeOf(server).Name()); err != nil { + hwlog.RunLog.Errorf("RegisterForJobFaultRank fail") } go server.checkFaultFromFaultCenter() return server diff --git a/component/clusterd/pkg/application/fault/publish_fault_plugin.go b/component/clusterd/pkg/application/fault/publish_fault_plugin.go index 34a70195c..d609b6250 100644 --- a/component/clusterd/pkg/application/fault/publish_fault_plugin.go +++ b/component/clusterd/pkg/application/fault/publish_fault_plugin.go @@ -6,36 +6,22 @@ package fault import ( "sort" "sync" - "time" "github.com/golang/protobuf/proto" "ascend-common/common-utils/hwlog" "clusterd/pkg/application/config" - "clusterd/pkg/application/faultmanager" "clusterd/pkg/common/constant" "clusterd/pkg/domain/job" "clusterd/pkg/interface/grpc/fault" ) -const ( - publishFaultInterval = 1 -) - func (s *FaultServer) checkFaultFromFaultCenter() { - ticker := time.NewTicker(time.Duration(publishFaultInterval) * time.Second) - defer ticker.Stop() for { select { case <-s.serviceCtx.Done(): return - case <-ticker.C: - hwlog.RunLog.Debug("ticker publish fault from global center") - if faultmanager.GlobalFaultProcessCenter == nil { - hwlog.RunLog.Warnf("global center is nil, try it after %d second", publishFaultInterval) - return - } - allJobFaultInfo := faultmanager.QueryJobsFaultInfo(constant.NotHandleFault) + case allJobFaultInfo := <-s.subFaultCh: hwlog.RunLog.Debugf("global fault info: %v", allJobFaultInfo) s.checkPublishFault(allJobFaultInfo) } diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go index 07ea9aec6..ce18ba288 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go @@ -4,11 +4,12 @@ package cmprocess import ( + "fmt" + "sync" + "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" "clusterd/pkg/domain/faultdomain/cmmanager" - "fmt" - "sync" ) type baseFaultCenter[T constant.ConfigMapInterface] struct { diff --git a/component/clusterd/pkg/application/faultmanager/fault_process_center.go b/component/clusterd/pkg/application/faultmanager/fault_process_center.go index 33015cedc..a39f69026 100644 --- a/component/clusterd/pkg/application/faultmanager/fault_process_center.go +++ b/component/clusterd/pkg/application/faultmanager/fault_process_center.go @@ -32,12 +32,10 @@ type faultProcessCenter struct { } func (center *faultProcessCenter) Process() { - hwlog.RunLog.Info("begin process") cmprocess.DeviceCenter.Process() cmprocess.NodeCenter.Process() cmprocess.SwitchCenter.Process() jobprocess.FaultJobCenter.Process() - hwlog.RunLog.Info("end process") } func (center *faultProcessCenter) notifyFaultCenterProcess(whichToProcess int) { @@ -150,3 +148,8 @@ func PubFaultCollector(oldPubFaultInfo, newPubFaultInfo *api.PubFaultInfo, opera } publicfault.PubFaultCollector(newPubFaultInfo) } + +// RegisterForJobFaultRank register for job fault info +func RegisterForJobFaultRank(ch chan map[string]constant.JobFaultInfo, info string) error { + return jobprocess.FaultJobCenter.Register(ch, info) +} diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go index ebe879cb0..c5eb947db 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go @@ -4,6 +4,10 @@ package jobprocess import ( + "fmt" + "sync" + + "ascend-common/common-utils/hwlog" "clusterd/pkg/application/faultmanager/cmprocess" "clusterd/pkg/application/faultmanager/jobprocess/faultrank" "clusterd/pkg/application/faultmanager/jobprocess/relationfault" @@ -14,7 +18,14 @@ import ( var FaultJobCenter *faultJobProcessCenter type faultJobProcessCenter struct { - processorList []constant.FaultProcessor + processorList []constant.FaultProcessor + subscribeChannelList []*subscriber + mutex sync.Mutex +} + +type subscriber struct { + ch chan map[string]constant.JobFaultInfo + info string } func init() { @@ -35,4 +46,39 @@ func (fJobCenter *faultJobProcessCenter) Process() { for _, processor := range fJobCenter.processorList { processor.Process(content) } + fJobCenter.notifySubscriber() +} + +// Register notify chan +func (fJobCenter *faultJobProcessCenter) Register(ch chan map[string]constant.JobFaultInfo, info string) error { + if ch == nil { + return fmt.Errorf("invalid chanel for send") + } + fJobCenter.mutex.Lock() + defer fJobCenter.mutex.Unlock() + if fJobCenter.subscribeChannelList == nil { + fJobCenter.subscribeChannelList = make([]*subscriber, 0) + } + length := len(fJobCenter.subscribeChannelList) + if length > constant.MaxFaultCenterSubscriber { + return fmt.Errorf("the number of registrants is %d, cannot add any more", length) + } + fJobCenter.subscribeChannelList = append(fJobCenter.subscribeChannelList, &subscriber{ + ch: ch, + info: info, + }) + return nil +} + +func (fJobCenter *faultJobProcessCenter) notifySubscriber() { + faultRankInfos := faultrank.JobFaultRankProcessor.GetJobFaultRankInfosFilterLevel(constant.NotHandleFault) + for _, sub := range fJobCenter.subscribeChannelList { + if sub.ch != nil { + select { + case sub.ch <- faultRankInfos: + default: + hwlog.RunLog.Warnf("notify %s fault rank failed.", sub.info) + } + } + } } diff --git a/component/clusterd/pkg/application/recover/fault_recover_service.go b/component/clusterd/pkg/application/recover/fault_recover_service.go index b055da3fc..a7261cba2 100644 --- a/component/clusterd/pkg/application/recover/fault_recover_service.go +++ b/component/clusterd/pkg/application/recover/fault_recover_service.go @@ -6,6 +6,7 @@ package recover import ( "context" "fmt" + "reflect" "sync" "time" @@ -27,6 +28,8 @@ type FaultRecoverService struct { eventCtl map[string]*EventController initJob map[string]common.JobBaseInfo lock sync.RWMutex + subFaultCh chan map[string]constant.JobFaultInfo + lastFaultInfo map[string]constant.JobFaultInfo pb.UnimplementedRecoverServer } @@ -37,6 +40,11 @@ func NewFaultRecoverService(keepAlive int, ctx context.Context) *FaultRecoverSer s.serviceCtx = ctx s.eventCtl = make(map[string]*EventController) s.initJob = make(map[string]common.JobBaseInfo) + s.subFaultCh = make(chan map[string]constant.JobFaultInfo) + s.lastFaultInfo = make(map[string]constant.JobFaultInfo) + if err := faultmanager.RegisterForJobFaultRank(s.subFaultCh, reflect.TypeOf(s).Name()); err != nil { + hwlog.RunLog.Errorf("RegisterForJobFaultRank fail") + } go s.checkFaultFromFaultCenter() return s } @@ -101,12 +109,7 @@ func (s *FaultRecoverService) dealWithJobFaultInfo(jobFaultInfoList []constant.J wg.Wait() } -func (s *FaultRecoverService) checkFault() { - if faultmanager.GlobalFaultProcessCenter == nil { - hwlog.RunLog.Warnf("global center is nil, try it after %d second", globalFaultBeaconSecond) - return - } - allJobFaultInfo := faultmanager.QueryJobsFaultInfo(constant.NotHandleFault) +func (s *FaultRecoverService) checkFault(allJobFaultInfo map[string]constant.JobFaultInfo) { var registeredJobInfo []constant.JobFaultInfo for jobId, jobFaultInfo := range allJobFaultInfo { if !s.registered(jobId) { @@ -121,15 +124,12 @@ func (s *FaultRecoverService) checkFault() { } func (s *FaultRecoverService) checkFaultFromFaultCenter() { - ticker := time.NewTicker(time.Duration(globalFaultBeaconSecond) * time.Second) - defer ticker.Stop() for { select { case <-s.serviceCtx.Done(): return - case <-ticker.C: - hwlog.RunLog.Debug("ticker check npu fault from global center") - s.checkFault() + case allJobFaultInfo := <-s.subFaultCh: + s.checkFault(allJobFaultInfo) } } } diff --git a/component/clusterd/pkg/application/recover/fault_recover_service_test.go b/component/clusterd/pkg/application/recover/fault_recover_service_test.go index 8e69e2a0c..824e4f413 100644 --- a/component/clusterd/pkg/application/recover/fault_recover_service_test.go +++ b/component/clusterd/pkg/application/recover/fault_recover_service_test.go @@ -495,25 +495,18 @@ func TestDealWithJobFaultInfo(t *testing.T) { func TestCheckFault(t *testing.T) { convey.Convey("Testing checkFault", t, func() { service := fakeService() - patches := gomonkey.ApplyFunc(faultmanager.QueryJobsFaultInfo, - func(faultLevel string) map[string]constant.JobFaultInfo { - return map[string]constant.JobFaultInfo{ - fakeJobID1: {JobId: fakeJobID1, FaultList: []constant.FaultRank{{}}}, - fakeJobID2: {JobId: fakeJobID2, FaultList: []constant.FaultRank{{}}}, - } - }).ApplyFunc(service.registered, func(jobId string) bool { - if jobId == "job1" { - return true - } - return false - }).ApplyFunc(service.dealWithJobFaultInfo, func(jobFaultInfoList []constant.JobFaultInfo) { + patches := gomonkey.ApplyFunc(service.dealWithJobFaultInfo, func(jobFaultInfoList []constant.JobFaultInfo) { convey.So(jobFaultInfoList, convey.ShouldHaveLength, 1) }) defer patches.Reset() - service.checkFault() + info := map[string]constant.JobFaultInfo{ + fakeJobID1: {JobId: fakeJobID1, FaultList: []constant.FaultRank{{}}}, + fakeJobID2: {JobId: fakeJobID2, FaultList: []constant.FaultRank{{}}}, + } + service.checkFault(info) faultmanager.GlobalFaultProcessCenter = nil - service.checkFault() + service.checkFault(info) }) } diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index b7edd8466..fa79c18a0 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -4,10 +4,11 @@ package constant import ( + "k8s.io/utils/strings/slices" + "ascend-common/api" "ascend-common/common-utils/hwlog" "clusterd/pkg/common/util" - "k8s.io/utils/strings/slices" ) // FaultTimeAndLevel of each fault code diff --git a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go index 89c051fe4..76854b083 100644 --- a/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go +++ b/component/clusterd/pkg/domain/faultdomain/cmmanager/configmap_manager_test.go @@ -4,16 +4,17 @@ package cmmanager import ( - "clusterd/pkg/common/util" - "clusterd/pkg/domain/faultdomain/collector" - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" "reflect" "sync" "testing" + "github.com/agiledragon/gomonkey/v2" + "github.com/smartystreets/goconvey/convey" + "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" + "clusterd/pkg/common/util" + "clusterd/pkg/domain/faultdomain/collector" ) func TestMain(m *testing.M) { -- Gitee From 7e02a979345601319a1c8648aba2c762a5c22a33 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Sun, 27 Apr 2025 10:34:25 +0800 Subject: [PATCH 19/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20cleancode=20ServerType->DeviceTy?= =?UTF-8?q?pe?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cmprocess/publicfault/pub_fault_processor.go | 2 +- .../cmprocess/uce/uce_fault_processor.go | 2 +- .../faultrank/job_fault_rank_processor.go | 2 +- .../faultrank/job_fault_rank_processor_test.go | 6 +++--- .../relationfault/relation_fault_process_test.go | 2 +- .../relationfault/relation_fault_processor.go | 2 +- component/clusterd/pkg/common/constant/type.go | 14 +++++++------- .../clusterd/pkg/domain/faultdomain/fault_utils.go | 2 +- .../pkg/domain/faultdomain/fault_utils_test.go | 2 +- 9 files changed, 17 insertions(+), 17 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go index 3b77e63cb..a790f7d1b 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go @@ -63,7 +63,7 @@ func (p *pubFaultProcessor) faultJoin() { modified := false for _, pubFaultCache := range p.pubFaultInfo { // add public fault to fault list - pubFaultCache.FaultDevNames = convertNPUIdsToName(pubFaultCache.FaultDevIds, p.devCMInfo.ServerType) + pubFaultCache.FaultDevNames = convertNPUIdsToName(pubFaultCache.FaultDevIds, p.devCMInfo.DeviceType) for _, faultDevName := range pubFaultCache.FaultDevNames { fault := constant.DeviceFault{ FaultType: constant.PublicFaultType, diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go index ccb588b4c..2d01ee038 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go @@ -50,7 +50,7 @@ func (processor *uceFaultProcessor) initUceDeviceFromNodeAndReportInfo(jobId str } for _, deviceOfJob := range devicesOfJobOnNode.DeviceList { - deviceName := processor.nodeDeviceCmMap[nodeName].ServerType + "-" + deviceOfJob.DeviceID + deviceName := processor.nodeDeviceCmMap[nodeName].DeviceType + "-" + deviceOfJob.DeviceID uceReportInfo := collector.ReportInfoCollector.GetInfo(jobId, nodeName, deviceName) jobUceDevice := constant.UceDeviceInfo{ DeviceName: deviceName, diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go index 5d1a317d2..cc78a338d 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go @@ -83,7 +83,7 @@ func (processor *jobRankFaultInfoProcessor) findFaultRankForJob( return faultRankList } for _, deviceInfo := range devicesOfJobOnNode.DeviceList { - deviceName := advanceDeviceInfo.ServerType + "-" + deviceInfo.DeviceID + deviceName := advanceDeviceInfo.DeviceType + "-" + deviceInfo.DeviceID faultList := advanceDeviceInfo.FaultDeviceList[deviceName] podRank, podUid := pod.GetPodRankAndPodUid(jobId, deviceInfo.RankID) uceInManagementPlane := false diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go index d311b5d13..ddbefdd69 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor_test.go @@ -178,7 +178,7 @@ func testNoDevicesOnNode(processor *jobRankFaultInfoProcessor) { convey.Convey("When no devices on node", func() { nodeDeviceInfoMap := map[string]*constant.AdvanceDeviceFaultCm{ "node1": { - ServerType: "server-type", + DeviceType: "server-type", FaultDeviceList: map[string][]constant.DeviceFault{}, }, } @@ -198,7 +198,7 @@ func testUceInManagementPlane(processor *jobRankFaultInfoProcessor) { convey.Convey("When UCE fault in management plane", func() { nodeDeviceInfoMap := map[string]*constant.AdvanceDeviceFaultCm{ "node1": { - ServerType: "server-type", + DeviceType: "server-type", FaultDeviceList: map[string][]constant.DeviceFault{ "server-type-device1": { {FaultCode: constant.UceFaultCode, FaultLevel: constant.RestartBusiness}, @@ -232,7 +232,7 @@ func testUceInBusinessPlane(processor *jobRankFaultInfoProcessor) { convey.Convey("When UCE fault in business plane", func() { nodeDeviceInfoMap := map[string]*constant.AdvanceDeviceFaultCm{ "node1": { - ServerType: "server-type", + DeviceType: "server-type", FaultDeviceList: map[string][]constant.DeviceFault{}, }, } diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go index 3d06b5767..0bc258d11 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_process_test.go @@ -480,7 +480,7 @@ func testInitByDeviceFault(fJob *FaultJob) { convey.Convey("When initializing by device fault", func() { cardName := "server-type-device1" nodeFaultInfo := &constant.AdvanceDeviceFaultCm{ - ServerType: "server-type", + DeviceType: "server-type", FaultDeviceList: map[string][]constant.DeviceFault{ cardName: { { diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go index d34eff099..2195e835e 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/relationfault/relation_fault_processor.go @@ -330,7 +330,7 @@ func (fJob *FaultJob) initByDeviceFault(nodeFaultInfo *constant.AdvanceDeviceFau return } for _, deviceInfo := range serverList.DeviceList { - deviceName := nodeFaultInfo.ServerType + "-" + deviceInfo.DeviceID + deviceName := nodeFaultInfo.DeviceType + "-" + deviceInfo.DeviceID fault, ok := nodeFaultInfo.FaultDeviceList[deviceName] if !ok { continue diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index fa79c18a0..f27621479 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -230,7 +230,7 @@ type FaultProcessor interface { // AdvanceDeviceFaultCm more structure device info type AdvanceDeviceFaultCm struct { - ServerType string + DeviceType string CmName string SuperPodID int32 ServerIndex int32 @@ -256,7 +256,7 @@ func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { } return true } - return cm.ServerType == thatCm.ServerType && + return cm.DeviceType == thatCm.DeviceType && cm.CmName == thatCm.CmName && cm.SuperPodID == thatCm.SuperPodID && cm.ServerIndex == thatCm.ServerIndex && @@ -274,27 +274,27 @@ func (cm *AdvanceDeviceFaultCm) GetCmName() string { // GetRecoveringKey return cm RecoveringKey func (cm *AdvanceDeviceFaultCm) GetRecoveringKey() string { - return api.ResourceNamePrefix + cm.ServerType + CmRecoveringSuffix + return api.ResourceNamePrefix + cm.DeviceType + CmRecoveringSuffix } // GetCardUnHealthyKey return cm CardUnHealthyKey func (cm *AdvanceDeviceFaultCm) GetCardUnHealthyKey() string { - return api.ResourceNamePrefix + cm.ServerType + CmCardUnhealthySuffix + return api.ResourceNamePrefix + cm.DeviceType + CmCardUnhealthySuffix } // GetNetworkUnhealthyKey return cm NetworkUnhealthyKey func (cm *AdvanceDeviceFaultCm) GetNetworkUnhealthyKey() string { - return api.ResourceNamePrefix + cm.ServerType + CmCardNetworkUnhealthySuffix + return api.ResourceNamePrefix + cm.DeviceType + CmCardNetworkUnhealthySuffix } // GetFaultDeviceListKey return cm FaultDeviceListKey func (cm *AdvanceDeviceFaultCm) GetFaultDeviceListKey() string { - return api.ResourceNamePrefix + cm.ServerType + CmFaultListSuffix + return api.ResourceNamePrefix + cm.DeviceType + CmFaultListSuffix } // GetAvailableDeviceListKey return cm AvailableDeviceListKey func (cm *AdvanceDeviceFaultCm) GetAvailableDeviceListKey() string { - return api.ResourceNamePrefix + cm.ServerType + return api.ResourceNamePrefix + cm.DeviceType } // InformerCmItem informer configmap item of queue or buffer diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index fdc0d46b9..de22d15d0 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -86,7 +86,7 @@ func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) *constant.AdvanceDeviceFau SuperPodID: devInfo.SuperPodID, ServerIndex: devInfo.ServerIndex, UpdateTime: devInfo.UpdateTime, - ServerType: GetDeviceType(devInfo), + DeviceType: GetDeviceType(devInfo), } advanceDeviceCm.FaultDeviceList = getFaultListInfo(devInfo) advanceDeviceCm.NetworkUnhealthy = getNetworkUnhealthyCardList(devInfo) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index ec27255bc..3e7b54046 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -599,7 +599,7 @@ func TestAdvanceFaultCmToOriginalFaultCm(t *testing.T) { node1 := "node1" node2 := "node2" mockAdvanceCm1 := &constant.AdvanceDeviceFaultCm{ - ServerType: "", + DeviceType: "", CmName: "CmName-" + node1, SuperPodID: 0, ServerIndex: 0, -- Gitee From bf759e9bf4a52a288144b4b50ec1d61b109e0cea Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Sun, 27 Apr 2025 14:54:37 +0800 Subject: [PATCH 20/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20cleancode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/application/fault/grpc_service.go | 8 +- .../application/fault/publish_fault_plugin.go | 6 +- .../cmprocess/uce/uce_fault_processor.go | 8 +- .../uce_accompany_fault_processor.go | 4 +- .../uce_accompany_fault_processor_test.go | 6 +- .../faultmanager/fault_process_center.go | 4 +- .../jobprocess/fault_job_center.go | 24 +++--- .../faultrank/job_fault_rank_processor.go | 4 +- .../recover/fault_recover_service.go | 10 +-- .../clusterd/pkg/common/constant/type.go | 34 ++++++++- component/clusterd/pkg/common/util/util.go | 28 ------- .../pkg/domain/faultdomain/fault_utils.go | 62 +++++++-------- .../domain/faultdomain/fault_utils_test.go | 76 +++++++++---------- 13 files changed, 136 insertions(+), 138 deletions(-) diff --git a/component/clusterd/pkg/application/fault/grpc_service.go b/component/clusterd/pkg/application/fault/grpc_service.go index e37c46290..0fe83f6f1 100644 --- a/component/clusterd/pkg/application/fault/grpc_service.go +++ b/component/clusterd/pkg/application/fault/grpc_service.go @@ -36,7 +36,7 @@ type FaultServer struct { faultPublisher map[string]*config.ConfigPublisher[*fault.FaultMsgSignal] lock sync.RWMutex fault.UnimplementedFaultServer - subFaultCh chan map[string]constant.JobFaultInfo + faultCh chan map[string]constant.JobFaultInfo } // NewFaultServer create a fault server @@ -45,10 +45,10 @@ func NewFaultServer(ctx context.Context) *FaultServer { serviceCtx: ctx, faultPublisher: make(map[string]*config.ConfigPublisher[*fault.FaultMsgSignal]), lock: sync.RWMutex{}, - subFaultCh: make(chan map[string]constant.JobFaultInfo), + faultCh: make(chan map[string]constant.JobFaultInfo, 5), } - if err := faultmanager.RegisterForJobFaultRank(server.subFaultCh, reflect.TypeOf(server).Name()); err != nil { - hwlog.RunLog.Errorf("RegisterForJobFaultRank fail") + if err := faultmanager.RegisterForJobFaultRank(server.faultCh, reflect.TypeOf(server).Name()); err != nil { + hwlog.RunLog.Error("RegisterForJobFaultRank fail") } go server.checkFaultFromFaultCenter() return server diff --git a/component/clusterd/pkg/application/fault/publish_fault_plugin.go b/component/clusterd/pkg/application/fault/publish_fault_plugin.go index d609b6250..f5120a6ba 100644 --- a/component/clusterd/pkg/application/fault/publish_fault_plugin.go +++ b/component/clusterd/pkg/application/fault/publish_fault_plugin.go @@ -21,7 +21,11 @@ func (s *FaultServer) checkFaultFromFaultCenter() { select { case <-s.serviceCtx.Done(): return - case allJobFaultInfo := <-s.subFaultCh: + case allJobFaultInfo, ok := <-s.faultCh: + if !ok { + hwlog.RunLog.Info("faultCh has been closed.") + return + } hwlog.RunLog.Debugf("global fault info: %v", allJobFaultInfo) s.checkPublishFault(allJobFaultInfo) } diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go index 2d01ee038..fca9afc40 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go @@ -80,20 +80,20 @@ func (processor *uceFaultProcessor) Process(info any) any { processor.jobServerInfoMap = job.GetJobServerInfoMap() processor.nodeDeviceCmMap = processContent.AllConfigmap - hwlog.RunLog.Debugf("current nodeDeviceCmMap %s", util.ObjToString(processor.nodeDeviceCmMap)) + hwlog.RunLog.Debugf("current nodeDeviceCmMap %v", processor.nodeDeviceCmMap) processor.uceDeviceOfNode = processor.getUceDeviceOfNodes() - hwlog.RunLog.Debugf("current uceDeviceOfNode %s", util.ObjToString(processor.uceDeviceOfNode)) + hwlog.RunLog.Debugf("current uceDeviceOfNode %v", processor.uceDeviceOfNode) processor.uceDevicesOfUceJob = processor.getUceDevicesForUceTolerateJobs() - hwlog.RunLog.Debugf("current uceDevicesOfUceJob %s", util.ObjToString(processor.uceDevicesOfUceJob)) + hwlog.RunLog.Debugf("current uceDevicesOfUceJob %v", processor.uceDevicesOfUceJob) currentTime := time.Now().UnixMilli() hwlog.RunLog.Debugf("currentTime %d", currentTime) processor.processUceFaultInfo(currentTime) - hwlog.RunLog.Debugf("result deviceInfos %s", util.ObjToString(processContent.AllConfigmap)) + hwlog.RunLog.Debugf("result deviceInfos %v", processContent.AllConfigmap) return processContent } diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go index 5bdc046e6..33c00371c 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go @@ -179,10 +179,10 @@ func (processor *uceAccompanyFaultProcessor) Process(info any) any { return info } processor.deviceCmForNodeMap = processContent.AllConfigmap - hwlog.RunLog.Debugf("current deviceInfos: %s", util.ObjToString(processContent.AllConfigmap)) + hwlog.RunLog.Debugf("current deviceInfos: %v", processContent.AllConfigmap) processor.uceAccompanyFaultInQue() - hwlog.RunLog.Debugf("current uceAccompanyFaultQue: %s", util.ObjToString(processor.uceAccompanyFaultQue)) + hwlog.RunLog.Debugf("current uceAccompanyFaultQue: %v", processor.uceAccompanyFaultQue) currentTime := time.Now().UnixMilli() processor.filterFaultInfos(currentTime) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go index 4d2a927d6..28dc25039 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go @@ -74,12 +74,12 @@ func TestUceAccompanyFaultProcessorProcessE2E(t *testing.T) { mockNow.Reset() mockUnixMilli.Reset() }() - resultContent := UceAccompanyProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) + res := UceAccompanyProcessor.Process(content).(constant.OneConfigmapContent[*constant.AdvanceDeviceFaultCm]) exp := faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expectProcessedDeviceInfos) - if !reflect.DeepEqual(resultContent.AllConfigmap, exp) { + if !reflect.DeepEqual(res.AllConfigmap, exp) { t.Errorf("result:\n%v\nwant:\n%v", - util.ObjToString(resultContent.AllConfigmap), + util.ObjToString(res.AllConfigmap), util.ObjToString(exp)) } diff --git a/component/clusterd/pkg/application/faultmanager/fault_process_center.go b/component/clusterd/pkg/application/faultmanager/fault_process_center.go index a39f69026..3f1485436 100644 --- a/component/clusterd/pkg/application/faultmanager/fault_process_center.go +++ b/component/clusterd/pkg/application/faultmanager/fault_process_center.go @@ -150,6 +150,6 @@ func PubFaultCollector(oldPubFaultInfo, newPubFaultInfo *api.PubFaultInfo, opera } // RegisterForJobFaultRank register for job fault info -func RegisterForJobFaultRank(ch chan map[string]constant.JobFaultInfo, info string) error { - return jobprocess.FaultJobCenter.Register(ch, info) +func RegisterForJobFaultRank(ch chan map[string]constant.JobFaultInfo, src string) error { + return jobprocess.FaultJobCenter.Register(ch, src) } diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go index c5eb947db..c6a03bc4f 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go @@ -24,8 +24,8 @@ type faultJobProcessCenter struct { } type subscriber struct { - ch chan map[string]constant.JobFaultInfo - info string + ch chan map[string]constant.JobFaultInfo + src string } func init() { @@ -50,7 +50,7 @@ func (fJobCenter *faultJobProcessCenter) Process() { } // Register notify chan -func (fJobCenter *faultJobProcessCenter) Register(ch chan map[string]constant.JobFaultInfo, info string) error { +func (fJobCenter *faultJobProcessCenter) Register(ch chan map[string]constant.JobFaultInfo, src string) error { if ch == nil { return fmt.Errorf("invalid chanel for send") } @@ -64,8 +64,8 @@ func (fJobCenter *faultJobProcessCenter) Register(ch chan map[string]constant.Jo return fmt.Errorf("the number of registrants is %d, cannot add any more", length) } fJobCenter.subscribeChannelList = append(fJobCenter.subscribeChannelList, &subscriber{ - ch: ch, - info: info, + ch: ch, + src: src, }) return nil } @@ -73,12 +73,14 @@ func (fJobCenter *faultJobProcessCenter) Register(ch chan map[string]constant.Jo func (fJobCenter *faultJobProcessCenter) notifySubscriber() { faultRankInfos := faultrank.JobFaultRankProcessor.GetJobFaultRankInfosFilterLevel(constant.NotHandleFault) for _, sub := range fJobCenter.subscribeChannelList { - if sub.ch != nil { - select { - case sub.ch <- faultRankInfos: - default: - hwlog.RunLog.Warnf("notify %s fault rank failed.", sub.info) - } + if sub.ch == nil { + continue } + select { + case sub.ch <- faultRankInfos: + default: + hwlog.RunLog.Warnf("notify %s fault rank failed.", sub.src) + } + } } diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go index cc78a338d..b344c13c3 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/faultrank/job_fault_rank_processor.go @@ -78,10 +78,10 @@ func (processor *jobRankFaultInfoProcessor) findFaultRankForJob( advanceDeviceInfo *constant.AdvanceDeviceFaultCm, nodeName string, serverList map[string]constant.ServerHccl, jobId string) []constant.FaultRank { devicesOfJobOnNode, ok := serverList[nodeName] - faultRankList := make([]constant.FaultRank, 0) if advanceDeviceInfo == nil || !ok || len(devicesOfJobOnNode.DeviceList) == 0 { - return faultRankList + return make([]constant.FaultRank, 0) } + faultRankList := make([]constant.FaultRank, 0) for _, deviceInfo := range devicesOfJobOnNode.DeviceList { deviceName := advanceDeviceInfo.DeviceType + "-" + deviceInfo.DeviceID faultList := advanceDeviceInfo.FaultDeviceList[deviceName] diff --git a/component/clusterd/pkg/application/recover/fault_recover_service.go b/component/clusterd/pkg/application/recover/fault_recover_service.go index a7261cba2..9f5d0af68 100644 --- a/component/clusterd/pkg/application/recover/fault_recover_service.go +++ b/component/clusterd/pkg/application/recover/fault_recover_service.go @@ -28,8 +28,7 @@ type FaultRecoverService struct { eventCtl map[string]*EventController initJob map[string]common.JobBaseInfo lock sync.RWMutex - subFaultCh chan map[string]constant.JobFaultInfo - lastFaultInfo map[string]constant.JobFaultInfo + faultCh chan map[string]constant.JobFaultInfo pb.UnimplementedRecoverServer } @@ -40,9 +39,8 @@ func NewFaultRecoverService(keepAlive int, ctx context.Context) *FaultRecoverSer s.serviceCtx = ctx s.eventCtl = make(map[string]*EventController) s.initJob = make(map[string]common.JobBaseInfo) - s.subFaultCh = make(chan map[string]constant.JobFaultInfo) - s.lastFaultInfo = make(map[string]constant.JobFaultInfo) - if err := faultmanager.RegisterForJobFaultRank(s.subFaultCh, reflect.TypeOf(s).Name()); err != nil { + s.faultCh = make(chan map[string]constant.JobFaultInfo, 5) + if err := faultmanager.RegisterForJobFaultRank(s.faultCh, reflect.TypeOf(s).Name()); err != nil { hwlog.RunLog.Errorf("RegisterForJobFaultRank fail") } go s.checkFaultFromFaultCenter() @@ -128,7 +126,7 @@ func (s *FaultRecoverService) checkFaultFromFaultCenter() { select { case <-s.serviceCtx.Done(): return - case allJobFaultInfo := <-s.subFaultCh: + case allJobFaultInfo := <-s.faultCh: s.checkFault(allJobFaultInfo) } } diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index f27621479..5f439ddb2 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -4,11 +4,12 @@ package constant import ( + "maps" + "k8s.io/utils/strings/slices" "ascend-common/api" "ascend-common/common-utils/hwlog" - "clusterd/pkg/common/util" ) // FaultTimeAndLevel of each fault code @@ -37,7 +38,7 @@ func equalDeviceFault(one, other *DeviceFault) bool { one.FaultLevel == other.FaultLevel && one.FaultHandling == other.FaultHandling && one.FaultCode == other.FaultCode && - util.MapEqual(one.FaultTimeAndLevelMap, other.FaultTimeAndLevelMap) + maps.Equal(one.FaultTimeAndLevelMap, other.FaultTimeAndLevelMap) } // NodeInfoCM the config map struct of node info @@ -244,7 +245,14 @@ type AdvanceDeviceFaultCm struct { // IsSame compare two AdvanceDeviceFaultCm, do not care UpdateTime func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { - thatCm := another.(*AdvanceDeviceFaultCm) + if cm == nil { + hwlog.RunLog.Error("cm is nil") + return false + } + thatCm, ok := another.(*AdvanceDeviceFaultCm) + if !ok { + return false + } eq := func(faultListOne []DeviceFault, faultListOther []DeviceFault) bool { if len(faultListOne) != len(faultListOther) { return false @@ -264,36 +272,54 @@ func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { slices.Equal(cm.Recovering, thatCm.Recovering) && slices.Equal(cm.CardUnHealthy, thatCm.CardUnHealthy) && slices.Equal(cm.NetworkUnhealthy, thatCm.NetworkUnhealthy) && - util.MapEqualFunc(cm.FaultDeviceList, thatCm.FaultDeviceList, eq) + maps.EqualFunc(cm.FaultDeviceList, thatCm.FaultDeviceList, eq) } // GetCmName return cm name func (cm *AdvanceDeviceFaultCm) GetCmName() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } return cm.CmName } // GetRecoveringKey return cm RecoveringKey func (cm *AdvanceDeviceFaultCm) GetRecoveringKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } return api.ResourceNamePrefix + cm.DeviceType + CmRecoveringSuffix } // GetCardUnHealthyKey return cm CardUnHealthyKey func (cm *AdvanceDeviceFaultCm) GetCardUnHealthyKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } return api.ResourceNamePrefix + cm.DeviceType + CmCardUnhealthySuffix } // GetNetworkUnhealthyKey return cm NetworkUnhealthyKey func (cm *AdvanceDeviceFaultCm) GetNetworkUnhealthyKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } return api.ResourceNamePrefix + cm.DeviceType + CmCardNetworkUnhealthySuffix } // GetFaultDeviceListKey return cm FaultDeviceListKey func (cm *AdvanceDeviceFaultCm) GetFaultDeviceListKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } return api.ResourceNamePrefix + cm.DeviceType + CmFaultListSuffix } // GetAvailableDeviceListKey return cm AvailableDeviceListKey func (cm *AdvanceDeviceFaultCm) GetAvailableDeviceListKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } return api.ResourceNamePrefix + cm.DeviceType } diff --git a/component/clusterd/pkg/common/util/util.go b/component/clusterd/pkg/common/util/util.go index 313b8c828..72f3eb2c4 100644 --- a/component/clusterd/pkg/common/util/util.go +++ b/component/clusterd/pkg/common/util/util.go @@ -180,31 +180,3 @@ func RemoveDuplicates[T comparable](slice []T) []T { } return result } - -// MapEqual reports whether two maps contain the same key/value pairs. -// Values are compared using ==. -func MapEqual[M1, M2 ~map[K]V, K, V comparable](m1 M1, m2 M2) bool { - if len(m1) != len(m2) { - return false - } - for k, v1 := range m1 { - if v2, ok := m2[k]; !ok || v1 != v2 { - return false - } - } - return true -} - -// MapEqualFunc is like Equal, but compares values using eq. -// Keys are still compared with ==. -func MapEqualFunc[M1 ~map[K]V1, M2 ~map[K]V2, K comparable, V1, V2 any](m1 M1, m2 M2, eq func(V1, V2) bool) bool { - if len(m1) != len(m2) { - return false - } - for k, v1 := range m1 { - if v2, ok := m2[k]; !ok || !eq(v1, v2) { - return false - } - } - return true -} diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index de22d15d0..ab0ab18ad 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -74,9 +74,10 @@ func GetAdvanceFaultForNode[T constant.ConfigMapInterface](cmForNode T) constant return cm case *constant.AdvanceDeviceFaultCm: return cm + default: + hwlog.RunLog.Errorf("cmForNode type is not support.") + return nil } - hwlog.RunLog.Errorf("cmForNode type is not support.") - return nil } // GetAdvanceDeviceCm return more usable device cm @@ -98,30 +99,29 @@ func GetAdvanceDeviceCm(devInfo *constant.DeviceInfo) *constant.AdvanceDeviceFau func getFaultListInfo(devInfo *constant.DeviceInfo) map[string][]constant.DeviceFault { _, faultList := getFaultListString(devInfo) - if len(faultList) > 0 { - var devicesFault []constant.DeviceFault - err := json.Unmarshal([]byte(faultList), &devicesFault) - if err != nil { - hwlog.RunLog.Errorf("get fault list for node %v failed. "+ - "Json unmarshall exception: %v", devInfo.CmName, err) - return make(map[string][]constant.DeviceFault) - } - deviceFaultMap := make(map[string][]constant.DeviceFault) - for _, deviceFault := range devicesFault { - if _, ok := deviceFaultMap[deviceFault.NPUName]; !ok { - deviceFaultMap[deviceFault.NPUName] = make([]constant.DeviceFault, 0) - } - hwlog.RunLog.Debugf("device fault: %s of cm %s, time: %s", - util.ObjToString(deviceFault), devInfo.CmName, util.ReadableMsTime(devInfo.UpdateTime)) - // device plugin may merge multiple fault codes in one string - deviceFaults := splitDeviceFault(deviceFault, CmNameToNodeName(devInfo.CmName)) - deviceFaultMap[deviceFault.NPUName] = append(deviceFaultMap[deviceFault.NPUName], deviceFaults...) - } - return deviceFaultMap - } else { + if len(faultList) == 0 { hwlog.RunLog.Infof("get fault list for node %v failed. fault list does not exist", devInfo.CmName) return make(map[string][]constant.DeviceFault) } + var devicesFault []constant.DeviceFault + err := json.Unmarshal([]byte(faultList), &devicesFault) + if err != nil { + hwlog.RunLog.Errorf("get fault list for node %v failed. "+ + "Json unmarshall exception: %v", devInfo.CmName, err) + return make(map[string][]constant.DeviceFault) + } + deviceFaultMap := make(map[string][]constant.DeviceFault) + for _, deviceFault := range devicesFault { + if _, ok := deviceFaultMap[deviceFault.NPUName]; !ok { + deviceFaultMap[deviceFault.NPUName] = make([]constant.DeviceFault, 0) + } + hwlog.RunLog.Debugf("device fault: %s of cm %s, time: %s", + util.ObjToString(deviceFault), devInfo.CmName, util.ReadableMsTime(devInfo.UpdateTime)) + // device plugin may merge multiple fault codes in one string + deviceFaults := splitDeviceFault(deviceFault, CmNameToNodeName(devInfo.CmName)) + deviceFaultMap[deviceFault.NPUName] = append(deviceFaultMap[deviceFault.NPUName], deviceFaults...) + } + return deviceFaultMap } func getCardUnHealthy(devInfo *constant.DeviceInfo) []string { @@ -303,26 +303,27 @@ func AddFaultIntoFaultMap(faultMap map[string][]constant.DeviceFault, func AdvanceFaultCmToOriginalFaultCm[U, T constant.ConfigMapInterface](advanceFaultCm map[string]T) map[string]U { orgFaultCm := make(map[string]U) for _, advanceCmForNode := range advanceFaultCm { - orgFaultCm[advanceCmForNode.GetCmName()] = AdvanceFaultCmToOriginalCmForNode(advanceCmForNode).(U) + orgFaultCm[advanceCmForNode.GetCmName()] = AdvanceCmToOriginalCmForNode(advanceCmForNode).(U) } return orgFaultCm } -func AdvanceFaultCmToOriginalCmForNode[T constant.ConfigMapInterface](advanceCmForNode T) constant.ConfigMapInterface { +func AdvanceCmToOriginalCmForNode[T constant.ConfigMapInterface](advanceCmForNode T) constant.ConfigMapInterface { switch cm := any(advanceCmForNode).(type) { case *constant.AdvanceDeviceFaultCm: - return AdvanceDeviceFaultCmToOriginalCmForNode(cm) + return AdvanceDevCmToOrigCmForNode(cm) case *constant.SwitchInfo: return cm case *constant.NodeInfo: return cm + default: + hwlog.RunLog.Errorf("AdvanceFaultCmToOriginalCmForNode don't support this type.") + return nil } - hwlog.RunLog.Errorf("AdvanceFaultCmToOriginalCmForNode don't support this type.") - return nil } -// AdvanceDeviceFaultCmToOriginalCmForNode convert advance device cm to original format -func AdvanceDeviceFaultCmToOriginalCmForNode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) *constant.DeviceInfo { +// AdvanceDevCmToOrigCmForNode convert advance device cm to original format +func AdvanceDevCmToOrigCmForNode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) *constant.DeviceInfo { orgDeviceCm := &constant.DeviceInfo{ DeviceInfoNoName: constant.DeviceInfoNoName{ DeviceList: make(map[string]string), @@ -409,6 +410,7 @@ func mergeCode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { } } +// FixUnhealthyInfo fix the CardUnHealthy/NetworkUnhealthy/AvailableDevice list func FixUnhealthyInfo(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { newCardUnHealthy := make([]string, 0) newNetworkUnhealthy := make([]string, 0) diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index 3e7b54046..1676aa037 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -653,24 +653,22 @@ func TestGetSortedKeys(t *testing.T) { } func TestCompareFaultTimeAndLevel(t *testing.T) { - convey.Convey("Test compareFaultTimeAndLevel", t, func() { - convey.Convey("should compare by FaultTime first", func() { - a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "low"} - b := constant.FaultTimeAndLevel{FaultTime: 200, FaultLevel: "high"} - convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldBeLessThan, 0) - }) + convey.Convey("should compare by FaultTime first", t, func() { + a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "low"} + b := constant.FaultTimeAndLevel{FaultTime: 200, FaultLevel: "high"} + convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldBeLessThan, 0) + }) - convey.Convey("should compare by FaultLevel when FaultTime equal", func() { - a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "low"} - b := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "high"} - convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldBeGreaterThan, 0) - }) + convey.Convey("should compare by FaultLevel when FaultTime equal", t, func() { + a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "low"} + b := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "high"} + convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldBeGreaterThan, 0) + }) - convey.Convey("should return 0 when both equal", func() { - a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "medium"} - b := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "medium"} - convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldEqual, 0) - }) + convey.Convey("should return 0 when both equal", t, func() { + a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "medium"} + b := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "medium"} + convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldEqual, 0) }) } @@ -719,34 +717,30 @@ func TestCompareDeviceFault(t *testing.T) { } func TestSortDataForAdvanceDeviceInfo(t *testing.T) { - convey.Convey("Test SortDataForAdvanceDeviceInfo", t, func() { - // Mock dependencies - - deviceInfo := &constant.AdvanceDeviceFaultCm{ - AvailableDeviceList: []string{"d3", "d1", "d2"}, - CardUnHealthy: []string{"c2", "c1"}, - NetworkUnhealthy: []string{"n2", "n1"}, - Recovering: []string{"r2", "r1"}, - FaultDeviceList: map[string][]constant.DeviceFault{ - "list1": {{FaultType: "typeB"}, {FaultType: "typeA"}}, - }, - } + deviceInfo := &constant.AdvanceDeviceFaultCm{ + AvailableDeviceList: []string{"d3", "d1", "d2"}, + CardUnHealthy: []string{"c2", "c1"}, + NetworkUnhealthy: []string{"n2", "n1"}, + Recovering: []string{"r2", "r1"}, + FaultDeviceList: map[string][]constant.DeviceFault{ + "list1": {{FaultType: "typeB"}, {FaultType: "typeA"}}, + }, + } - expDeviceInfo := &constant.AdvanceDeviceFaultCm{ - AvailableDeviceList: []string{"d1", "d2", "d3"}, - CardUnHealthy: []string{"c1", "c2"}, - NetworkUnhealthy: []string{"n1", "n2"}, - Recovering: []string{"r1", "r2"}, - FaultDeviceList: map[string][]constant.DeviceFault{ - "list1": {{FaultType: "typeA"}, {FaultType: "typeB"}}, - }, - } + expDeviceInfo := &constant.AdvanceDeviceFaultCm{ + AvailableDeviceList: []string{"d1", "d2", "d3"}, + CardUnHealthy: []string{"c1", "c2"}, + NetworkUnhealthy: []string{"n1", "n2"}, + Recovering: []string{"r1", "r2"}, + FaultDeviceList: map[string][]constant.DeviceFault{ + "list1": {{FaultType: "typeA"}, {FaultType: "typeB"}}, + }, + } - SortDataForAdvanceDeviceInfo(deviceInfo) + SortDataForAdvanceDeviceInfo(deviceInfo) - convey.Convey("should be equal", func() { - convey.So(deviceInfo, convey.ShouldResemble, expDeviceInfo) - }) + convey.Convey("should be equal", t, func() { + convey.So(deviceInfo, convey.ShouldResemble, expDeviceInfo) }) } -- Gitee From aa11dd79f83ef658512d9590c42e639177cdf971 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Sun, 27 Apr 2025 15:20:34 +0800 Subject: [PATCH 21/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20move=20funcs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jobprocess/fault_job_center.go | 1 - .../clusterd/pkg/common/constant/type.go | 204 ----------------- .../clusterd/pkg/common/constant/type_func.go | 208 ++++++++++++++++++ .../domain/faultdomain/fault_utils_test.go | 4 +- 4 files changed, 209 insertions(+), 208 deletions(-) create mode 100644 component/clusterd/pkg/common/constant/type_func.go diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go index c6a03bc4f..3e1211016 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go @@ -81,6 +81,5 @@ func (fJobCenter *faultJobProcessCenter) notifySubscriber() { default: hwlog.RunLog.Warnf("notify %s fault rank failed.", sub.src) } - } } diff --git a/component/clusterd/pkg/common/constant/type.go b/component/clusterd/pkg/common/constant/type.go index 5f439ddb2..18b7bd2cc 100644 --- a/component/clusterd/pkg/common/constant/type.go +++ b/component/clusterd/pkg/common/constant/type.go @@ -3,15 +3,6 @@ // Package constant a series of para package constant -import ( - "maps" - - "k8s.io/utils/strings/slices" - - "ascend-common/api" - "ascend-common/common-utils/hwlog" -) - // FaultTimeAndLevel of each fault code // some fault may not have accurate fault time and level, // for example: duration fault use current time as `FaultTime` @@ -31,16 +22,6 @@ type DeviceFault struct { FaultTimeAndLevelMap map[string]FaultTimeAndLevel `json:"fault_time_and_level_map"` } -func equalDeviceFault(one, other *DeviceFault) bool { - return one.FaultType == other.FaultType && - one.NPUName == other.NPUName && - one.LargeModelFaultLevel == other.LargeModelFaultLevel && - one.FaultLevel == other.FaultLevel && - one.FaultHandling == other.FaultHandling && - one.FaultCode == other.FaultCode && - maps.Equal(one.FaultTimeAndLevelMap, other.FaultTimeAndLevelMap) -} - // NodeInfoCM the config map struct of node info type NodeInfoCM struct { NodeInfo NodeInfoNoName @@ -243,86 +224,6 @@ type AdvanceDeviceFaultCm struct { UpdateTime int64 } -// IsSame compare two AdvanceDeviceFaultCm, do not care UpdateTime -func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { - if cm == nil { - hwlog.RunLog.Error("cm is nil") - return false - } - thatCm, ok := another.(*AdvanceDeviceFaultCm) - if !ok { - return false - } - eq := func(faultListOne []DeviceFault, faultListOther []DeviceFault) bool { - if len(faultListOne) != len(faultListOther) { - return false - } - for i, fault := range faultListOne { - if !equalDeviceFault(&fault, &faultListOther[i]) { - return false - } - } - return true - } - return cm.DeviceType == thatCm.DeviceType && - cm.CmName == thatCm.CmName && - cm.SuperPodID == thatCm.SuperPodID && - cm.ServerIndex == thatCm.ServerIndex && - slices.Equal(cm.AvailableDeviceList, thatCm.AvailableDeviceList) && - slices.Equal(cm.Recovering, thatCm.Recovering) && - slices.Equal(cm.CardUnHealthy, thatCm.CardUnHealthy) && - slices.Equal(cm.NetworkUnhealthy, thatCm.NetworkUnhealthy) && - maps.EqualFunc(cm.FaultDeviceList, thatCm.FaultDeviceList, eq) -} - -// GetCmName return cm name -func (cm *AdvanceDeviceFaultCm) GetCmName() string { - if cm == nil { - hwlog.RunLog.Error("cm is nil") - } - return cm.CmName -} - -// GetRecoveringKey return cm RecoveringKey -func (cm *AdvanceDeviceFaultCm) GetRecoveringKey() string { - if cm == nil { - hwlog.RunLog.Error("cm is nil") - } - return api.ResourceNamePrefix + cm.DeviceType + CmRecoveringSuffix -} - -// GetCardUnHealthyKey return cm CardUnHealthyKey -func (cm *AdvanceDeviceFaultCm) GetCardUnHealthyKey() string { - if cm == nil { - hwlog.RunLog.Error("cm is nil") - } - return api.ResourceNamePrefix + cm.DeviceType + CmCardUnhealthySuffix -} - -// GetNetworkUnhealthyKey return cm NetworkUnhealthyKey -func (cm *AdvanceDeviceFaultCm) GetNetworkUnhealthyKey() string { - if cm == nil { - hwlog.RunLog.Error("cm is nil") - } - return api.ResourceNamePrefix + cm.DeviceType + CmCardNetworkUnhealthySuffix -} - -// GetFaultDeviceListKey return cm FaultDeviceListKey -func (cm *AdvanceDeviceFaultCm) GetFaultDeviceListKey() string { - if cm == nil { - hwlog.RunLog.Error("cm is nil") - } - return api.ResourceNamePrefix + cm.DeviceType + CmFaultListSuffix -} - -// GetAvailableDeviceListKey return cm AvailableDeviceListKey -func (cm *AdvanceDeviceFaultCm) GetAvailableDeviceListKey() string { - if cm == nil { - hwlog.RunLog.Error("cm is nil") - } - return api.ResourceNamePrefix + cm.DeviceType -} - // InformerCmItem informer configmap item of queue or buffer type InformerCmItem[T ConfigMapInterface] struct { IsAdd bool @@ -348,111 +249,6 @@ type ConfigMapInterface interface { IsSame(another ConfigMapInterface) bool } -// GetCmName get configmap name of device info -func (cm *DeviceInfo) GetCmName() string { - return cm.CmName -} - -// GetCmName get configmap name of switch info -func (cm *SwitchInfo) GetCmName() string { - return cm.CmName -} - -// GetCmName get configmap name of node info -func (cm *NodeInfo) GetCmName() string { - return cm.CmName -} - -// IsSame compare with another cm -func (cm *DeviceInfo) IsSame(another ConfigMapInterface) bool { - anotherDeviceInfo, ok := another.(*DeviceInfo) - if !ok { - hwlog.RunLog.Warnf("compare with cm which is not DeviceInfo") - return false - } - return !DeviceInfoBusinessDataIsNotEqual(cm, anotherDeviceInfo) -} - -// IsSame compare with another cm -func (cm *SwitchInfo) IsSame(another ConfigMapInterface) bool { - anotherSwitchInfo, ok := another.(*SwitchInfo) - if !ok { - hwlog.RunLog.Warnf("compare with cm which is not SwitchInfo") - return false - } - return !SwitchInfoBusinessDataIsNotEqual(cm, anotherSwitchInfo) -} - -// IsSame compare with another cm -func (cm *NodeInfo) IsSame(another ConfigMapInterface) bool { - anotherNodeInfo, ok := another.(*NodeInfo) - if !ok { - hwlog.RunLog.Warnf("compare with cm which is not NodeInfo") - return false - } - return !NodeInfoBusinessDataIsNotEqual(cm, anotherNodeInfo) -} - -// DeviceInfoBusinessDataIsNotEqual determine the business data is not equal -func DeviceInfoBusinessDataIsNotEqual(oldDevInfo *DeviceInfo, devInfo *DeviceInfo) bool { - if oldDevInfo == nil && devInfo == nil { - hwlog.RunLog.Debug("both oldDevInfo and devInfo are nil") - return false - } - if oldDevInfo == nil || devInfo == nil { - hwlog.RunLog.Debug("one of oldDevInfo and devInfo is not empty, and the other is empty") - return true - } - if len(oldDevInfo.DeviceList) != len(devInfo.DeviceList) { - hwlog.RunLog.Debug("the length of the deviceList of oldDevInfo is not equal to that of the deviceList of devInfo") - return true - } - for nKey, nValue := range oldDevInfo.DeviceList { - oValue, exists := devInfo.DeviceList[nKey] - if !exists || nValue != oValue { - hwlog.RunLog.Debug("neither oldDevInfo nor devInfo is empty, but oldDevInfo is not equal to devInfo") - return true - } - } - hwlog.RunLog.Debug("oldDevInfo is equal to devInfo") - return false -} - -// SwitchInfoBusinessDataIsNotEqual judge is the faultcode and fault level is the same as known, if is not same returns true -func SwitchInfoBusinessDataIsNotEqual(oldSwitch, newSwitch *SwitchInfo) bool { - if oldSwitch == nil && newSwitch == nil { - return false - } - if (oldSwitch != nil && newSwitch == nil) || (oldSwitch == nil && newSwitch != nil) { - return true - } - if newSwitch.FaultLevel != oldSwitch.FaultLevel || newSwitch.NodeStatus != oldSwitch.NodeStatus || - len(newSwitch.FaultCode) != len(oldSwitch.FaultCode) { - return true - } - hwlog.RunLog.Debug("oldSwitch is equal to newSwitch") - return false -} - -// NodeInfoBusinessDataIsNotEqual determine the business data is not equal -func NodeInfoBusinessDataIsNotEqual(oldNodeInfo *NodeInfo, newNodeInfo *NodeInfo) bool { - if oldNodeInfo == nil && newNodeInfo == nil { - hwlog.RunLog.Debug("both oldNodeInfo and newNodeInfo are nil") - return false - } - if oldNodeInfo == nil || newNodeInfo == nil { - hwlog.RunLog.Debug("one of oldNodeInfo and newNodeInfo is not empty, and the other is empty") - return true - } - if oldNodeInfo.NodeStatus != newNodeInfo.NodeStatus || - len(oldNodeInfo.FaultDevList) != len(newNodeInfo.FaultDevList) { - hwlog.RunLog.Debug("neither oldNodeInfo nor newNodeInfo is empty, but oldNodeInfo is not equal to newNodeInfo") - return true - } - hwlog.RunLog.Debug("oldNodeInfo is equal to newNodeInfo") - return false -} - // FaultRank defines the structure for storing fault rank information. // It includes the rank ID and fault code. type FaultRank struct { diff --git a/component/clusterd/pkg/common/constant/type_func.go b/component/clusterd/pkg/common/constant/type_func.go new file mode 100644 index 000000000..ab3a0f7d8 --- /dev/null +++ b/component/clusterd/pkg/common/constant/type_func.go @@ -0,0 +1,208 @@ +// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + +// Package constant a series of para +package constant + +import ( + "maps" + + "k8s.io/utils/strings/slices" + + "ascend-common/api" + "ascend-common/common-utils/hwlog" +) + +func equalDeviceFault(one, other *DeviceFault) bool { + return one.FaultType == other.FaultType && + one.NPUName == other.NPUName && + one.LargeModelFaultLevel == other.LargeModelFaultLevel && + one.FaultLevel == other.FaultLevel && + one.FaultHandling == other.FaultHandling && + one.FaultCode == other.FaultCode && + maps.Equal(one.FaultTimeAndLevelMap, other.FaultTimeAndLevelMap) +} + +// IsSame compare two AdvanceDeviceFaultCm, do not care UpdateTime +func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + return false + } + thatCm, ok := another.(*AdvanceDeviceFaultCm) + if !ok { + return false + } + eq := func(faultListOne []DeviceFault, faultListOther []DeviceFault) bool { + if len(faultListOne) != len(faultListOther) { + return false + } + for i, fault := range faultListOne { + if !equalDeviceFault(&fault, &faultListOther[i]) { + return false + } + } + return true + } + return cm.DeviceType == thatCm.DeviceType && + cm.CmName == thatCm.CmName && + cm.SuperPodID == thatCm.SuperPodID && + cm.ServerIndex == thatCm.ServerIndex && + slices.Equal(cm.AvailableDeviceList, thatCm.AvailableDeviceList) && + slices.Equal(cm.Recovering, thatCm.Recovering) && + slices.Equal(cm.CardUnHealthy, thatCm.CardUnHealthy) && + slices.Equal(cm.NetworkUnhealthy, thatCm.NetworkUnhealthy) && + maps.EqualFunc(cm.FaultDeviceList, thatCm.FaultDeviceList, eq) +} + +// GetCmName return cm name +func (cm *AdvanceDeviceFaultCm) GetCmName() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } + return cm.CmName +} + +// GetRecoveringKey return cm RecoveringKey +func (cm *AdvanceDeviceFaultCm) GetRecoveringKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } + return api.ResourceNamePrefix + cm.DeviceType + CmRecoveringSuffix +} + +// GetCardUnHealthyKey return cm CardUnHealthyKey +func (cm *AdvanceDeviceFaultCm) GetCardUnHealthyKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } + return api.ResourceNamePrefix + cm.DeviceType + CmCardUnhealthySuffix +} + +// GetNetworkUnhealthyKey return cm NetworkUnhealthyKey +func (cm *AdvanceDeviceFaultCm) GetNetworkUnhealthyKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } + return api.ResourceNamePrefix + cm.DeviceType + CmCardNetworkUnhealthySuffix +} + +// GetFaultDeviceListKey return cm FaultDeviceListKey +func (cm *AdvanceDeviceFaultCm) GetFaultDeviceListKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } + return api.ResourceNamePrefix + cm.DeviceType + CmFaultListSuffix +} + +// GetAvailableDeviceListKey return cm AvailableDeviceListKey +func (cm *AdvanceDeviceFaultCm) GetAvailableDeviceListKey() string { + if cm == nil { + hwlog.RunLog.Error("cm is nil") + } + return api.ResourceNamePrefix + cm.DeviceType +} + +// GetCmName get configmap name of device info +func (cm *DeviceInfo) GetCmName() string { + return cm.CmName +} + +// GetCmName get configmap name of switch info +func (cm *SwitchInfo) GetCmName() string { + return cm.CmName +} + +// GetCmName get configmap name of node info +func (cm *NodeInfo) GetCmName() string { + return cm.CmName +} + +// IsSame compare with another cm +func (cm *DeviceInfo) IsSame(another ConfigMapInterface) bool { + anotherDeviceInfo, ok := another.(*DeviceInfo) + if !ok { + hwlog.RunLog.Warnf("compare with cm which is not DeviceInfo") + return false + } + return !DeviceInfoBusinessDataIsNotEqual(cm, anotherDeviceInfo) +} + +// IsSame compare with another cm +func (cm *SwitchInfo) IsSame(another ConfigMapInterface) bool { + anotherSwitchInfo, ok := another.(*SwitchInfo) + if !ok { + hwlog.RunLog.Warnf("compare with cm which is not SwitchInfo") + return false + } + return !SwitchInfoBusinessDataIsNotEqual(cm, anotherSwitchInfo) +} + +// IsSame compare with another cm +func (cm *NodeInfo) IsSame(another ConfigMapInterface) bool { + anotherNodeInfo, ok := another.(*NodeInfo) + if !ok { + hwlog.RunLog.Warnf("compare with cm which is not NodeInfo") + return false + } + return !NodeInfoBusinessDataIsNotEqual(cm, anotherNodeInfo) +} + +// DeviceInfoBusinessDataIsNotEqual determine the business data is not equal +func DeviceInfoBusinessDataIsNotEqual(oldDevInfo *DeviceInfo, devInfo *DeviceInfo) bool { + if oldDevInfo == nil && devInfo == nil { + hwlog.RunLog.Debug("both oldDevInfo and devInfo are nil") + return false + } + if oldDevInfo == nil || devInfo == nil { + hwlog.RunLog.Debug("one of oldDevInfo and devInfo is not empty, and the other is empty") + return true + } + if len(oldDevInfo.DeviceList) != len(devInfo.DeviceList) { + hwlog.RunLog.Debug("the length of the deviceList of oldDevInfo is not equal to that of the deviceList of devInfo") + return true + } + for nKey, nValue := range oldDevInfo.DeviceList { + oValue, exists := devInfo.DeviceList[nKey] + if !exists || nValue != oValue { + hwlog.RunLog.Debug("neither oldDevInfo nor devInfo is empty, but oldDevInfo is not equal to devInfo") + return true + } + } + hwlog.RunLog.Debug("oldDevInfo is equal to devInfo") + return false +} + +// SwitchInfoBusinessDataIsNotEqual judge is the faultcode and fault level is the same as known, if is not same returns true +func SwitchInfoBusinessDataIsNotEqual(oldSwitch, newSwitch *SwitchInfo) bool { + if oldSwitch == nil && newSwitch == nil { + return false + } + if (oldSwitch != nil && newSwitch == nil) || (oldSwitch == nil && newSwitch != nil) { + return true + } + if newSwitch.FaultLevel != oldSwitch.FaultLevel || newSwitch.NodeStatus != oldSwitch.NodeStatus || + len(newSwitch.FaultCode) != len(oldSwitch.FaultCode) { + return true + } + hwlog.RunLog.Debug("oldSwitch is equal to newSwitch") + return false +} + +// NodeInfoBusinessDataIsNotEqual determine the business data is not equal +func NodeInfoBusinessDataIsNotEqual(oldNodeInfo *NodeInfo, newNodeInfo *NodeInfo) bool { + if oldNodeInfo == nil && newNodeInfo == nil { + hwlog.RunLog.Debug("both oldNodeInfo and newNodeInfo are nil") + return false + } + if oldNodeInfo == nil || newNodeInfo == nil { + hwlog.RunLog.Debug("one of oldNodeInfo and newNodeInfo is not empty, and the other is empty") + return true + } + if oldNodeInfo.NodeStatus != newNodeInfo.NodeStatus || + len(oldNodeInfo.FaultDevList) != len(newNodeInfo.FaultDevList) { + hwlog.RunLog.Debug("neither oldNodeInfo nor newNodeInfo is empty, but oldNodeInfo is not equal to newNodeInfo") + return true + } + hwlog.RunLog.Debug("oldNodeInfo is equal to newNodeInfo") + return false +} diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index 1676aa037..0c303bf5d 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -736,10 +736,8 @@ func TestSortDataForAdvanceDeviceInfo(t *testing.T) { "list1": {{FaultType: "typeA"}, {FaultType: "typeB"}}, }, } - - SortDataForAdvanceDeviceInfo(deviceInfo) - convey.Convey("should be equal", t, func() { + SortDataForAdvanceDeviceInfo(deviceInfo) convey.So(deviceInfo, convey.ShouldResemble, expDeviceInfo) }) } -- Gitee From a5e495cb7bf8957ffa6c8c81ddb9789e74c2a13d Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Mon, 28 Apr 2025 10:15:22 +0800 Subject: [PATCH 22/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20move=20funcs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../uceaccompany/uce_accompany_fault_processor.go | 12 ++++++------ .../uce_accompany_fault_processor_test.go | 3 ++- .../faultmanager/jobprocess/fault_job_center.go | 5 ++--- .../pkg/common/constant/{type_func.go => methods.go} | 0 .../pkg/domain/faultdomain/fault_utils_test.go | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) rename component/clusterd/pkg/common/constant/{type_func.go => methods.go} (100%) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go index 33c00371c..ea51b804f 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go @@ -125,8 +125,8 @@ func (processor *uceAccompanyFaultProcessor) filterFaultDevice( accompanyFaultTime := faultdomain.GetFaultTime(fault, errorMsg) // if is accompanied fault, filter if processor.isAccompaniedFaultByUce(uceFaultTime, accompanyFaultTime) { - hwlog.RunLog.Warnf("filter uce accompany fault %s, fault time: %s", - util.ObjToString(fault), util.ReadableMsTime(accompanyFaultTime)) + hwlog.RunLog.Warnf("filter uce accompany fault %v, fault time: %s", + fault, util.ReadableMsTime(accompanyFaultTime)) deviceFaultCm.FaultDeviceList = faultdomain.DeleteFaultFromFaultMap(deviceFaultCm.FaultDeviceList, fault) modified = true continue @@ -134,8 +134,8 @@ func (processor *uceAccompanyFaultProcessor) filterFaultDevice( // if current is not exceed diagnosis time, // then cannot decide fault is accompany or not, filter, and in que to decide in next turn. if !processor.isCurrentExceedDiagnosisTimeout(currentTime, accompanyFaultTime) { - hwlog.RunLog.Warnf("filter uce accompany like fault %s, fault time: %s", - util.ObjToString(fault), util.ReadableMsTime(accompanyFaultTime)) + hwlog.RunLog.Warnf("filter uce accompany like fault %v, fault time: %s", + fault, util.ReadableMsTime(accompanyFaultTime)) deviceFaultCm.FaultDeviceList = faultdomain.DeleteFaultFromFaultMap(deviceFaultCm.FaultDeviceList, fault) modified = true newDeviceFaultQue = append(newDeviceFaultQue, fault) @@ -144,8 +144,8 @@ func (processor *uceAccompanyFaultProcessor) filterFaultDevice( // cannot filter, add the aic/aiv fault into faultMap deviceFaultCm.FaultDeviceList = faultdomain.AddFaultIntoFaultMap(deviceFaultCm.FaultDeviceList, fault) modified = true - hwlog.RunLog.Warnf("cannot filter uce accompany like fault %s, uce fault time: %s", - util.ObjToString(fault), util.ReadableMsTime(uceFaultTime)) + hwlog.RunLog.Warnf("cannot filter uce accompany like fault %v, uce fault time: %s", + fault, util.ReadableMsTime(uceFaultTime)) } if modified { faultdomain.FixUnhealthyInfo(deviceFaultCm) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go index 28dc25039..63180c9c8 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor_test.go @@ -36,7 +36,8 @@ func TestUceAccompanyFaultProcessorProcess(t *testing.T) { if testFileErr != nil { t.Errorf("init data failed. %v", testFileErr) } - UceAccompanyProcessor.deviceCmForNodeMap = faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos) + UceAccompanyProcessor.deviceCmForNodeMap = + faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos) UceAccompanyProcessor.uceAccompanyFaultInQue() UceAccompanyProcessor.filterFaultInfos(CurrentTime) if !reflect.DeepEqual(UceAccompanyProcessor.deviceCmForNodeMap, diff --git a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go index 3e1211016..f18e4a91f 100644 --- a/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go +++ b/component/clusterd/pkg/application/faultmanager/jobprocess/fault_job_center.go @@ -34,6 +34,8 @@ func init() { relationfault.RelationProcessor, faultrank.JobFaultRankProcessor, }, + mutex: sync.Mutex{}, + subscribeChannelList: make([]*subscriber, 0), } } @@ -56,9 +58,6 @@ func (fJobCenter *faultJobProcessCenter) Register(ch chan map[string]constant.Jo } fJobCenter.mutex.Lock() defer fJobCenter.mutex.Unlock() - if fJobCenter.subscribeChannelList == nil { - fJobCenter.subscribeChannelList = make([]*subscriber, 0) - } length := len(fJobCenter.subscribeChannelList) if length > constant.MaxFaultCenterSubscriber { return fmt.Errorf("the number of registrants is %d, cannot add any more", length) diff --git a/component/clusterd/pkg/common/constant/type_func.go b/component/clusterd/pkg/common/constant/methods.go similarity index 100% rename from component/clusterd/pkg/common/constant/type_func.go rename to component/clusterd/pkg/common/constant/methods.go diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go index 0c303bf5d..06e482e96 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go @@ -622,7 +622,7 @@ func TestAdvanceFaultCmToOriginalFaultCm(t *testing.T) { result := AdvanceFaultCmToOriginalFaultCm[*constant.DeviceInfo](input) - convey.So(len(result), convey.ShouldEqual, 2) + convey.So(len(result), convey.ShouldEqual, len(input)) }) }) } -- Gitee From 3c3ab343fc26fca178892e9af087fec82f7960c1 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Mon, 28 Apr 2025 10:18:02 +0800 Subject: [PATCH 23/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20remove=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/faultdomain/fault_utils_test.go | 801 ------------------ 1 file changed, 801 deletions(-) delete mode 100644 component/clusterd/pkg/domain/faultdomain/fault_utils_test.go diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go b/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go deleted file mode 100644 index 06e482e96..000000000 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils_test.go +++ /dev/null @@ -1,801 +0,0 @@ -// Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. - -// Package faultmanager contain fault process -package faultdomain - -import ( - "reflect" - "sort" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - "k8s.io/api/core/v1" - - "ascend-common/common-utils/hwlog" - "clusterd/pkg/common/constant" - "clusterd/pkg/common/util" -) - -const ( - jobId = "JobId" - nodeName = "Node" - time100Seconds = int64(100000) - time120Seconds = int64(120000) - time1Seconds = int64(1000) - deviceId = "0" - rankID = "8" - cmName = "mindx-dl-deviceinfo-" + nodeName - deviceName = constant.Ascend910 + "-" + deviceId - originalDeviceFaultCodeCnt = 2 -) - -var ( - jobServerMap = constant.JobServerInfoMap{ - InfoMap: map[string]map[string]constant.ServerHccl{ - jobId: { - nodeName: { - DeviceList: []constant.Device{{ - DeviceID: deviceId, - RankID: rankID, - }}, - ServerName: nodeName, - }, - }, - }, - } - originalDeviceCm = &constant.DeviceInfo{ - CmName: cmName, - DeviceInfoNoName: constant.DeviceInfoNoName{ - DeviceList: map[string]string{ - "huawei.com/Ascend910-Fault": ` -[ - { - "fault_type": "CardUnhealthy", - "fault_code": "80E01801 , 80C98009 ", - "fault_time_and_level_map": - { - "80E01801": {"fault_time":100000, "fault_level": "RestartBusiness"}, - "80C98009": {"fault_time":120000, "fault_level": "NotHandleFault"} - },"npu_name": "Ascend910-0" - } -]`, - }, - }, - } -) - -func TestMain(m *testing.M) { - hwlog.InitRunLogger(&hwlog.LogConfig{OnlyToStdout: true}, nil) - m.Run() -} - -func TestSplitDeviceFault(t *testing.T) { - t.Run("TestSplitDeviceFault", func(t *testing.T) { - npuName := "Ascend910-0" - var faultInfo = constant.DeviceFault{ - NPUName: npuName, - FaultCode: "0x1,0x2", - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - "0x1": {FaultLevel: constant.NotHandleFault, FaultTime: 1}, - "0x2": {FaultLevel: constant.SubHealthFault, FaultTime: 1}, - }, - } - - got := splitDeviceFault(faultInfo, "node1") - want := []constant.DeviceFault{ - { - NPUName: npuName, - FaultCode: "0x1", - FaultLevel: constant.NotHandleFault, - LargeModelFaultLevel: constant.NotHandleFault, - FaultHandling: constant.NotHandleFault, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - "0x1": {FaultLevel: constant.NotHandleFault, FaultTime: 1}, - }, - }, { - NPUName: npuName, - FaultCode: "0x2", - FaultLevel: constant.SubHealthFault, - LargeModelFaultLevel: constant.SubHealthFault, - FaultHandling: constant.SubHealthFault, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - "0x2": {FaultLevel: constant.SubHealthFault, FaultTime: 1}, - }, - }, - } - - if !reflect.DeepEqual(got, want) { - t.Errorf("splitDeviceFault() = %v, want %v", got, want) - } - }) -} - -// TestSplitDeviceFaultWithManuallySeparateFaultLevel should split out a DeviceFault as middle data, -// when dp report constant.ManuallySeparateNPU -func TestSplitDeviceFaultWithManuallySeparateFaultLevel(t *testing.T) { - t.Run("TestSplitDeviceFaultWithManuallySeparateFaultLevel", func(t *testing.T) { - npuName := "Ascend910-0" - var faultInfo = constant.DeviceFault{ - NPUName: npuName, - FaultCode: "", - FaultLevel: constant.ManuallySeparateNPU, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{}, - } - - got := splitDeviceFault(faultInfo, "node1") - want := []constant.DeviceFault{ - { - NPUName: npuName, - FaultCode: constant.ManuallySeparateNPU, - FaultLevel: constant.ManuallySeparateNPU, - LargeModelFaultLevel: constant.ManuallySeparateNPU, - FaultHandling: constant.ManuallySeparateNPU, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - constant.ManuallySeparateNPU: { - FaultTime: constant.UnknownFaultTime, - FaultLevel: constant.ManuallySeparateNPU, - }, - }, - }, - } - - if !reflect.DeepEqual(got, want) { - t.Errorf("splitDeviceFault() = %v, want %v", got, want) - } - }) -} - -// TestMergeSameTypeDeviceFault should be merged, when fault type is same -func TestMergeSameTypeDeviceFault(t *testing.T) { - t.Run("Test_mergeDeviceFault", func(t *testing.T) { - npuName := "Ascend910-0" - split := []constant.DeviceFault{ - { - FaultType: constant.CardUnhealthy, - NPUName: npuName, - FaultCode: "0x1", - FaultLevel: constant.NotHandleFault, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - "0x1": {FaultLevel: constant.NotHandleFault, FaultTime: 1}, - "0x2": {FaultLevel: constant.SubHealthFault, FaultTime: 1}, - }, - }, - { - FaultType: constant.CardUnhealthy, - NPUName: npuName, - FaultCode: "0x2", - FaultLevel: constant.SubHealthFault, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - "0x1": {FaultLevel: constant.NotHandleFault, FaultTime: 1}, - "0x2": {FaultLevel: constant.SubHealthFault, FaultTime: 1}, - }, - }, - } - want := []constant.DeviceFault{ - { - FaultType: constant.CardUnhealthy, - NPUName: npuName, - FaultCode: "0x1,0x2", - FaultLevel: constant.SubHealthFault, - LargeModelFaultLevel: constant.SubHealthFault, - FaultHandling: constant.SubHealthFault, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - "0x1": {FaultLevel: constant.NotHandleFault, FaultTime: 1}, - "0x2": {FaultLevel: constant.SubHealthFault, FaultTime: 1}, - }, - }, - } - got, err := mergeDeviceFault(split) - if err != nil { - t.Errorf("mergeDeviceFault() error = %v", err) - } - if !reflect.DeepEqual(got, want) { - t.Errorf("mergeDeviceFault() got = %v, want %v", util.ObjToString(got), util.ObjToString(want)) - } - }) -} - -// TestMergeDifferentTypeDeviceFault should not be merged, when fault type isn't same -func TestMergeDifferentTypeDeviceFault(t *testing.T) { - t.Run("Test_mergeDeviceFault", func(t *testing.T) { - npuName := "Ascend910-0" - split := []constant.DeviceFault{ - { - FaultType: constant.CardUnhealthy, - NPUName: npuName, - FaultCode: "0x1", - FaultLevel: constant.NotHandleFault, - LargeModelFaultLevel: constant.NotHandleFault, - FaultHandling: constant.NotHandleFault, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - "0x1": {FaultLevel: constant.NotHandleFault, FaultTime: 1}, - }, - }, - { - FaultType: constant.CardNetworkUnhealthy, - NPUName: npuName, - FaultCode: "0x2", - FaultLevel: constant.SubHealthFault, - LargeModelFaultLevel: constant.SubHealthFault, - FaultHandling: constant.SubHealthFault, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - "0x2": {FaultLevel: constant.SubHealthFault, FaultTime: 1}, - }, - }, - } - got, err := mergeDeviceFault(split) - if err != nil { - t.Errorf("mergeDeviceFault() error = %v", err) - } - sort.Slice(got, func(i, j int) bool { - return got[i].FaultType > got[j].FaultType - }) - if !reflect.DeepEqual(got, split) { - t.Errorf("mergeDeviceFault() got = %v, want %v", util.ObjToString(got), util.ObjToString(split)) - } - }) -} - -// TestMergeManuallySeparateNPUTypeDeviceFault should combine other fault info and constant.ManuallySeparateNPU. -func TestMergeManuallySeparateNPUTypeDeviceFault(t *testing.T) { - t.Run("TestMergeManuallySeparateNPUTypeDeviceFault", func(t *testing.T) { - npuName := "Ascend910-0" - split := []constant.DeviceFault{ - { - FaultType: constant.CardUnhealthy, - NPUName: npuName, - FaultCode: constant.ManuallySeparateNPU, - FaultLevel: constant.ManuallySeparateNPU, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{}, - }, - { - FaultType: constant.CardUnhealthy, - NPUName: npuName, - FaultCode: constant.UceFaultCode, - FaultLevel: constant.RestartBusiness, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - constant.UceFaultCode: { - FaultTime: constant.UnknownFaultTime, - FaultLevel: constant.RestartBusiness, - }, - }, - }, - } - got, err := mergeDeviceFault(split) - if err != nil { - t.Errorf("mergeDeviceFault() error = %v", err) - } - want := []constant.DeviceFault{ - { - FaultType: constant.CardUnhealthy, - NPUName: npuName, - FaultCode: constant.UceFaultCode, - FaultLevel: constant.ManuallySeparateNPU, - LargeModelFaultLevel: constant.ManuallySeparateNPU, - FaultHandling: constant.ManuallySeparateNPU, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - constant.UceFaultCode: { - FaultTime: constant.UnknownFaultTime, - FaultLevel: constant.RestartBusiness, - }, - }, - }, - } - if !reflect.DeepEqual(got, want) { - t.Error("TestMergeManuallySeparateNPUTypeDeviceFault fail") - } - }) -} - -// TestGetAdvanceDeviceCm should get advanceDeviceCm from originalDeviceCm -func TestGetAdvanceDeviceCm(t *testing.T) { - advanceDeviceCm := GetAdvanceDeviceCm(originalDeviceCm) - if len(advanceDeviceCm.FaultDeviceList[deviceName]) != originalDeviceFaultCodeCnt { - t.Errorf("TestGetAdvanceDeviceCm failed") - return - } - faultTimeAndLevel, ok := advanceDeviceCm.FaultDeviceList[deviceName][0].FaultTimeAndLevelMap[constant.UceFaultCode] - if !ok || faultTimeAndLevel.FaultTime != time100Seconds || - faultTimeAndLevel.FaultLevel != constant.RestartBusiness { - t.Errorf("TestGetAdvanceDeviceCm failed") - return - } -} - -// TestValidBusinessUceReportInfo valid business uce report info -func TestValidBusinessUceReportInfo(t *testing.T) { - t.Run("TestValidBusinessUceReportInfo", func(t *testing.T) { - reportInfo := &constant.ReportInfo{ - RecoverTime: time100Seconds - time1Seconds, - CompleteTime: 0, - } - mockTime := time.Time{} - mockUnixMilli := gomonkey.ApplyPrivateMethod(mockTime, "UnixMilli", func() int64 { - return time100Seconds - }) - mockNow := gomonkey.ApplyFunc(time.Now, func() time.Time { - return mockTime - }) - defer func() { - mockNow.Reset() - mockUnixMilli.Reset() - }() - if !ValidBusinessUceReportInfo(reportInfo) { - t.Error("TestValidBusinessUceReportInfo fail") - } - reportInfo.RecoverTime = 0 - if ValidBusinessUceReportInfo(reportInfo) { - t.Error("TestValidBusinessUceReportInfo fail") - } - }) -} - -// TestCanDoStepRetry check uceDeviceInfo can do step retry -func TestCanDoStepRetry(t *testing.T) { - uceDeviceInfo := &constant.UceDeviceInfo{ - DeviceName: deviceName, - FaultTime: time100Seconds, - RecoverTime: time100Seconds + time1Seconds, - CompleteTime: 0, - } - t.Run("TestCanDoStepRetry", func(t *testing.T) { - mockTime := time.Time{} - mockUnixMilli := gomonkey.ApplyPrivateMethod(mockTime, "UnixMilli", func() int64 { - return time120Seconds - }) - mockNow := gomonkey.ApplyFunc(time.Now, func() time.Time { - return mockTime - }) - defer func() { - mockNow.Reset() - mockUnixMilli.Reset() - }() - if !CanDoStepRetry(uceDeviceInfo) { - t.Error("TestCanDoStepRetry fail") - } - }) -} - -// TestGetContainedElementIdx should return id of the item from slice -func TestGetContainedElementIdx(t *testing.T) { - arr := []string{"1", "2"} - t.Run("TestGetContainedElementIdx", func(t *testing.T) { - if got := GetContainedElementIdx("1", arr); got != 0 { - t.Error("GetContainedElementIdx() fail") - } - if got := GetContainedElementIdx("3", arr); got != -1 { - t.Error("GetContainedElementIdx() fail") - } - }) -} - -// TestGetFaultTime should return fault time from DeviceFault -func TestGetFaultTime(t *testing.T) { - fault := constant.DeviceFault{ - FaultCode: constant.UceFaultCode, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - constant.UceFaultCode: { - FaultTime: time100Seconds, - FaultLevel: constant.RestartBusiness, - }, - }, - } - t.Run("TestGetFaultTime", func(t *testing.T) { - if got := GetFaultTime(fault, ""); got != time100Seconds { - t.Error("GetFaultTime fail") - } - fault.FaultTimeAndLevelMap = make(map[string]constant.FaultTimeAndLevel) - if got := GetFaultTime(fault, ""); got != constant.DeviceNotFault { - t.Error("GetFaultTime fail") - } - }) -} - -// TestFaultCodeJudge check fault code is right -func TestFaultCodeJudge(t *testing.T) { - t.Run("TestFaultCodeJudgeAic", func(t *testing.T) { - if got := IsUceAccompanyFault(constant.AicFaultCode); got == false { - t.Error("TestFaultCodeJudgeAic fail") - } - }) - t.Run("TestFaultCodeJudgeLinkDownFault", func(t *testing.T) { - if got := IsLinkDownFault(constant.LinkDownFaultCode); got == false { - t.Error("TestFaultCodeJudgeLinkDownFault fail") - } - }) - t.Run("TestFaultCodeJudgeCqeFault", func(t *testing.T) { - if got := IsCqeFault(constant.DevCqeFaultCode); got == false { - t.Error("TestFaultCodeJudgeCqeFault fail") - } - }) - t.Run("TestFaultCodeJudgeUceFault", func(t *testing.T) { - if got := IsUceFault(constant.UceFaultCode); got == false { - t.Error("TestFaultCodeJudgeUceFault fail") - } - }) -} - -// TestAddFaultAndDeleteFaultMap should add or delete fault right -func TestAddFaultAndDeleteFaultMap(t *testing.T) { - addFault := constant.DeviceFault{ - NPUName: deviceName, - } - t.Run("TestAddFaultIntoFaultMap", func(t *testing.T) { - faultMap := AddFaultIntoFaultMap(nil, addFault) - if len(faultMap[addFault.NPUName]) != 1 { - t.Error("TestAddFaultIntoFaultMap fail") - } - }) - t.Run("TestDeleteFaultFromFaultMap", func(t *testing.T) { - faultMap := DeleteFaultFromFaultMap(nil, addFault) - if len(faultMap[addFault.NPUName]) != 0 { - t.Error("TestDeleteFaultFromFaultMap fail") - } - faultMap = AddFaultIntoFaultMap(nil, addFault) - faultMap = DeleteFaultFromFaultMap(faultMap, addFault) - if len(faultMap[addFault.NPUName]) != 0 { - t.Error("TestDeleteFaultFromFaultMap fail") - } - }) -} - -// TestGetAdvanceDeviceCmForNodeMap should get AdvanceDeviceCm -func TestGetAdvanceDeviceCmForNodeMap(t *testing.T) { - deviceInfoCms := map[string]*constant.DeviceInfo{ - cmName: originalDeviceCm, - } - t.Run("TestGetAdvanceDeviceConfigmap", func(t *testing.T) { - got := GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](deviceInfoCms) - if len(got[nodeName].FaultDeviceList[deviceName]) != originalDeviceFaultCodeCnt { - t.Error("TestGetAdvanceDeviceConfigmap fail") - } - }) -} - -// TestGetNodeAndDeviceFromJobIdAndRankId should return right node and device according to the jobId and rankID -func TestGetNodeAndDeviceFromJobIdAndRankId(t *testing.T) { - t.Run("TestGetNodeAndDeviceFromJobIdAndRankId", func(t *testing.T) { - serverName, device, err := GetNodeAndDeviceFromJobIdAndRankId(jobId, rankID, jobServerMap) - if serverName != nodeName || device != deviceId || err != nil { - t.Error("TestGetNodeAndDeviceFromJobIdAndRankId fail") - } - }) -} - -// TestIsNodeReady check node is ready -func TestIsNodeReady(t *testing.T) { - node := &v1.Node{ - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{{ - Type: v1.NodeReady, - Status: v1.ConditionTrue, - }}, - }, - } - t.Run("TestIsNodeReady", func(t *testing.T) { - if !IsNodeReady(node) { - t.Error("TestIsNodeReady fail") - } - }) -} - -func getCardNotHandleFaults() []constant.DeviceFault { - return append([]constant.DeviceFault{}, constant.DeviceFault{ - FaultType: constant.CardUnhealthy, - FaultLevel: constant.NotHandleFault, - }) -} - -func getCardNotHandleAndPublicFaultSeparateNPU() []constant.DeviceFault { - return append([]constant.DeviceFault{}, constant.DeviceFault{ - FaultType: constant.CardUnhealthy, - FaultLevel: constant.NotHandleFault, - }, constant.DeviceFault{ - FaultType: constant.PublicFaultType, - FaultLevel: constant.SeparateNPU, - }) -} - -func getCardNetworkNotHandleAndPublicFaultSeparateNPU() []constant.DeviceFault { - return append([]constant.DeviceFault{}, constant.DeviceFault{ - FaultType: constant.CardNetworkUnhealthy, - FaultLevel: constant.NotHandleFault, - }, constant.DeviceFault{ - FaultType: constant.PublicFaultType, - FaultLevel: constant.SeparateNPU, - }) -} - -func getCardNotHandleAndPublicFaultSubHealth() []constant.DeviceFault { - return append([]constant.DeviceFault{}, constant.DeviceFault{ - FaultType: constant.CardUnhealthy, - FaultLevel: constant.NotHandleFault, - }, constant.DeviceFault{ - FaultType: constant.PublicFaultType, - FaultLevel: constant.SubHealthFault, - }) -} - -// TestIsFaultDeletable check faults in specified fault type are NotHandleFault and SubHealthFault -func TestIsFaultDeletable(t *testing.T) { - t.Run("TestIsFaultDeletable", func(t *testing.T) { - deletableFaultLevels := []string{constant.NotHandleFault, constant.SubHealthFault} - faults := getCardNotHandleFaults() - if !isFaultDeletable(faults, []string{constant.CardUnhealthy}, deletableFaultLevels) { - t.Error("when only NotHandleFault in CardUnhealthy then should remove") - } - if !isFaultDeletable(faults, []string{constant.CardNetworkUnhealthy}, deletableFaultLevels) { - t.Error("when no fault in CardNetworkUnhealthy then should remove from CardNetworkUnhealthy") - } - - faults = getCardNotHandleAndPublicFaultSeparateNPU() - if isFaultDeletable(faults, []string{constant.CardUnhealthy, constant.PublicFaultType}, deletableFaultLevels) { - t.Error("when PublicFaultType is SeparateNPU then should not remove from CardUnhealthy") - } - - faults = getCardNetworkNotHandleAndPublicFaultSeparateNPU() - if !isFaultDeletable(faults, []string{constant.CardNetworkUnhealthy}, deletableFaultLevels) { - t.Error("when PublicFaultType is SeparateNPU and CardNetworkUnhealthy is NotHandleFault " + - "then should remove from CardNetworkUnhealthy") - } - - faults = append([]constant.DeviceFault{}, constant.DeviceFault{FaultType: constant.PublicFaultType}) - if isFaultDeletable(faults, []string{constant.CardUnhealthy, constant.PublicFaultType}, deletableFaultLevels) { - t.Error("when PublicFaultType is SeparateNPU then should not remove from CardUnhealthy") - } - faults = make([]constant.DeviceFault, 0) - if !isFaultDeletable(faults, []string{constant.CardUnhealthy, constant.PublicFaultType}, deletableFaultLevels) { - t.Error("when no faults then should remove from CardUnhealthy") - } - faults = getCardNotHandleAndPublicFaultSubHealth() - if !isFaultDeletable(faults, []string{constant.CardUnhealthy, constant.PublicFaultType}, deletableFaultLevels) { - t.Error("when SubHealthFault and NotHandleFault faults then should remove from CardUnhealthy") - } - }) -} - -// TestGetAdvanceFaultForNode test get advance fault info for node -func TestGetAdvanceFaultForNode(t *testing.T) { - patches := gomonkey.NewPatches() - defer patches.Reset() - - convey.Convey("Test GetAdvanceFaultForNode", t, func() { - convey.Convey("Case 1: DeviceInfo input", func() { - deviceInfo := &constant.DeviceInfo{} - expected := &constant.AdvanceDeviceFaultCm{} - - patches.ApplyFunc(GetAdvanceDeviceCm, func(*constant.DeviceInfo) *constant.AdvanceDeviceFaultCm { - return expected - }) - - result := GetAdvanceFaultForNode(deviceInfo) - convey.So(result, convey.ShouldEqual, expected) - }) - - convey.Convey("Case 2: NodeInfo input", func() { - nodeInfo := &constant.NodeInfo{} - result := GetAdvanceFaultForNode(nodeInfo) - convey.So(result, convey.ShouldEqual, nodeInfo) - }) - - convey.Convey("Case 3: SwitchInfo input", func() { - switchInfo := &constant.SwitchInfo{} - result := GetAdvanceFaultForNode(switchInfo) - convey.So(result, convey.ShouldEqual, switchInfo) - }) - - convey.Convey("Case 4: AdvanceDeviceFaultCm input", func() { - advanceCm := &constant.AdvanceDeviceFaultCm{} - result := GetAdvanceFaultForNode(advanceCm) - convey.So(result, convey.ShouldEqual, advanceCm) - }) - }) -} - -func TestAdvanceFaultCmToOriginalFaultCm(t *testing.T) { - convey.Convey("Test AdvanceFaultCmToOriginalFaultCm", t, func() { - node1 := "node1" - node2 := "node2" - mockAdvanceCm1 := &constant.AdvanceDeviceFaultCm{ - DeviceType: "", - CmName: "CmName-" + node1, - SuperPodID: 0, - ServerIndex: 0, - FaultDeviceList: make(map[string][]constant.DeviceFault), - AvailableDeviceList: []string{"xxx"}, - Recovering: []string{"xxx"}, - CardUnHealthy: []string{"xxx"}, - NetworkUnhealthy: []string{"xxx"}, - UpdateTime: 0, - } - mockAdvanceCm2 := new(constant.AdvanceDeviceFaultCm) - util.DeepCopy(mockAdvanceCm1, mockAdvanceCm2) - mockAdvanceCm2.CmName = "CmName-" + node2 - - convey.Convey("should convert map correctly", func() { - input := map[string]constant.ConfigMapInterface{ - node1: mockAdvanceCm1, - node2: mockAdvanceCm2, - } - - result := AdvanceFaultCmToOriginalFaultCm[*constant.DeviceInfo](input) - - convey.So(len(result), convey.ShouldEqual, len(input)) - }) - }) -} - -func TestGetSortedKeys(t *testing.T) { - patches := gomonkey.NewPatches() - defer patches.Reset() - - convey.Convey("Test getSortedKeys", t, func() { - convey.Convey("should return sorted keys for string-struct map", func() { - type testStruct struct{ val int } - input := map[string]testStruct{ - "zebra": {1}, - "lion": {2}, - "ape": {3}, - } - expected := []string{"ape", "lion", "zebra"} - result := getSortedKeys(input) - convey.So(result, convey.ShouldResemble, expected) - }) - - convey.Convey("should handle empty map", func() { - input := map[string]float64{} - result := getSortedKeys(input) - convey.So(result, convey.ShouldBeEmpty) - }) - }) -} - -func TestCompareFaultTimeAndLevel(t *testing.T) { - convey.Convey("should compare by FaultTime first", t, func() { - a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "low"} - b := constant.FaultTimeAndLevel{FaultTime: 200, FaultLevel: "high"} - convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldBeLessThan, 0) - }) - - convey.Convey("should compare by FaultLevel when FaultTime equal", t, func() { - a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "low"} - b := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "high"} - convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldBeGreaterThan, 0) - }) - - convey.Convey("should return 0 when both equal", t, func() { - a := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "medium"} - b := constant.FaultTimeAndLevel{FaultTime: 100, FaultLevel: "medium"} - convey.So(compareFaultTimeAndLevel(a, b), convey.ShouldEqual, 0) - }) -} - -func TestCompareDeviceFault(t *testing.T) { - convey.Convey("Test compareDeviceFault", t, func() { - baseFault := constant.DeviceFault{ - FaultType: constant.CardUnhealthy, - NPUName: "npu0", - LargeModelFaultLevel: constant.SubHealthFault, - FaultLevel: constant.SubHealthFault, - FaultHandling: constant.SubHealthFault, - FaultCode: constant.AicFaultCode, - FaultTimeAndLevelMap: map[string]constant.FaultTimeAndLevel{ - constant.AicFaultCode: { - FaultTime: 0, - FaultLevel: constant.SubHealthFault, - }, - }, - } - - convey.Convey("should compare by FaultType first", func() { - f1 := baseFault - f2 := baseFault - convey.So(compareDeviceFault(f1, f2), convey.ShouldEqual, 0) - }) - - convey.Convey("should compare by NPUName when FaultType equal", func() { - f1 := baseFault - f2 := baseFault - f2.NPUName = "npu1" - convey.So(compareDeviceFault(f1, f2), convey.ShouldBeLessThan, 0) - }) - - convey.Convey("should compare FaultTimeAndLevelMap when all fields equal", func() { - f1 := baseFault - f1.FaultTimeAndLevelMap = map[string]constant.FaultTimeAndLevel{ - "key1": {FaultTime: 100, FaultLevel: "low"}, - } - f2 := baseFault - f2.FaultTimeAndLevelMap = map[string]constant.FaultTimeAndLevel{ - "key1": {FaultTime: 200, FaultLevel: "low"}, - } - convey.So(compareDeviceFault(f1, f2), convey.ShouldBeLessThan, 0) - }) - }) -} - -func TestSortDataForAdvanceDeviceInfo(t *testing.T) { - deviceInfo := &constant.AdvanceDeviceFaultCm{ - AvailableDeviceList: []string{"d3", "d1", "d2"}, - CardUnHealthy: []string{"c2", "c1"}, - NetworkUnhealthy: []string{"n2", "n1"}, - Recovering: []string{"r2", "r1"}, - FaultDeviceList: map[string][]constant.DeviceFault{ - "list1": {{FaultType: "typeB"}, {FaultType: "typeA"}}, - }, - } - - expDeviceInfo := &constant.AdvanceDeviceFaultCm{ - AvailableDeviceList: []string{"d1", "d2", "d3"}, - CardUnHealthy: []string{"c1", "c2"}, - NetworkUnhealthy: []string{"n1", "n2"}, - Recovering: []string{"r1", "r2"}, - FaultDeviceList: map[string][]constant.DeviceFault{ - "list1": {{FaultType: "typeA"}, {FaultType: "typeB"}}, - }, - } - convey.Convey("should be equal", t, func() { - SortDataForAdvanceDeviceInfo(deviceInfo) - convey.So(deviceInfo, convey.ShouldResemble, expDeviceInfo) - }) -} - -func TestMergeCode(t *testing.T) { - patches := gomonkey.NewPatches() - defer patches.Reset() - convey.Convey("Test mergeCode", t, func() { - testDevice := "device1" - orgFaults := []constant.DeviceFault{{FaultCode: "1001"}} - mergedFaults := []constant.DeviceFault{{FaultCode: "merged"}} - advanceCm := &constant.AdvanceDeviceFaultCm{ - FaultDeviceList: map[string][]constant.DeviceFault{ - testDevice: orgFaults, - }, - } - patches.ApplyFunc(mergeDeviceFault, func([]constant.DeviceFault) ([]constant.DeviceFault, error) { - return mergedFaults, nil - }) - convey.Convey("should skip empty fault lists", func() { - mergeCode(advanceCm) - convey.So(advanceCm.FaultDeviceList[testDevice], convey.ShouldResemble, mergedFaults) - }) - }) -} - -func TestFixUnhealthyInfo(t *testing.T) { - patches := gomonkey.NewPatches() - defer patches.Reset() - convey.Convey("Test FixUnhealthyInfo", t, func() { - baseCm := &constant.AdvanceDeviceFaultCm{ - AvailableDeviceList: []string{"device0", "device1", "device2", "device3", "device4", "device5", "device6"}, - CardUnHealthy: []string{"device1", "device2", "device3", "device4"}, - NetworkUnhealthy: []string{"device1", "device2", "device3"}, - FaultDeviceList: map[string][]constant.DeviceFault{ - "device1": { - {FaultType: constant.CardUnhealthy, FaultLevel: constant.NotHandleFault}, - }, - "device2": { - {FaultType: constant.CardNetworkUnhealthy, FaultLevel: constant.SubHealthFault}, - }, - "device3": { - {FaultType: constant.PublicFaultType, FaultLevel: constant.SubHealthFault}, - }, - "device4": { - {FaultType: constant.PublicFaultType, FaultLevel: constant.SeparateNPU}, - }, - "device5": { - {FaultType: constant.CardNetworkUnhealthy, FaultLevel: constant.SeparateNPU}, - }, - }, - } - convey.Convey("should remove from unhealthy", func() { - FixUnhealthyInfo(baseCm) - convey.So(baseCm.CardUnHealthy, convey.ShouldResemble, []string{"device4"}) - convey.So(baseCm.NetworkUnhealthy, convey.ShouldResemble, []string{"device5"}) - convey.So(baseCm.AvailableDeviceList, convey.ShouldResemble, - []string{"device0", "device1", "device2", "device3", "device6"}) - }) - }) -} -- Gitee From b2a649581e51874bf65d332984403e3347cf354f Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Mon, 28 Apr 2025 10:23:10 +0800 Subject: [PATCH 24/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20remove=20go=20mod?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/clusterd/go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/component/clusterd/go.mod b/component/clusterd/go.mod index eba3d3abe..76e7dc4b1 100644 --- a/component/clusterd/go.mod +++ b/component/clusterd/go.mod @@ -7,7 +7,6 @@ require ( github.com/agiledragon/gomonkey/v2 v2.8.0 github.com/fsnotify/fsnotify v1.6.0 github.com/golang/protobuf v1.5.3 - github.com/pkg/errors v0.9.1 github.com/smartystreets/goconvey v1.7.2 github.com/stretchr/testify v1.8.0 golang.org/x/time v0.3.0 @@ -42,6 +41,7 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/smartystreets/assertions v1.2.0 // indirect github.com/spf13/pflag v1.0.5 // indirect -- Gitee From 50e27e074cdb61d5d95e0d5ff07dd4857034b6f2 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Mon, 28 Apr 2025 14:33:02 +0800 Subject: [PATCH 25/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20=E6=A3=80=E8=A7=86=E6=84=8F?= =?UTF-8?q?=E8=A7=81=E8=A7=A3=E5=86=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/clusterd/go.mod | 2 +- .../pkg/application/resource/report.go | 4 ++-- .../clusterd/pkg/common/constant/methods.go | 20 +++++++++---------- .../pkg/domain/faultdomain/fault_utils.go | 12 +++++------ 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/component/clusterd/go.mod b/component/clusterd/go.mod index 76e7dc4b1..6045de4d3 100644 --- a/component/clusterd/go.mod +++ b/component/clusterd/go.mod @@ -1,6 +1,6 @@ module clusterd -go 1.20 +go 1.21 require ( ascend-common v0.0.0 diff --git a/component/clusterd/pkg/application/resource/report.go b/component/clusterd/pkg/application/resource/report.go index 8dedc21b5..eb3051659 100644 --- a/component/clusterd/pkg/application/resource/report.go +++ b/component/clusterd/pkg/application/resource/report.go @@ -66,7 +66,7 @@ func Report(ctx context.Context) { }) switch whichToReport { case constant.DeviceProcessType: - deviceArr := device.GetSafeData(faultdomain.AdvanceFaultCmToOriginalFaultCm[*constant.DeviceInfo]( + deviceArr := device.GetSafeData(faultdomain.AdvanceFaultMapToOriginalFaultMap[*constant.DeviceInfo]( faultmanager.QueryDeviceInfoToReport())) updateDeviceInfoCm(deviceArr) case constant.NodeProcessType: @@ -76,7 +76,7 @@ func Report(ctx context.Context) { switchArr := switchinfo.GetSafeData(faultmanager.QuerySwitchInfoToReport()) updateSwitchInfoCm(switchArr) case constant.AllProcessType: - deviceArr := device.GetSafeData(faultdomain.AdvanceFaultCmToOriginalFaultCm[*constant.DeviceInfo]( + deviceArr := device.GetSafeData(faultdomain.AdvanceFaultMapToOriginalFaultMap[*constant.DeviceInfo]( faultmanager.QueryDeviceInfoToReport())) nodeArr := node.GetSafeData(faultmanager.QueryNodeInfoToReport()) switchArr := switchinfo.GetSafeData(faultmanager.QuerySwitchInfoToReport()) diff --git a/component/clusterd/pkg/common/constant/methods.go b/component/clusterd/pkg/common/constant/methods.go index ab3a0f7d8..e637732a0 100644 --- a/component/clusterd/pkg/common/constant/methods.go +++ b/component/clusterd/pkg/common/constant/methods.go @@ -12,16 +12,6 @@ import ( "ascend-common/common-utils/hwlog" ) -func equalDeviceFault(one, other *DeviceFault) bool { - return one.FaultType == other.FaultType && - one.NPUName == other.NPUName && - one.LargeModelFaultLevel == other.LargeModelFaultLevel && - one.FaultLevel == other.FaultLevel && - one.FaultHandling == other.FaultHandling && - one.FaultCode == other.FaultCode && - maps.Equal(one.FaultTimeAndLevelMap, other.FaultTimeAndLevelMap) -} - // IsSame compare two AdvanceDeviceFaultCm, do not care UpdateTime func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { if cm == nil { @@ -206,3 +196,13 @@ func NodeInfoBusinessDataIsNotEqual(oldNodeInfo *NodeInfo, newNodeInfo *NodeInfo hwlog.RunLog.Debug("oldNodeInfo is equal to newNodeInfo") return false } + +func equalDeviceFault(one, other *DeviceFault) bool { + return one.FaultType == other.FaultType && + one.NPUName == other.NPUName && + one.LargeModelFaultLevel == other.LargeModelFaultLevel && + one.FaultLevel == other.FaultLevel && + one.FaultHandling == other.FaultHandling && + one.FaultCode == other.FaultCode && + maps.Equal(one.FaultTimeAndLevelMap, other.FaultTimeAndLevelMap) +} diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index ab0ab18ad..b263ac344 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -300,18 +300,18 @@ func AddFaultIntoFaultMap(faultMap map[string][]constant.DeviceFault, return faultMap } -func AdvanceFaultCmToOriginalFaultCm[U, T constant.ConfigMapInterface](advanceFaultCm map[string]T) map[string]U { +func AdvanceFaultMapToOriginalFaultMap[U, T constant.ConfigMapInterface](advanceFaultCm map[string]T) map[string]U { orgFaultCm := make(map[string]U) for _, advanceCmForNode := range advanceFaultCm { - orgFaultCm[advanceCmForNode.GetCmName()] = AdvanceCmToOriginalCmForNode(advanceCmForNode).(U) + orgFaultCm[advanceCmForNode.GetCmName()] = AdvanceCmToOriginalCm(advanceCmForNode).(U) } return orgFaultCm } -func AdvanceCmToOriginalCmForNode[T constant.ConfigMapInterface](advanceCmForNode T) constant.ConfigMapInterface { +func AdvanceCmToOriginalCm[T constant.ConfigMapInterface](advanceCmForNode T) constant.ConfigMapInterface { switch cm := any(advanceCmForNode).(type) { case *constant.AdvanceDeviceFaultCm: - return AdvanceDevCmToOrigCmForNode(cm) + return AdvanceDevCmToOrigCm(cm) case *constant.SwitchInfo: return cm case *constant.NodeInfo: @@ -322,8 +322,8 @@ func AdvanceCmToOriginalCmForNode[T constant.ConfigMapInterface](advanceCmForNod } } -// AdvanceDevCmToOrigCmForNode convert advance device cm to original format -func AdvanceDevCmToOrigCmForNode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) *constant.DeviceInfo { +// AdvanceDevCmToOrigCm convert advance device cm to original format +func AdvanceDevCmToOrigCm(advanceDeviceCm *constant.AdvanceDeviceFaultCm) *constant.DeviceInfo { orgDeviceCm := &constant.DeviceInfo{ DeviceInfoNoName: constant.DeviceInfoNoName{ DeviceList: make(map[string]string), -- Gitee From f0484d9eaa51134f34359ff0d7be04a8538407d4 Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Mon, 28 Apr 2025 22:56:31 +0800 Subject: [PATCH 26/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20bug=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../faultmanager/cmprocess/base_fault_center.go | 12 +++++++++++- .../cmprocess/publicfault/pub_fault_processor.go | 4 +--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go index ce18ba288..ff450ceb1 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/base_fault_center.go @@ -9,6 +9,7 @@ import ( "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" + "clusterd/pkg/domain/faultdomain" "clusterd/pkg/domain/faultdomain/cmmanager" ) @@ -32,7 +33,16 @@ func newBaseFaultCenter[T constant.ConfigMapInterface](cmManager *cmmanager.Faul func (baseCenter *baseFaultCenter[T]) Process() { updateOriginalCm := baseCenter.updateOriginalCm() - processingCm := baseCenter.getOriginalCm() + origCm := baseCenter.getOriginalCm() + var processingCm map[string]T + if baseCenter.centerType == constant.DeviceProcessType { + processingCm = make(map[string]T) + for cmName, deviceInfo := range origCm { + processingCm[faultdomain.CmNameToNodeName(cmName)] = deviceInfo + } + } else { + processingCm = origCm + } for _, processor := range baseCenter.processorList { info := constant.OneConfigmapContent[T]{ AllConfigmap: processingCm, diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go index a790f7d1b..1d241a04b 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go @@ -5,7 +5,6 @@ package publicfault import ( "strconv" - "strings" "ascend-common/common-utils/hwlog" "clusterd/pkg/common/constant" @@ -45,8 +44,7 @@ func (p *pubFaultProcessor) Process(info any) any { hwlog.RunLog.Errorf("public fault processor process failed, error: %v", err) return processContent } - for devCMName, devCMInfo := range deviceInfos { - nodeName := strings.TrimPrefix(devCMName, constant.DeviceInfoPrefix) + for nodeName, devCMInfo := range deviceInfos { pubFaults, ok := copyFaultCache[nodeName] if !ok { continue -- Gitee From 18aa172fb04b8da3a532e570ed1affdb3d0cae8b Mon Sep 17 00:00:00 2001 From: lirui238 <2396601465@qq.com> Date: Wed, 30 Apr 2025 16:31:45 +0800 Subject: [PATCH 27/27] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91[clusterd]=20bugfix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../publicfault/pub_fault_processor.go | 3 +- .../cmprocess/uce/uce_fault_processor.go | 10 +- .../cmprocess/uce/uce_fault_processor_test.go | 3 +- .../uce_accompany_fault_processor.go | 7 +- .../clusterd/pkg/common/constant/methods.go | 103 ++++++++++++++++++ .../pkg/domain/faultdomain/fault_utils.go | 71 ------------ 6 files changed, 113 insertions(+), 84 deletions(-) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go index 1d241a04b..3a2ce9776 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/publicfault/pub_fault_processor.go @@ -76,12 +76,11 @@ func (p *pubFaultProcessor) faultJoin() { FaultLevel: pubFaultCache.FaultLevel, }}, } - p.devCMInfo.FaultDeviceList[faultDevName] = append(p.devCMInfo.FaultDeviceList[faultDevName], fault) + p.devCMInfo.AddFault(fault) modified = true } } if modified { - faultdomain.FixUnhealthyInfo(p.devCMInfo) faultdomain.SortDataForAdvanceDeviceInfo(p.devCMInfo) } } diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go index fca9afc40..3d8555d44 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor.go @@ -117,7 +117,7 @@ func (processor *uceFaultProcessor) processEachNodeUceFaultInfo( util.ReadableMsTime(uceDevice.RecoverTime)) if processor.canFilterUceDeviceFaultInfo(uceDevice, currentTime) { hwlog.RunLog.Warn("uceFaultProcessor " + log) - deviceInfo.FaultDeviceList = processor.filterUceDeviceFaultInfo(deviceName, deviceInfo.FaultDeviceList) + processor.filterUceDeviceFaultInfo(deviceName, deviceInfo) modified = true } else { hwlog.RunLog.Warn("uceFaultProcessor cannot " + log) @@ -125,21 +125,19 @@ func (processor *uceFaultProcessor) processEachNodeUceFaultInfo( } } if modified { - faultdomain.FixUnhealthyInfo(deviceInfo) faultdomain.SortDataForAdvanceDeviceInfo(deviceInfo) } return deviceInfo } func (processor *uceFaultProcessor) filterUceDeviceFaultInfo( - deviceName string, deviceFaultMap map[string][]constant.DeviceFault) map[string][]constant.DeviceFault { - for _, fault := range deviceFaultMap[deviceName] { + deviceName string, advanceDevInfo *constant.AdvanceDeviceFaultCm) { + for _, fault := range advanceDevInfo.FaultDeviceList[deviceName] { // filter device's uce fault if faultdomain.IsUceFault(fault.FaultCode) { - deviceFaultMap = faultdomain.DeleteFaultFromFaultMap(deviceFaultMap, fault) + advanceDevInfo.DelFault(fault) } } - return deviceFaultMap } func (processor *uceFaultProcessor) canFilterUceDeviceFaultInfo(uceDevice constant.UceDeviceInfo, currentTime int64) bool { diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go index 07dfebca0..1a69daa82 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uce/uce_fault_processor_test.go @@ -332,7 +332,8 @@ func TestUceFaultProcessorProcessUceFaultInfo(t *testing.T) { result := UceProcessor.nodeDeviceCmMap want := faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](expectProcessedDeviceInfos) if !reflect.DeepEqual(result, want) { - t.Errorf("result:\n%v\n\nwant:\n%v", + t.Errorf("orgcm:\n%v\n\nresult:\n%v\n\nwant:\n%v", + util.ObjToString(faultdomain.GetAdvanceFaultCm[*constant.AdvanceDeviceFaultCm](cmDeviceInfos)), util.ObjToString(result), util.ObjToString(want)) } }) diff --git a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go index ea51b804f..16e4a64ef 100644 --- a/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go +++ b/component/clusterd/pkg/application/faultmanager/cmprocess/uceaccompany/uce_accompany_fault_processor.go @@ -127,7 +127,7 @@ func (processor *uceAccompanyFaultProcessor) filterFaultDevice( if processor.isAccompaniedFaultByUce(uceFaultTime, accompanyFaultTime) { hwlog.RunLog.Warnf("filter uce accompany fault %v, fault time: %s", fault, util.ReadableMsTime(accompanyFaultTime)) - deviceFaultCm.FaultDeviceList = faultdomain.DeleteFaultFromFaultMap(deviceFaultCm.FaultDeviceList, fault) + deviceFaultCm.DelFault(fault) modified = true continue } @@ -136,19 +136,18 @@ func (processor *uceAccompanyFaultProcessor) filterFaultDevice( if !processor.isCurrentExceedDiagnosisTimeout(currentTime, accompanyFaultTime) { hwlog.RunLog.Warnf("filter uce accompany like fault %v, fault time: %s", fault, util.ReadableMsTime(accompanyFaultTime)) - deviceFaultCm.FaultDeviceList = faultdomain.DeleteFaultFromFaultMap(deviceFaultCm.FaultDeviceList, fault) + deviceFaultCm.DelFault(fault) modified = true newDeviceFaultQue = append(newDeviceFaultQue, fault) continue } // cannot filter, add the aic/aiv fault into faultMap - deviceFaultCm.FaultDeviceList = faultdomain.AddFaultIntoFaultMap(deviceFaultCm.FaultDeviceList, fault) + deviceFaultCm.AddFault(fault) modified = true hwlog.RunLog.Warnf("cannot filter uce accompany like fault %v, uce fault time: %s", fault, util.ReadableMsTime(uceFaultTime)) } if modified { - faultdomain.FixUnhealthyInfo(deviceFaultCm) faultdomain.SortDataForAdvanceDeviceInfo(deviceFaultCm) } return newDeviceFaultQue diff --git a/component/clusterd/pkg/common/constant/methods.go b/component/clusterd/pkg/common/constant/methods.go index e637732a0..76cbe8016 100644 --- a/component/clusterd/pkg/common/constant/methods.go +++ b/component/clusterd/pkg/common/constant/methods.go @@ -10,8 +10,111 @@ import ( "ascend-common/api" "ascend-common/common-utils/hwlog" + "clusterd/pkg/common/util" ) +var normalFaultLevel = []string{NotHandleFault, SubHealthFault, NormalNPU, NormalNetwork} + +func (cm *AdvanceDeviceFaultCm) addFaultIntoFaultList(addFault DeviceFault) bool { + if cm.FaultDeviceList == nil { + cm.FaultDeviceList = make(map[string][]DeviceFault) + } + if _, ok := cm.FaultDeviceList[addFault.NPUName]; !ok { + cm.FaultDeviceList[addFault.NPUName] = make([]DeviceFault, 0) + } + deviceFaults := cm.FaultDeviceList[addFault.NPUName] + found := false + for _, fault := range deviceFaults { + if equalDeviceFault(&addFault, &fault) { + found = true + break + } + } + if !found { + deviceFaults = append(deviceFaults, addFault) + } + cm.FaultDeviceList[addFault.NPUName] = deviceFaults + return !found +} + +// AddFault add fault in the AdvanceDeviceFaultCm +// If the fault is more than normalFaultLevel, then should add into CardUnHealthy/NetworkUnhealthy +// And remove from AvailableDeviceList +func (cm *AdvanceDeviceFaultCm) AddFault(fault DeviceFault) { + if !cm.addFaultIntoFaultList(fault) { + return + } + if !slices.Contains(normalFaultLevel, fault.FaultLevel) { + if fault.FaultType == CardUnhealthy || fault.FaultType == PublicFaultType { + cm.AvailableDeviceList = util.DeleteStringSliceItem(cm.AvailableDeviceList, fault.NPUName) + if !slices.Contains(cm.CardUnHealthy, fault.NPUName) { + cm.CardUnHealthy = append(cm.CardUnHealthy, fault.NPUName) + } + } else if fault.FaultType == CardNetworkUnhealthy { + cm.AvailableDeviceList = util.DeleteStringSliceItem(cm.AvailableDeviceList, fault.NPUName) + if !slices.Contains(cm.NetworkUnhealthy, fault.NPUName) { + cm.NetworkUnhealthy = append(cm.NetworkUnhealthy, fault.NPUName) + } + } else { + hwlog.RunLog.Errorf("unrecognizable fault type %s", fault.FaultType) + } + } +} + +func (cm *AdvanceDeviceFaultCm) delFaultFromFaultList(delFault DeviceFault) bool { + if cm.FaultDeviceList == nil { + return false + } + if _, ok := cm.FaultDeviceList[delFault.NPUName]; !ok { + return false + } + deviceFaults := cm.FaultDeviceList[delFault.NPUName] + + newDeviceFaults := make([]DeviceFault, 0) + found := false + for _, fault := range deviceFaults { + if equalDeviceFault(&delFault, &fault) { + found = true + continue + } + newDeviceFaults = append(newDeviceFaults, fault) + } + if len(newDeviceFaults) == 0 { + delete(cm.FaultDeviceList, delFault.NPUName) + } else { + cm.FaultDeviceList[delFault.NPUName] = newDeviceFaults + } + return found +} + +// DelFault delete fault in the AdvanceDeviceFaultCm +// Delete fault cannot add npu into AvailableDeviceList, because some job run on the npu +func (cm *AdvanceDeviceFaultCm) DelFault(fault DeviceFault) { + if !cm.delFaultFromFaultList(fault) { + return + } + deviceFaults := cm.FaultDeviceList[fault.NPUName] + delFromCardUnhealthy := true + delFromCardNetworkUnhealthy := true + for _, devFault := range deviceFaults { + if !slices.Contains(normalFaultLevel, devFault.FaultLevel) { + if devFault.FaultType == CardUnhealthy || devFault.FaultType == PublicFaultType { + delFromCardUnhealthy = false + } else if devFault.FaultType == CardNetworkUnhealthy { + delFromCardNetworkUnhealthy = false + } else { + hwlog.RunLog.Errorf("unrecognizable fault type %s", devFault.FaultType) + } + } + } + if delFromCardUnhealthy { + cm.CardUnHealthy = util.DeleteStringSliceItem(cm.CardUnHealthy, fault.NPUName) + } + if delFromCardNetworkUnhealthy { + cm.NetworkUnhealthy = util.DeleteStringSliceItem(cm.NetworkUnhealthy, fault.NPUName) + } +} + // IsSame compare two AdvanceDeviceFaultCm, do not care UpdateTime func (cm *AdvanceDeviceFaultCm) IsSame(another ConfigMapInterface) bool { if cm == nil { diff --git a/component/clusterd/pkg/domain/faultdomain/fault_utils.go b/component/clusterd/pkg/domain/faultdomain/fault_utils.go index b263ac344..c2df068a2 100644 --- a/component/clusterd/pkg/domain/faultdomain/fault_utils.go +++ b/component/clusterd/pkg/domain/faultdomain/fault_utils.go @@ -251,55 +251,6 @@ func mergeDeviceFault(notGroupDeviceFaults []constant.DeviceFault) ([]constant.D return result, nil } -// DeleteFaultFromFaultMap delete fault from faultMap -func DeleteFaultFromFaultMap(faultMap map[string][]constant.DeviceFault, - delFault constant.DeviceFault) map[string][]constant.DeviceFault { - if faultMap == nil { - return make(map[string][]constant.DeviceFault) - } - deviceFaults, ok := faultMap[delFault.NPUName] - if !ok { - return faultMap - } - newDeviceFaults := make([]constant.DeviceFault, 0) - for _, fault := range deviceFaults { - if reflect.DeepEqual(delFault, fault) { - continue - } - newDeviceFaults = append(newDeviceFaults, fault) - } - if len(newDeviceFaults) == 0 { - delete(faultMap, delFault.NPUName) - } else { - faultMap[delFault.NPUName] = newDeviceFaults - } - return faultMap -} - -// AddFaultIntoFaultMap add fault into faultMap -func AddFaultIntoFaultMap(faultMap map[string][]constant.DeviceFault, - addFault constant.DeviceFault) map[string][]constant.DeviceFault { - if faultMap == nil { - faultMap = make(map[string][]constant.DeviceFault) - } - deviceFaults, ok := faultMap[addFault.NPUName] - if !ok { - deviceFaults = make([]constant.DeviceFault, 0) - } - isExisting := false - for _, fault := range deviceFaults { - if reflect.DeepEqual(addFault, fault) { - isExisting = true - break - } - } - if !isExisting { - deviceFaults = append(deviceFaults, addFault) - } - faultMap[addFault.NPUName] = deviceFaults - return faultMap -} - func AdvanceFaultMapToOriginalFaultMap[U, T constant.ConfigMapInterface](advanceFaultCm map[string]T) map[string]U { orgFaultCm := make(map[string]U) for _, advanceCmForNode := range advanceFaultCm { @@ -334,7 +285,6 @@ func AdvanceDevCmToOrigCm(advanceDeviceCm *constant.AdvanceDeviceFaultCm) *const ServerIndex: advanceDeviceCm.ServerIndex, } - FixUnhealthyInfo(advanceDeviceCm) mergeCode(advanceDeviceCm) orgDeviceCm.DeviceList[advanceDeviceCm.GetFaultDeviceListKey()] = @@ -410,27 +360,6 @@ func mergeCode(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { } } -// FixUnhealthyInfo fix the CardUnHealthy/NetworkUnhealthy/AvailableDevice list -func FixUnhealthyInfo(advanceDeviceCm *constant.AdvanceDeviceFaultCm) { - newCardUnHealthy := make([]string, 0) - newNetworkUnhealthy := make([]string, 0) - availableDeviceList := advanceDeviceCm.AvailableDeviceList - for deviceName, faults := range advanceDeviceCm.FaultDeviceList { - deletableFaultLevels := []string{constant.NotHandleFault, constant.SubHealthFault} - if !isFaultDeletable(faults, []string{constant.CardUnhealthy, constant.PublicFaultType}, deletableFaultLevels) { - newCardUnHealthy = append(newCardUnHealthy, deviceName) - availableDeviceList = util.DeleteStringSliceItem(availableDeviceList, deviceName) - } - if !isFaultDeletable(faults, []string{constant.CardNetworkUnhealthy}, deletableFaultLevels) { - newNetworkUnhealthy = append(newNetworkUnhealthy, deviceName) - availableDeviceList = util.DeleteStringSliceItem(availableDeviceList, deviceName) - } - } - advanceDeviceCm.CardUnHealthy = newCardUnHealthy - advanceDeviceCm.NetworkUnhealthy = newNetworkUnhealthy - advanceDeviceCm.AvailableDeviceList = availableDeviceList -} - func getNetworkUnhealthyString(devInfo *constant.DeviceInfo) (string, string) { key := api.ResourceNamePrefix + GetDeviceType(devInfo) + constant.CmCardNetworkUnhealthySuffix return key, devInfo.DeviceList[key] -- Gitee