From 90be648d5e0a87b7ab6697469b4735bff64b8014 Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Sat, 30 Nov 2024 15:22:02 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E6=94=AF=E6=8C=81=E9=80=9A=E8=BF=87configm?= =?UTF-8?q?ap=E6=B3=A8=E5=85=A5=E6=95=85=E9=9A=9C=E4=BF=A1=E6=81=AF?= =?UTF-8?q?=EF=BC=8C=E7=94=A8=E4=BA=8Edebug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/server/FaultDebug.md | 47 ++++ .../pkg/server/fault_constructer.go | 214 ++++++++++++++++++ .../pkg/server/manager.go | 3 + 3 files changed, 264 insertions(+) create mode 100644 component/ascend-device-plugin/pkg/server/FaultDebug.md create mode 100644 component/ascend-device-plugin/pkg/server/fault_constructer.go diff --git a/component/ascend-device-plugin/pkg/server/FaultDebug.md b/component/ascend-device-plugin/pkg/server/FaultDebug.md new file mode 100644 index 000000000..431e0efdd --- /dev/null +++ b/component/ascend-device-plugin/pkg/server/FaultDebug.md @@ -0,0 +1,47 @@ +步骤一:修改源码并编译 + +下载fault_constructer.go文件,并复制到 mindxdl/component/ascend-device-plugin/pkg/server/ 路径下。 + +修改文件 mindxdl/component/ascend-device-plugin/pkg/server/manager.go的代码: + +```golang +func (hdm *HwDevManager) ListenDevice(ctx context.Context) { + ...... + go hdm.pollFaultCodeCM(ctx) + go hdm.Serve(ctx) + // 增加下一行代码,从configmap中读取并处理故障信息 + go hdm.pollFaultEventFromCm(ctx) + ...... 
+} +``` + +参考 mindxdl/component/ascend-device-plugin/README.md 编译并启动ascend-device-plugin。 + +步骤二:创建configmap,注入故障 + +`kubectl create cm mindx-dl-npu-fault-event -n kube-system --from-file="npuFaultCM.json"` + +npuFaultCM.json文件内容示例如下: + +```json + { + "Node": "XXX", // 表示待注入故障节点名为"XXX" + "PollInterval": 1, // 表示每隔1秒,device-plugin读取一次mindx-dl-npu-fault-event + "ReInject": 1, // 是否把"Faults"中的故障写入dp,"1"表示是,"0"表示否;写入dp后会自动置为"0" + "Faults": [ // 故障信息 + { + "EventID":"0x80E21007", // 故障id,可参考faultCode.json文件 + "LogicID":1, // 待注入故障卡的逻辑id + "Severity":0, // + "Assertion":1, // 0代表FaultRecover,1代表FaultOccur,2代表FaultOnce + "TimeOffset":[0, 6] // 表示dp读取故障后,会生成两个故障信息,分别在第0秒和第6秒分别写入dp。 + } + ] +} +``` + +注意:当ReInject="1"时,dp会重新读取Faults故障信息,读取完后,会自动把ReInject修改为"0",若要重新注入新的故障,需要手动修改ReInject为"1",可直接在configmap中修改: + +`kubectl edit cm mindx-dl-npu-fault-event -n kube-system` + +修改ReInject的值并保存后,会重新注入新的故障到dp中。 diff --git a/component/ascend-device-plugin/pkg/server/fault_constructer.go b/component/ascend-device-plugin/pkg/server/fault_constructer.go new file mode 100644 index 000000000..3bc4d4bda --- /dev/null +++ b/component/ascend-device-plugin/pkg/server/fault_constructer.go @@ -0,0 +1,214 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package server holds the implementation of registration to kubelet, k8s pod resource interface. 
+package server + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + "huawei.com/npu-exporter/v6/common-utils/hwlog" + npuCommon "huawei.com/npu-exporter/v6/devmanager/common" + "k8s.io/api/core/v1" + + "Ascend-device-plugin/pkg/common" + "Ascend-device-plugin/pkg/kubeclient" +) + +const ( + // FaultEventCMName name of npu fault event configmap + FaultEventCMName = "mindx-dl-npu-fault-event" + // FaultEventCMNameSpace namespace of npu fault event configmap + FaultEventCMNameSpace = "kube-system" + // FaultEventFileKey key of loading npu faults + FaultEventFileKey = "npuFaultCM.json" + // FaultEventCMPollInterval interval of polling npu fault event configmap + FaultEventCMPollInterval = 1 + // ReInjectAllFaultsDefaultValue default value of re-injecting all faults in configmap + ReInjectAllFaultsDefaultValue = 1 +) + +var ( + devFaultCache []npuCommon.DevFaultInfo +) + +type FaultInfo struct { + EventID string + LogicID int32 + Severity int8 + Assertion int8 + TimeOffset []int64 +} + +type FaultDebugConfig struct { + Node string + PollInterval int64 + ReInject int + Faults []FaultInfo + updateCM bool +} + +func (hdm *HwDevManager) pollFaultEventFromCm(ctx context.Context) { + for { + select { + case _, ok := <-ctx.Done(): + if !ok { + hwlog.RunLog.Info("stop signal channel closed") + } + hwlog.RunLog.Info("poll fault event cm stop") + return + default: + config := hdm.pollAndInjectFault() + + interval := int64(FaultEventCMPollInterval) + if config != nil && config.PollInterval > 0 { + interval = config.PollInterval + } + time.Sleep(time.Duration(interval) * time.Second) + } + } +} + +func (hdm *HwDevManager) pollAndInjectFault() *FaultDebugConfig { + + configMap, err := hdm.manager.GetKubeClient().GetConfigMap(FaultEventCMName, FaultEventCMNameSpace) + if err != nil { + hwlog.RunLog.Errorf("cannot find '%s' configmap, reason: %v", FaultEventCMName, err) + return nil + } + + node, err := kubeclient.GetNodeNameFromEnv() + if err != nil || node == 
"" { + hwlog.RunLog.Errorf("cannot get node from env, reason: %v", err) + return nil + } + + config, err := parseFaultDebugConfigJson(configMap) + if err != nil { + hwlog.RunLog.Error(err) + return nil + } + + if node != config.Node { + hwlog.RunLog.Infof("dont have node '%s' in configmap, target nodes: %s", node, config.Node) + return config + } + + // reset devFaultCache + if config.ReInject == ReInjectAllFaultsDefaultValue { + hdm.updateDevFaultCache(config.Faults) + config.ReInject = 0 + config.updateCM = true + } + + hdm.injectDevFaultToDp() + + if config.PollInterval == -1 { + config.PollInterval = FaultEventCMPollInterval + config.updateCM = true + + } + if config.updateCM { + hdm.updateConfigMap(config, configMap) + } + return config +} + +func (hbm *HwDevManager) updateDevFaultCache(faultInfos []FaultInfo) { + devFaultCache = make([]npuCommon.DevFaultInfo, 0) + now := time.Now() + + // save npu device fault + for _, fault := range faultInfos { + eventId, err := convertFaultCodeHexToInt(fault.EventID) + if err != nil { + hwlog.RunLog.Errorf("get fault code fail, reason: %v", err) + continue + } + if len(fault.TimeOffset) == 0 { + fault.TimeOffset = append(fault.TimeOffset, 0) + } + for _, offset := range fault.TimeOffset { + rasedTime := now.Add(time.Duration(offset) * time.Second) + + devFault := npuCommon.DevFaultInfo{ + EventID: eventId, + LogicID: fault.LogicID, + Severity: fault.Severity, + Assertion: fault.Assertion, + AlarmRaisedTime: rasedTime.UnixMilli(), + } + devFaultCache = append(devFaultCache, devFault) + } + } + hwlog.RunLog.Infof("update cache fault data from configmap '%s' finished, fault count: %d", + FaultEventCMName, len(devFaultCache)) +} + +func (hbm *HwDevManager) injectDevFaultToDp() { + nowTime := time.Now().UnixMilli() + newDevFaultCache := make([]npuCommon.DevFaultInfo, 0) + for _, devFault := range devFaultCache { + if nowTime >= devFault.AlarmRaisedTime { + hwlog.RunLog.Infof("inject dev fault info to dp, data: %v", devFault) + 
common.SaveDevFaultInfo(devFault) + continue + } + newDevFaultCache = append(newDevFaultCache, devFault) + } + devFaultCache = newDevFaultCache +} + +func (hdm *HwDevManager) updateConfigMap(config *FaultDebugConfig, configMap *v1.ConfigMap) { + configBytes, err1 := json.Marshal(*config) + if err1 != nil { + hwlog.RunLog.Errorf("marshal FaultDebugConfig fail, data: %v err: %v", config, err1) + } + configMap.Data[FaultEventFileKey] = string(configBytes) + _, err := hdm.manager.GetKubeClient().UpdateConfigMap(configMap) + if err != nil { + hwlog.RunLog.Errorf("update '%s' configmap fail, err: %v", FaultEventCMName, err) + } +} + +func parseFaultDebugConfigJson(configMap *v1.ConfigMap) (*FaultDebugConfig, error) { + jsonStr, ok := configMap.Data[FaultEventFileKey] + if !ok { + return nil, fmt.Errorf("cannot find data '%s' in CM'", FaultEventFileKey) + } + configInfo := &FaultDebugConfig{ + PollInterval: -1, + ReInject: ReInjectAllFaultsDefaultValue, + updateCM: false, + } + if err := json.Unmarshal([]byte(jsonStr), configInfo); err != nil { + return nil, fmt.Errorf("cannot unmarshal json data '%s' in CM, err: %v", FaultEventFileKey, err) + } + return configInfo, nil +} + +func convertFaultCodeHexToInt(hexStr string) (int64, error) { + if !strings.HasPrefix(hexStr, "0x") { + hexStr = "0x" + hexStr + } + codes := common.StringTool.HexStringToInt([]string{hexStr}) + if len(codes) == 0 { + return -1, fmt.Errorf("convert fault code hex string '%s' to int failed", hexStr) + } + return codes[0], nil +} diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index 7681bc372..7f9eeb153 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -378,6 +378,9 @@ func (hdm *HwDevManager) ListenDevice(ctx context.Context) { hdm.separateNPUIDFromDeviceInfoIntoCache() go hdm.pollFaultCodeCM(ctx) go hdm.Serve(ctx) + + go hdm.pollFaultEventFromCm(ctx) + 
if common.ParamOption.CheckCachedPods { go hdm.manager.GetKubeClient().PodInformerInspector(ctx) } -- Gitee From 780df38ea4a1c61fbeab3df8c081d1c143637ace Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Sat, 30 Nov 2024 18:15:37 +0800 Subject: [PATCH 2/5] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=A3=80=E8=A7=86?= =?UTF-8?q?=E6=84=8F=E8=A7=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/server/FaultDebug.md | 62 +++++++---- .../pkg/server/fault_constructer.go | 103 +++++++++++------- .../pkg/server/manager.go | 2 +- 3 files changed, 107 insertions(+), 60 deletions(-) diff --git a/component/ascend-device-plugin/pkg/server/FaultDebug.md b/component/ascend-device-plugin/pkg/server/FaultDebug.md index 431e0efdd..c8205e425 100644 --- a/component/ascend-device-plugin/pkg/server/FaultDebug.md +++ b/component/ascend-device-plugin/pkg/server/FaultDebug.md @@ -1,47 +1,67 @@ -步骤一:修改源码并编译 +# 使用方法 -下载fault_constructer.go文件,并复制到 mindxdl/component/ascend-device-plugin/pkg/server/ 路径下。 +## 修改源码 -修改文件 mindxdl/component/ascend-device-plugin/pkg/server/manager.go的代码: +下载**fault_constructer.go**文件,并复制到 _mindxdl/component/ascend-device-plugin/pkg/server/_ 路径下。 + +修改文件 _mindxdl/component/ascend-device-plugin/pkg/server/manager.go_,在ListenDevice方法中增加一行代码: ```golang func (hdm *HwDevManager) ListenDevice(ctx context.Context) { ...... - go hdm.pollFaultCodeCM(ctx) go hdm.Serve(ctx) // 增加下一行代码,从configmap中读取并处理故障信息 - go hdm.pollFaultEventFromCm(ctx) + hdm.constructNpuFaultByCm(ctx) ...... 
 } ``` -参考 mindxdl/component/ascend-device-plugin/README.md 编译并启动ascend-device-plugin。 - -步骤二:创建configmap,注入故障 +## 创建configmap `kubectl create cm mindx-dl-npu-fault-event -n kube-system --from-file="npuFaultCM.json"` -npuFaultCM.json文件内容示例如下: +_npuFaultCM.json_ 文件内容示例如下: ```json { - "Node": "XXX", // 表示待注入故障节点名为"XXX" - "PollInterval": 1, // 表示每隔1秒,device-plugin读取一次mindx-dl-npu-fault-event - "ReInject": 1, // 是否把"Faults"中的故障写入dp,"1"表示是,"0"表示否;写入dp后会自动置为"0" - "Faults": [ // 故障信息 + "Node": "XXX", + "PollInterval": 1, + "ReInject": 1, + "Faults": [ { - "EventID":"0x80E21007", // 故障id,可参考faultCode.json文件 - "LogicID":1, // 待注入故障卡的逻辑id - "Severity":0, // - "Assertion":1, // 0代表FaultRecover,1代表FaultOccur,2代表FaultOnce - "TimeOffset":[0, 6] // 表示dp读取故障后,会生成两个故障信息,分别在第0秒和第6秒分别写入dp。 + "EventID":"0x80E21007", + "LogicID":1, + "Severity":0, + "Assertion":1, + "TimeOffset":[0, 6] } ] } ``` -注意:当ReInject="1"时,dp会重新读取Faults故障信息,读取完后,会自动把ReInject修改为"0",若要重新注入新的故障,需要手动修改ReInject为"1",可直接在configmap中修改: +字段含义: +* Node:待注入故障节点名 +* PollInterval:device-plugin读取configmap _mindx-dl-npu-fault-event_ 的时间间隔,单位秒 +* ReInject:是否把"Faults"中定义的故障写入dp,1表示是,0表示否 +* Faults:具体故障信息列表 +* EventID: 故障id,可参考faultCode.json文件 +* LogicID: npu卡的逻辑id +* Severity:0,事件级别,0:提示,1:次要(一般),2:重要,3:紧急 +* Assertion:1,事件类型,0:故障恢复,1:故障产生,2:通知类事件 +* TimeOffset: 故障发生的时间偏移量列表,每个偏移量都会对应生成一个故障 + +注意:创建configmap时,确保npuFaultCM.json文件中无注释 + +# 故障注入 + +修改configmap _mindx-dl-npu-fault-event_ 中的 ReInject字段值为1,即可向dp写入故障信息。 + +注意:dp会自动把ReInject置为0。若需要重新注入故障信息,需手动把ReInject重新修改为1。重新注入后,上一次注入的且尚未被处理的故障信息会被丢弃。 + +# 故障恢复 -`kubectl edit cm mindx-dl-npu-fault-event -n kube-system` +分三种类型: -修改ReInject的值并保存后,会重新注入新的故障到dp中。 +* 根据芯片当前故障码判断隔离级别:在configmap _mindx-dl-npu-fault-event_ 中配置对应故障恢复信息(Assertion字段置为0) +* 根据芯片的故障码持续时长判断隔离级别:需要等待一段时间恢复,比如网口link down 需要等待60秒 +* 根据芯片的故障码频率判断隔离级别:需要手动清理configmap _mindx-dl-deviceinfo-XXX_ 中的 ManuallySeparateNPU; diff --git a/component/ascend-device-plugin/pkg/server/fault_constructer.go 
b/component/ascend-device-plugin/pkg/server/fault_constructer.go index 3bc4d4bda..a8ace30a5 100644 --- a/component/ascend-device-plugin/pkg/server/fault_constructer.go +++ b/component/ascend-device-plugin/pkg/server/fault_constructer.go @@ -19,7 +19,9 @@ import ( "context" "encoding/json" "fmt" + "strconv" "strings" + "sync" "time" "huawei.com/npu-exporter/v6/common-utils/hwlog" @@ -37,14 +39,18 @@ const ( FaultEventCMNameSpace = "kube-system" // FaultEventFileKey key of loading npu faults FaultEventFileKey = "npuFaultCM.json" - // FaultEventCMPollInterval interval of polling npu fault event configmap - FaultEventCMPollInterval = 1 + // FaultEventCMPollSecInterval interval of polling npu fault event configmap, unit:second + FaultEventCMPollSecInterval = 1 + // FaultCacheSaveToDPMillInterval interval of saving cached npu fault to DP, unit:millisecond + FaultCacheSaveToDPMillInterval = 500 // ReInjectAllFaultsDefaultValue default value of re-injecting all faults in configmap ReInjectAllFaultsDefaultValue = 1 ) var ( - devFaultCache []npuCommon.DevFaultInfo + // faultCacheLock is used for devFaultCache which may be used concurrence + faultCacheLock sync.Mutex + devFaultCache []npuCommon.DevFaultInfo ) type FaultInfo struct { @@ -60,7 +66,14 @@ type FaultDebugConfig struct { PollInterval int64 ReInject int Faults []FaultInfo - updateCM bool +} + +func (hdm *HwDevManager) constructNpuFaultByCm(ctx context.Context) { + hwlog.RunLog.Infof("start construct npu fault from cm") + + go hdm.pollFaultEventFromCm(ctx) + go hdm.SaveCachedFaultToDP(ctx) + } func (hdm *HwDevManager) pollFaultEventFromCm(ctx context.Context) { @@ -74,8 +87,7 @@ func (hdm *HwDevManager) pollFaultEventFromCm(ctx context.Context) { return default: config := hdm.pollAndInjectFault() - - interval := int64(FaultEventCMPollInterval) + interval := int64(FaultEventCMPollSecInterval) if config != nil && config.PollInterval > 0 { interval = config.PollInterval } @@ -84,17 +96,27 @@ func (hdm *HwDevManager) 
pollFaultEventFromCm(ctx context.Context) { } } +func (hdm *HwDevManager) SaveCachedFaultToDP(ctx context.Context) { + for { + select { + case _, ok := <-ctx.Done(): + if !ok { + hwlog.RunLog.Info("stop signal channel closed") + } + hwlog.RunLog.Info("save cached fault to dp stop") + return + default: + hdm.injectDevFaultToDp() + time.Sleep(time.Duration(FaultCacheSaveToDPMillInterval) * time.Millisecond) + } + } +} + func (hdm *HwDevManager) pollAndInjectFault() *FaultDebugConfig { configMap, err := hdm.manager.GetKubeClient().GetConfigMap(FaultEventCMName, FaultEventCMNameSpace) if err != nil { - hwlog.RunLog.Errorf("cannot find '%s' configmap, reason: %v", FaultEventCMName, err) - return nil - } - - node, err := kubeclient.GetNodeNameFromEnv() - if err != nil || node == "" { - hwlog.RunLog.Errorf("cannot get node from env, reason: %v", err) + hwlog.RunLog.Infof("cannot find '%s' configmap, reason: %v", FaultEventCMName, err) return nil } @@ -104,33 +126,33 @@ func (hdm *HwDevManager) pollAndInjectFault() *FaultDebugConfig { return nil } + if config.ReInject != ReInjectAllFaultsDefaultValue { + return config + } + hwlog.RunLog.Infof("ReInject value is '%d' in CM, start saving to DP", config.ReInject) + + node, err := kubeclient.GetNodeNameFromEnv() + if err != nil || node == "" { + hwlog.RunLog.Errorf("cannot get node from env, reason: %v", err) + return config + } + if node != config.Node { hwlog.RunLog.Infof("dont have node '%s' in configmap, target nodes: %s", node, config.Node) return config } // reset devFaultCache - if config.ReInject == ReInjectAllFaultsDefaultValue { - hdm.updateDevFaultCache(config.Faults) - config.ReInject = 0 - config.updateCM = true - } + hdm.updateDevFaultCache(config.Faults) + config.ReInject = 0 - hdm.injectDevFaultToDp() + hdm.updateConfigMap(config, configMap) - if config.PollInterval == -1 { - config.PollInterval = FaultEventCMPollInterval - config.updateCM = true - - } - if config.updateCM { - hdm.updateConfigMap(config, 
configMap) - } return config } func (hbm *HwDevManager) updateDevFaultCache(faultInfos []FaultInfo) { - devFaultCache = make([]npuCommon.DevFaultInfo, 0) + tempDevFaultCache := make([]npuCommon.DevFaultInfo, 0) now := time.Now() // save npu device fault @@ -153,19 +175,27 @@ func (hbm *HwDevManager) updateDevFaultCache(faultInfos []FaultInfo) { Assertion: fault.Assertion, AlarmRaisedTime: rasedTime.UnixMilli(), } - devFaultCache = append(devFaultCache, devFault) + tempDevFaultCache = append(tempDevFaultCache, devFault) + hwlog.RunLog.Infof("add npu fault to dp cache, devFaultInfo: %v, hex code: %v", + devFault, strconv.FormatInt(devFault.EventID, common.Hex)) } } - hwlog.RunLog.Infof("update cache fault data from configmap '%s' finished, fault count: %d", - FaultEventCMName, len(devFaultCache)) + + faultCacheLock.Lock() + hwlog.RunLog.Infof("update cache fault data from configmap '%s' finished, pre fault cnt: %d, latest fault count: %d", + FaultEventCMName, len(devFaultCache), len(tempDevFaultCache)) + devFaultCache = tempDevFaultCache + faultCacheLock.Unlock() } func (hbm *HwDevManager) injectDevFaultToDp() { + faultCacheLock.Lock() + defer faultCacheLock.Unlock() + nowTime := time.Now().UnixMilli() newDevFaultCache := make([]npuCommon.DevFaultInfo, 0) for _, devFault := range devFaultCache { if nowTime >= devFault.AlarmRaisedTime { - hwlog.RunLog.Infof("inject dev fault info to dp, data: %v", devFault) common.SaveDevFaultInfo(devFault) continue } @@ -178,6 +208,7 @@ func (hdm *HwDevManager) updateConfigMap(config *FaultDebugConfig, configMap *v1 configBytes, err1 := json.Marshal(*config) if err1 != nil { hwlog.RunLog.Errorf("marshal FaultDebugConfig fail, data: %v err: %v", config, err1) + return } configMap.Data[FaultEventFileKey] = string(configBytes) _, err := hdm.manager.GetKubeClient().UpdateConfigMap(configMap) @@ -192,9 +223,7 @@ func parseFaultDebugConfigJson(configMap *v1.ConfigMap) (*FaultDebugConfig, erro return nil, fmt.Errorf("cannot find data '%s' 
in CM'", FaultEventFileKey) } configInfo := &FaultDebugConfig{ - PollInterval: -1, - ReInject: ReInjectAllFaultsDefaultValue, - updateCM: false, + PollInterval: FaultEventCMPollSecInterval, } if err := json.Unmarshal([]byte(jsonStr), configInfo); err != nil { return nil, fmt.Errorf("cannot unmarshal json data '%s' in CM, err: %v", FaultEventFileKey, err) @@ -203,9 +232,7 @@ func parseFaultDebugConfigJson(configMap *v1.ConfigMap) (*FaultDebugConfig, erro } func convertFaultCodeHexToInt(hexStr string) (int64, error) { - if !strings.HasPrefix(hexStr, "0x") { - hexStr = "0x" + hexStr - } + hexStr = strings.TrimPrefix(hexStr, "0x") codes := common.StringTool.HexStringToInt([]string{hexStr}) if len(codes) == 0 { return -1, fmt.Errorf("convert fault code hex string '%s' to int failed", hexStr) diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index 7f9eeb153..dee872d2d 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -379,7 +379,7 @@ func (hdm *HwDevManager) ListenDevice(ctx context.Context) { go hdm.pollFaultCodeCM(ctx) go hdm.Serve(ctx) - go hdm.pollFaultEventFromCm(ctx) + hdm.constructNpuFaultByCm(ctx) if common.ParamOption.CheckCachedPods { go hdm.manager.GetKubeClient().PodInformerInspector(ctx) -- Gitee From 986ca88d9f16e0b0ceaf074a9ca7858390a7b3fa Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Fri, 3 Jan 2025 10:48:41 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E3=80=90device-plugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91=E5=88=87=E6=8D=A2?= =?UTF-8?q?=E5=88=B0ascend-common=E5=92=8Cmind-cluster?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/ascend-device-plugin/pkg/server/FaultDebug.md | 4 ++-- .../ascend-device-plugin/pkg/server/fault_constructer.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/component/ascend-device-plugin/pkg/server/FaultDebug.md b/component/ascend-device-plugin/pkg/server/FaultDebug.md index c8205e425..b726f1b33 100644 --- a/component/ascend-device-plugin/pkg/server/FaultDebug.md +++ b/component/ascend-device-plugin/pkg/server/FaultDebug.md @@ -2,9 +2,9 @@ ## 修改源码 -下载**fault_constructer.go**文件,并复制到 _mindxdl/component/ascend-device-plugin/pkg/server/_ 路径下。 +下载**fault_constructer.go**文件,并复制到 _mind-cluster/component/ascend-device-plugin/pkg/server/_ 路径下。 -修改文件 _mindxdl/component/ascend-device-plugin/pkg/server/manager.go_,在ListenDevice方法中增加一行代码: +修改文件 _mind-cluster/component/ascend-device-plugin/pkg/server/manager.go_,在ListenDevice方法中增加一行代码: ```golang func (hdm *HwDevManager) ListenDevice(ctx context.Context) { diff --git a/component/ascend-device-plugin/pkg/server/fault_constructer.go b/component/ascend-device-plugin/pkg/server/fault_constructer.go index a8ace30a5..8e591c97f 100644 --- a/component/ascend-device-plugin/pkg/server/fault_constructer.go +++ b/component/ascend-device-plugin/pkg/server/fault_constructer.go @@ -24,12 +24,12 @@ import ( "sync" "time" - "huawei.com/npu-exporter/v6/common-utils/hwlog" - npuCommon "huawei.com/npu-exporter/v6/devmanager/common" "k8s.io/api/core/v1" "Ascend-device-plugin/pkg/common" "Ascend-device-plugin/pkg/kubeclient" + "ascend-common/common-utils/hwlog" + npuCommon "ascend-common/devmanager/common" ) const ( -- Gitee From 17f14aa9537f8cf01a9b3e1473b1746b086e7b67 Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Mon, 6 Jan 2025 15:45:14 +0800 Subject: [PATCH 4/5] =?UTF-8?q?=E3=80=90device-plugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E6=B3=A8=E5=85=A5=E6=95=85=E9=9A=9C=E6=96=B9?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/server/FaultDebug.md | 7 + .../pkg/server/fault_constructer.go | 131 +++++++++++++++--- 
.../pkg/server/fault_writer.py | 55 ++++++++ 3 files changed, 174 insertions(+), 19 deletions(-) create mode 100644 component/ascend-device-plugin/pkg/server/fault_writer.py diff --git a/component/ascend-device-plugin/pkg/server/FaultDebug.md b/component/ascend-device-plugin/pkg/server/FaultDebug.md index b726f1b33..4ffb65a76 100644 --- a/component/ascend-device-plugin/pkg/server/FaultDebug.md +++ b/component/ascend-device-plugin/pkg/server/FaultDebug.md @@ -34,6 +34,13 @@ _npuFaultCM.json_ 文件内容示例如下: "Severity":0, "Assertion":1, "TimeOffset":[0, 6] + }, + { + "EventID":"0x80E21007", + "LogicID":1, + "Severity":0, + "Assertion":0, + "TimeOffset":[12] } ] } diff --git a/component/ascend-device-plugin/pkg/server/fault_constructer.go b/component/ascend-device-plugin/pkg/server/fault_constructer.go index 8e591c97f..44d550e5c 100644 --- a/component/ascend-device-plugin/pkg/server/fault_constructer.go +++ b/component/ascend-device-plugin/pkg/server/fault_constructer.go @@ -18,7 +18,9 @@ package server import ( "context" "encoding/json" + "errors" "fmt" + "os" "strconv" "strings" "sync" @@ -29,6 +31,7 @@ import ( "Ascend-device-plugin/pkg/common" "Ascend-device-plugin/pkg/kubeclient" "ascend-common/common-utils/hwlog" + "ascend-common/common-utils/utils" npuCommon "ascend-common/devmanager/common" ) @@ -45,6 +48,8 @@ const ( FaultCacheSaveToDPMillInterval = 500 // ReInjectAllFaultsDefaultValue default value of re-injecting all faults in configmap ReInjectAllFaultsDefaultValue = 1 + // FaultEventFileAbsPath file absolute path of injecting fault event with file + FaultEventFileAbsPath = "/user/inject/fault/npuFaultFile.json" ) var ( @@ -62,18 +67,53 @@ type FaultInfo struct { } type FaultDebugConfig struct { - Node string + Node string // When injecting faults through local files, this field does not work PollInterval int64 ReInject int Faults []FaultInfo } func (hdm *HwDevManager) constructNpuFaultByCm(ctx context.Context) { - hwlog.RunLog.Infof("start construct npu fault 
from cm") + hwlog.RunLog.Infof("start construct npu fault from cm or file") + if err := hdm.createFaultFile(); err != nil { + hwlog.RunLog.Errorf("create fault file fail, err: %v", err) + return + } + go hdm.loadFaultEventFromFile(ctx) go hdm.pollFaultEventFromCm(ctx) - go hdm.SaveCachedFaultToDP(ctx) + go hdm.saveCachedFaultToDP(ctx) +} +func (hdm *HwDevManager) createFaultFile() error { + if utils.IsExist(FaultEventFileAbsPath) { + return nil + } + defaultConfig := &FaultDebugConfig{ + PollInterval: FaultEventCMPollSecInterval, + ReInject: 0, + } + return hdm.updateFaultInjectFile(defaultConfig) +} + +func (hdm *HwDevManager) loadFaultEventFromFile(ctx context.Context) { + for { + select { + case _, ok := <-ctx.Done(): + if !ok { + hwlog.RunLog.Info("stop signal channel closed") + } + hwlog.RunLog.Info("load fault event from file stop") + return + default: + interval := int64(FaultEventCMPollSecInterval) + config := hdm.readAndInjectFaultFromFile() + if config != nil && config.PollInterval > 0 { + interval = config.PollInterval + } + time.Sleep(time.Duration(interval) * time.Second) + } + } } func (hdm *HwDevManager) pollFaultEventFromCm(ctx context.Context) { @@ -83,11 +123,11 @@ func (hdm *HwDevManager) pollFaultEventFromCm(ctx context.Context) { if !ok { hwlog.RunLog.Info("stop signal channel closed") } - hwlog.RunLog.Info("poll fault event cm stop") + hwlog.RunLog.Info("poll fault event from cm stop") return default: - config := hdm.pollAndInjectFault() interval := int64(FaultEventCMPollSecInterval) + config := hdm.pollAndInjectFaultFromCm() if config != nil && config.PollInterval > 0 { interval = config.PollInterval } @@ -96,7 +136,7 @@ func (hdm *HwDevManager) pollFaultEventFromCm(ctx context.Context) { } } -func (hdm *HwDevManager) SaveCachedFaultToDP(ctx context.Context) { +func (hdm *HwDevManager) saveCachedFaultToDP(ctx context.Context) { for { select { case _, ok := <-ctx.Done(): @@ -112,11 +152,30 @@ func (hdm *HwDevManager) SaveCachedFaultToDP(ctx 
context.Context) { } } -func (hdm *HwDevManager) pollAndInjectFault() *FaultDebugConfig { +func (hdm *HwDevManager) readAndInjectFaultFromFile() *FaultDebugConfig { + config, err := readFaultDebugFileJson() + if err != nil { + hwlog.RunLog.ErrorfWithLimit(FaultEventFileAbsPath, 1, "cannot load fault from '%s' file, reason: %v", FaultEventFileAbsPath, err) + return nil + } + if config.ReInject != ReInjectAllFaultsDefaultValue { + return config + } + + hwlog.RunLog.Infof("ReInject value is '%d' in file, start saving to DP", config.ReInject) + // reset devFaultCache + hdm.updateDevFaultCache(config.Faults) + config.ReInject = 0 + + hdm.updateFaultInjectFile(config) + return config +} + +func (hdm *HwDevManager) pollAndInjectFaultFromCm() *FaultDebugConfig { configMap, err := hdm.manager.GetKubeClient().GetConfigMap(FaultEventCMName, FaultEventCMNameSpace) if err != nil { - hwlog.RunLog.Infof("cannot find '%s' configmap, reason: %v", FaultEventCMName, err) + hwlog.RunLog.ErrorfWithLimit(FaultEventCMName, 1, "cannot find '%s' configmap, reason: %v", FaultEventCMName, err) return nil } @@ -151,7 +210,7 @@ func (hdm *HwDevManager) pollAndInjectFault() *FaultDebugConfig { return config } -func (hbm *HwDevManager) updateDevFaultCache(faultInfos []FaultInfo) { +func (hdm *HwDevManager) updateDevFaultCache(faultInfos []FaultInfo) { tempDevFaultCache := make([]npuCommon.DevFaultInfo, 0) now := time.Now() @@ -182,13 +241,13 @@ func (hbm *HwDevManager) updateDevFaultCache(faultInfos []FaultInfo) { } faultCacheLock.Lock() - hwlog.RunLog.Infof("update cache fault data from configmap '%s' finished, pre fault cnt: %d, latest fault count: %d", - FaultEventCMName, len(devFaultCache), len(tempDevFaultCache)) + hwlog.RunLog.Infof("update cache fault data finished, pre fault cnt: %d, latest fault count: %d", + len(devFaultCache), len(tempDevFaultCache)) devFaultCache = tempDevFaultCache faultCacheLock.Unlock() } -func (hbm *HwDevManager) injectDevFaultToDp() { +func (hdm *HwDevManager) 
injectDevFaultToDp() { faultCacheLock.Lock() defer faultCacheLock.Unlock() @@ -205,16 +264,35 @@ func (hbm *HwDevManager) injectDevFaultToDp() { } func (hdm *HwDevManager) updateConfigMap(config *FaultDebugConfig, configMap *v1.ConfigMap) { - configBytes, err1 := json.Marshal(*config) - if err1 != nil { - hwlog.RunLog.Errorf("marshal FaultDebugConfig fail, data: %v err: %v", config, err1) + configBytes, err := json.Marshal(*config) + if err != nil { + hwlog.RunLog.Errorf("marshal FaultDebugConfig fail, data: %v reason: %v", config, err) return } configMap.Data[FaultEventFileKey] = string(configBytes) - _, err := hdm.manager.GetKubeClient().UpdateConfigMap(configMap) + _, err = hdm.manager.GetKubeClient().UpdateConfigMap(configMap) + if err != nil { + hwlog.RunLog.Errorf("update '%s' configmap fail, reason: %v", FaultEventCMName, err) + } +} + +func (hdm *HwDevManager) updateFaultInjectFile(config *FaultDebugConfig) error { + configBytes, err := json.Marshal(*config) if err != nil { - hwlog.RunLog.Errorf("update '%s' configmap fail, err: %v", FaultEventCMName, err) + hwlog.RunLog.Errorf("marshal FaultDebugConfig fail, data: %v err: %v", config, err) + return fmt.Errorf("marshal FaultDebugConfig fail, data: %v err: %v", config, err) } + f, err := os.OpenFile(FaultEventFileAbsPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + if err != nil { + hwlog.RunLog.Errorf("open fault file failed, reason: %v", err) + return fmt.Errorf("open fault file failed, reason: %v", err) + } + defer f.Close() + if _, err = f.WriteString(string(configBytes)); err != nil { + hwlog.RunLog.Errorf("write fault file failed, reason: %v", err) + return fmt.Errorf("write fault file failed, reason: %v", err) + } + return nil } func parseFaultDebugConfigJson(configMap *v1.ConfigMap) (*FaultDebugConfig, error) { @@ -222,11 +300,26 @@ func parseFaultDebugConfigJson(configMap *v1.ConfigMap) (*FaultDebugConfig, erro if !ok { return nil, fmt.Errorf("cannot find data '%s' in CM'", FaultEventFileKey) } + 
return convertByteToFaultDebugConfig([]byte(jsonStr)) +} + +func readFaultDebugFileJson() (*FaultDebugConfig, error) { + faultCodeBytes, err := utils.LoadFile(FaultEventFileAbsPath) + if err != nil { + return nil, fmt.Errorf("load fault event json file failed, path: %v, reason: %v", FaultEventFileAbsPath, err) + } + if faultCodeBytes == nil { + return nil, errors.New("the file does not exist or for other reasons, the read data is empty") + } + return convertByteToFaultDebugConfig(faultCodeBytes) +} + +func convertByteToFaultDebugConfig(bytes []byte) (*FaultDebugConfig, error) { configInfo := &FaultDebugConfig{ PollInterval: FaultEventCMPollSecInterval, } - if err := json.Unmarshal([]byte(jsonStr), configInfo); err != nil { - return nil, fmt.Errorf("cannot unmarshal json data '%s' in CM, err: %v", FaultEventFileKey, err) + if err := json.Unmarshal(bytes, configInfo); err != nil { + return nil, fmt.Errorf("cannot unmarshal json data, data: %s, reason: %v", string(bytes), err) } return configInfo, nil } diff --git a/component/ascend-device-plugin/pkg/server/fault_writer.py b/component/ascend-device-plugin/pkg/server/fault_writer.py new file mode 100644 index 000000000..fc5592f62 --- /dev/null +++ b/component/ascend-device-plugin/pkg/server/fault_writer.py @@ -0,0 +1,55 @@ +import os +import json + +class FaultInfo: + def __init__(self, EventID=None, LogicID=None, Severity=None, Assertion=None, TimeOffset=None): + self.EventID = EventID + self.LogicID = LogicID + self.Severity = Severity + self.Assertion = Assertion + self.TimeOffset = TimeOffset + def to_dict(self): + return { + 'EventID': self.EventID, + 'LogicID': self.LogicID, + 'Severity': self.Severity, + 'Assertion': self.Assertion, + 'TimeOffset': self.TimeOffset + } + +class FaultDebugConfig: + def __init__(self, Node=None, PollInterval=None, ReInject=None, Faults=None): + self.Node = Node + self.PollInterval = PollInterval + self.ReInject = ReInject + self.Faults = Faults + +def create_and_write_json_file(): 
+ file_path = "/user/inject/fault/npuFaultFile.json" + Faults=[ + FaultInfo( + EventID="0x80E21007", + LogicID=1, + Severity=0, + Assertion=1, + TimeOffset=[0, 6] + ), + FaultInfo( + EventID="0x80E21007", + LogicID=1, + Severity=0, + Assertion=0, + TimeOffset=[12] + ) + ] + + json_data = FaultDebugConfig( + Node="XXX", + PollInterval=1, + ReInject=1, + Faults=[fault.to_dict() for fault in Faults] + ) + with open(file_path, 'w') as f: + json.dump(json_data.__dict__, f, indent=4) + +# create_and_write_json_file() \ No newline at end of file -- Gitee From 4775dc1d9ae01becf87694350d338cc77addeff4 Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Tue, 18 Feb 2025 16:03:19 +0800 Subject: [PATCH 5/5] =?UTF-8?q?=E3=80=90device-plugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E6=B3=A8=E5=85=A5=E6=95=85=E9=9A=9C=E6=96=B9?= =?UTF-8?q?=E5=BC=8F,=E4=BF=AE=E6=94=B9=E5=88=9B=E5=BB=BA=E6=96=87?= =?UTF-8?q?=E4=BB=B6bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/server/fault_constructer.go | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/component/ascend-device-plugin/pkg/server/fault_constructer.go b/component/ascend-device-plugin/pkg/server/fault_constructer.go index 44d550e5c..df03dcc7e 100644 --- a/component/ascend-device-plugin/pkg/server/fault_constructer.go +++ b/component/ascend-device-plugin/pkg/server/fault_constructer.go @@ -21,6 +21,7 @@ import ( "errors" "fmt" "os" + "path/filepath" "strconv" "strings" "sync" @@ -77,17 +78,19 @@ func (hdm *HwDevManager) constructNpuFaultByCm(ctx context.Context) { hwlog.RunLog.Infof("start construct npu fault from cm or file") if err := hdm.createFaultFile(); err != nil { hwlog.RunLog.Errorf("create fault file fail, err: %v", err) - return + } else { + go hdm.loadFaultEventFromFile(ctx) } - - go hdm.loadFaultEventFromFile(ctx) go hdm.pollFaultEventFromCm(ctx) 
go hdm.saveCachedFaultToDP(ctx) } func (hdm *HwDevManager) createFaultFile() error { - if utils.IsExist(FaultEventFileAbsPath) { - return nil + dir := filepath.Dir(FaultEventFileAbsPath) + if !utils.IsExist(dir) { + if err := os.MkdirAll(dir, os.ModePerm); err != nil { + return fmt.Errorf("mkdir fail, err: %v", err) + } } defaultConfig := &FaultDebugConfig{ PollInterval: FaultEventCMPollSecInterval, @@ -175,12 +178,12 @@ func (hdm *HwDevManager) pollAndInjectFaultFromCm() *FaultDebugConfig { configMap, err := hdm.manager.GetKubeClient().GetConfigMap(FaultEventCMName, FaultEventCMNameSpace) if err != nil { - hwlog.RunLog.ErrorfWithLimit(FaultEventCMName, 1, "cannot find '%s' configmap, reason: %v", FaultEventCMName, err) + hwlog.RunLog.ErrorfWithLimit(FaultEventCMName, 2, "cannot find '%s' configmap, reason: %v", FaultEventCMName, err) return nil } config, err := parseFaultDebugConfigJson(configMap) - if err != nil { + if err != nil || config == nil { hwlog.RunLog.Error(err) return nil } @@ -282,7 +285,7 @@ func (hdm *HwDevManager) updateFaultInjectFile(config *FaultDebugConfig) error { hwlog.RunLog.Errorf("marshal FaultDebugConfig fail, data: %v err: %v", config, err) return fmt.Errorf("marshal FaultDebugConfig fail, data: %v err: %v", config, err) } - f, err := os.OpenFile(FaultEventFileAbsPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + f, err := os.OpenFile(FaultEventFileAbsPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm) if err != nil { hwlog.RunLog.Errorf("open fault file failed, reason: %v", err) return fmt.Errorf("open fault file failed, reason: %v", err) -- Gitee