diff --git a/component/ascend-device-plugin/pkg/server/FaultDebug.md b/component/ascend-device-plugin/pkg/server/FaultDebug.md new file mode 100644 index 0000000000000000000000000000000000000000..4ffb65a767d4ecc5af832c4b5fc779273e723700 --- /dev/null +++ b/component/ascend-device-plugin/pkg/server/FaultDebug.md @@ -0,0 +1,74 @@ +# 使用方法 + +## 修改源码 + +下载**fault_constructer.go**文件,并复制到 _mind-cluster/component/ascend-device-plugin/pkg/server/_ 路径下。 + +修改文件 _mind-cluster/component/ascend-device-plugin/pkg/server/manager.go_,在ListenDevice方法中增加一行代码: + +```golang +func (hdm *HwDevManager) ListenDevice(ctx context.Context) { + ...... + go hdm.Serve(ctx) + // 增加下一行代码,从configmap中读取并处理故障信息 + hdm.constructNpuFaultByCm(ctx) + ...... +} +``` + +## 创建configmap + +`kubectl create cm mindx-dl-npu-fault-event -n kube-system --from-file="npuFaultCM.json"` + +_npuFaultCM.json_ 文件内容示例如下: + +```json + { + "Node": "XXX", + "PollInterval": 1, + "ReInject": 1, + "Faults": [ + { + "EventID":"0x80E21007", + "LogicID":1, + "Severity":0, + "Assertion":1, + "TimeOffset":[0, 6] + }, + { + "EventID":"0x80E21007", + "LogicID":1, + "Severity":0, + "Assertion":0, + "TimeOffset":[12] + } + ] +} +``` + +字段含义: +* Node:待注入故障节点名 +* PollInterval:device-plugin读取configmap _mindx-dl-npu-fault-event_ 的时间间隔,单位秒 +* ReInject:是否把"Faults"中定义的故障写入dp,1表示是,0表示否 +* Faults:具体故障信息列表 +* EventID: 故障id,可参考faultCode.json文件 +* LogicID: npu卡的逻辑id +* Severity:0,事件级别,0:提示,1:次要(一般),2:重要,3:紧急 +* Assertion:1,事件类型,0:故障恢复,1:故障产生,2:通知类事件 +* TimeOffset: 故障发生的时间偏移量列表,每个偏移量都会对应生成一个故障 + +注意:创建configmap时,确保npuFaultCM.json文件中无注释 + +# 故障注入 + +修改configmap _mindx-dl-npu-fault-event_ 中的 ReInject字段值为1,即可向dp写入故障信息。 + +注意:dp会自动把ReInject置为0。若需要重新注入故障信息,需手动把ReInject重新修改为1。重新注入后,上一次注入的且尚未被处理的故障信息会被丢弃。 + +# 故障恢复 + +分三种类型: + +* 根据芯片当前故障码判断隔离级别:在configmap _mindx-dl-npu-fault-event_ 中配置对应故障恢复信息(Assertion字段置为0) +* 根据芯片的故障码持续时长判断隔离级别:需要等待一段时间恢复,比如网口link down 需要等待60秒 +* 根据芯片的故障码频率判断隔离级别:需要手动清理configmap _mindx-dl-deviceinfo-XXX_ 中的 ManuallySeparateNPU; diff
--git a/component/ascend-device-plugin/pkg/server/fault_constructer.go b/component/ascend-device-plugin/pkg/server/fault_constructer.go new file mode 100644 index 0000000000000000000000000000000000000000..df03dcc7eaadd54c3337e2007956cbb6720012e3 --- /dev/null +++ b/component/ascend-device-plugin/pkg/server/fault_constructer.go @@ -0,0 +1,337 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package server holds the implementation of registration to kubelet, k8s pod resource interface. 
+package server
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"k8s.io/api/core/v1"
+
+	"Ascend-device-plugin/pkg/common"
+	"Ascend-device-plugin/pkg/kubeclient"
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/common-utils/utils"
+	npuCommon "ascend-common/devmanager/common"
+)
+
+const (
+	// FaultEventCMName name of npu fault event configmap
+	FaultEventCMName = "mindx-dl-npu-fault-event"
+	// FaultEventCMNameSpace namespace of npu fault event configmap
+	FaultEventCMNameSpace = "kube-system"
+	// FaultEventFileKey key of loading npu faults
+	FaultEventFileKey = "npuFaultCM.json"
+	// FaultEventCMPollSecInterval interval of polling npu fault event configmap, unit:second
+	FaultEventCMPollSecInterval = 1
+	// FaultCacheSaveToDPMillInterval interval of saving cached npu fault to DP, unit:millisecond
+	FaultCacheSaveToDPMillInterval = 500
+	// ReInjectAllFaultsDefaultValue default value of re-injecting all faults in configmap
+	ReInjectAllFaultsDefaultValue = 1
+	// FaultEventFileAbsPath file absolute path of injecting fault event with file
+	FaultEventFileAbsPath = "/user/inject/fault/npuFaultFile.json"
+)
+
+var (
+	// faultCacheLock guards devFaultCache, which is replaced by the two polling
+	// goroutines and drained by the goroutine that flushes due faults to DP.
+	faultCacheLock sync.Mutex
+	devFaultCache  []npuCommon.DevFaultInfo
+)
+
+// FaultInfo is one fault entry of the debug configuration. EventID is the fault
+// code as a hex string (for example "0x80E21007"). TimeOffset lists offsets in
+// seconds relative to the injection time; one fault is generated per offset and
+// an empty list is treated as a single offset of 0.
+type FaultInfo struct {
+	EventID    string
+	LogicID    int32
+	Severity   int8
+	Assertion  int8
+	TimeOffset []int64
+}
+
+// FaultDebugConfig is the JSON document read from the fault event configmap or
+// from the local fault file. Faults are injected only while ReInject equals
+// ReInjectAllFaultsDefaultValue; it is written back as 0 after an injection.
+type FaultDebugConfig struct {
+	Node         string // When injecting faults through local files, this field does not work
+	PollInterval int64
+	ReInject     int
+	Faults       []FaultInfo
+}
+
+// constructNpuFaultByCm starts the fault-injection goroutines: one polling the
+// local fault file (only if the file could be created), one polling the
+// configmap, and one flushing cached faults whose raise time has been reached.
+// All of them stop when ctx is cancelled.
+func (hdm *HwDevManager) constructNpuFaultByCm(ctx context.Context) {
+	hwlog.RunLog.Info("start construct npu fault from cm or file")
+	if err := hdm.createFaultFile(); err != nil {
+		hwlog.RunLog.Errorf("create fault file fail, err: %v", err)
+	} else {
+		go hdm.loadFaultEventFromFile(ctx)
+	}
+	go hdm.pollFaultEventFromCm(ctx)
+	go hdm.saveCachedFaultToDP(ctx)
+}
+
+// createFaultFile makes sure the directory of FaultEventFileAbsPath exists and
+// (re)writes the fault file with a default configuration whose ReInject is 0.
+func (hdm *HwDevManager) createFaultFile() error {
+	// MkdirAll succeeds when the directory already exists, so no separate
+	// existence check is needed.
+	// NOTE(review): os.ModePerm leaves the directory world-writable (subject to
+	// umask); confirm that this is intended for the injection workflow.
+	if err := os.MkdirAll(filepath.Dir(FaultEventFileAbsPath), os.ModePerm); err != nil {
+		return fmt.Errorf("mkdir fail, err: %w", err)
+	}
+	defaultConfig := &FaultDebugConfig{
+		PollInterval: FaultEventCMPollSecInterval,
+		ReInject:     0,
+	}
+	return hdm.updateFaultInjectFile(defaultConfig)
+}
+
+// runFaultDebugLoop calls step until ctx is cancelled and waits for the duration
+// returned by step between two calls. The wait is interrupted by cancellation,
+// so shutdown is not delayed by a long poll interval. stopMsg is logged on exit.
+func runFaultDebugLoop(ctx context.Context, stopMsg string, step func() time.Duration) {
+	for ctx.Err() == nil {
+		timer := time.NewTimer(step())
+		select {
+		case <-ctx.Done():
+		case <-timer.C:
+		}
+		timer.Stop()
+	}
+	hwlog.RunLog.Info(stopMsg)
+}
+
+// faultPollInterval converts the PollInterval of config (seconds) to a duration.
+// It falls back to FaultEventCMPollSecInterval when config is nil, when the
+// value is not positive, or when the conversion would overflow.
+func faultPollInterval(config *FaultDebugConfig) time.Duration {
+	const fallback = FaultEventCMPollSecInterval * time.Second
+	if config == nil || config.PollInterval <= 0 {
+		return fallback
+	}
+	interval := time.Duration(config.PollInterval) * time.Second
+	if interval/time.Second != time.Duration(config.PollInterval) {
+		// The multiplication overflowed; a negative wait would busy-loop.
+		return fallback
+	}
+	return interval
+}
+
+// loadFaultEventFromFile polls the local fault file until ctx is cancelled.
+func (hdm *HwDevManager) loadFaultEventFromFile(ctx context.Context) {
+	runFaultDebugLoop(ctx, "load fault event from file stop", func() time.Duration {
+		return faultPollInterval(hdm.readAndInjectFaultFromFile())
+	})
+}
+
+// pollFaultEventFromCm polls the fault event configmap until ctx is cancelled.
+func (hdm *HwDevManager) pollFaultEventFromCm(ctx context.Context) {
+	runFaultDebugLoop(ctx, "poll fault event from cm stop", func() time.Duration {
+		return faultPollInterval(hdm.pollAndInjectFaultFromCm())
+	})
+}
+
+// saveCachedFaultToDP flushes due cached faults to DP every
+// FaultCacheSaveToDPMillInterval milliseconds until ctx is cancelled.
+func (hdm *HwDevManager) saveCachedFaultToDP(ctx context.Context) {
+	runFaultDebugLoop(ctx, "save cached fault to dp stop", func() time.Duration {
+		hdm.injectDevFaultToDp()
+		return FaultCacheSaveToDPMillInterval * time.Millisecond
+	})
+}
+
+// readAndInjectFaultFromFile reads the local fault file and, when ReInject is
+// set, replaces the fault cache with the configured faults and writes the file
+// back with ReInject reset to 0. It returns nil when the file cannot be loaded.
+func (hdm *HwDevManager) readAndInjectFaultFromFile() *FaultDebugConfig {
+	config, err := readFaultDebugFileJson()
+	if err != nil {
+		hwlog.RunLog.ErrorfWithLimit(FaultEventFileAbsPath, 1, "cannot load fault from '%s' file, reason: %v", FaultEventFileAbsPath, err)
+		return nil
+	}
+	if config.ReInject != ReInjectAllFaultsDefaultValue {
+		return config
+	}
+
+	hwlog.RunLog.Infof("ReInject value is '%d' in file, start saving to DP", config.ReInject)
+	// reset devFaultCache
+	hdm.updateDevFaultCache(config.Faults)
+	config.ReInject = 0
+
+	// If the reset cannot be persisted, ReInject stays 1 in the file and the
+	// same faults are injected again on the next poll, so make it visible.
+	if err = hdm.updateFaultInjectFile(config); err != nil {
+		hwlog.RunLog.Errorf("reset ReInject in fault file fail, faults will be injected again, err: %v", err)
+	}
+	return config
+}
+
+// pollAndInjectFaultFromCm reads the fault event configmap and, when ReInject is
+// set and the configured Node matches this node, replaces the fault cache with
+// the configured faults and writes ReInject back as 0. It returns nil when the
+// configmap cannot be fetched or parsed.
+func (hdm *HwDevManager) pollAndInjectFaultFromCm() *FaultDebugConfig {
+	configMap, err := hdm.manager.GetKubeClient().GetConfigMap(FaultEventCMName, FaultEventCMNameSpace)
+	if err != nil {
+		hwlog.RunLog.ErrorfWithLimit(FaultEventCMName, 2, "cannot find '%s' configmap, reason: %v", FaultEventCMName, err)
+		return nil
+	}
+
+	config, err := parseFaultDebugConfigJson(configMap)
+	if err != nil || config == nil {
+		hwlog.RunLog.Error(err)
+		return nil
+	}
+
+	if config.ReInject != ReInjectAllFaultsDefaultValue {
+		return config
+	}
+	hwlog.RunLog.Infof("ReInject value is '%d' in CM, start saving to DP", config.ReInject)
+
+	node, err := kubeclient.GetNodeNameFromEnv()
+	if err != nil || node == "" {
+		hwlog.RunLog.Errorf("cannot get node from env, reason: %v", err)
+		return config
+	}
+
+	if node != config.Node {
+		hwlog.RunLog.Infof("dont have node '%s' in configmap, target nodes: %s", node, config.Node)
+		return config
+	}
+
+	// reset devFaultCache
+	hdm.updateDevFaultCache(config.Faults)
+	config.ReInject = 0
+
+	hdm.updateConfigMap(config, configMap)
+
+	return config
+}
+
+// updateDevFaultCache replaces the fault cache with the faults described by
+// faultInfos. Each TimeOffset entry yields one fault whose raise time is the
+// current time plus that offset in seconds. Entries whose EventID cannot be
+// parsed are logged and skipped.
+func (hdm *HwDevManager) updateDevFaultCache(faultInfos []FaultInfo) {
+	pending := make([]npuCommon.DevFaultInfo, 0, len(faultInfos))
+	now := time.Now()
+
+	for _, fault := range faultInfos {
+		eventID, err := convertFaultCodeHexToInt(fault.EventID)
+		if err != nil {
+			hwlog.RunLog.Errorf("get fault code fail, reason: %v", err)
+			continue
+		}
+		offsets := fault.TimeOffset
+		if len(offsets) == 0 {
+			// no offset configured: raise the fault immediately
+			offsets = []int64{0}
+		}
+		for _, offset := range offsets {
+			raisedTime := now.Add(time.Duration(offset) * time.Second)
+
+			devFault := npuCommon.DevFaultInfo{
+				EventID:         eventID,
+				LogicID:         fault.LogicID,
+				Severity:        fault.Severity,
+				Assertion:       fault.Assertion,
+				AlarmRaisedTime: raisedTime.UnixMilli(),
+			}
+			pending = append(pending, devFault)
+			hwlog.RunLog.Infof("add npu fault to dp cache, devFaultInfo: %v, hex code: %v",
+				devFault, strconv.FormatInt(devFault.EventID, common.Hex))
+		}
+	}
+
+	faultCacheLock.Lock()
+	defer faultCacheLock.Unlock()
+	hwlog.RunLog.Infof("update cache fault data finished, pre fault cnt: %d, latest fault count: %d",
+		len(devFaultCache), len(pending))
+	devFaultCache = pending
+}
+
+// injectDevFaultToDp hands every cached fault whose raise time has been reached
+// to common.SaveDevFaultInfo and keeps the remaining faults in the cache.
+func (hdm *HwDevManager) injectDevFaultToDp() {
+	faultCacheLock.Lock()
+	defer faultCacheLock.Unlock()
+
+	nowTime := time.Now().UnixMilli()
+	notDue := make([]npuCommon.DevFaultInfo, 0, len(devFaultCache))
+	for _, devFault := range devFaultCache {
+		if nowTime >= devFault.AlarmRaisedTime {
+			common.SaveDevFaultInfo(devFault)
+			continue
+		}
+		notDue = append(notDue, devFault)
+	}
+	devFaultCache = notDue
+}
+
+// updateConfigMap stores config as JSON under FaultEventFileKey in configMap and
+// pushes the configmap to the API server. Failures are logged, not returned.
+func (hdm *HwDevManager) updateConfigMap(config *FaultDebugConfig, configMap *v1.ConfigMap) {
+	configBytes, err := json.Marshal(config)
+	if err != nil {
+		hwlog.RunLog.Errorf("marshal FaultDebugConfig fail, data: %v reason: %v", config, err)
+		return
+	}
+	if configMap.Data == nil {
+		// writing to a nil map panics
+		configMap.Data = make(map[string]string, 1)
+	}
+	configMap.Data[FaultEventFileKey] = string(configBytes)
+	if _, err = hdm.manager.GetKubeClient().UpdateConfigMap(configMap); err != nil {
+		hwlog.RunLog.Errorf("update '%s' configmap fail, reason: %v", FaultEventCMName, err)
+	}
+}
+
+// updateFaultInjectFile overwrites FaultEventFileAbsPath with config encoded as
+// JSON, creating the file when it does not exist. The error is returned to the
+// caller, which is responsible for logging it.
+func (hdm *HwDevManager) updateFaultInjectFile(config *FaultDebugConfig) error {
+	configBytes, err := json.Marshal(config)
+	if err != nil {
+		return fmt.Errorf("marshal FaultDebugConfig fail, data: %v err: %w", config, err)
+	}
+	// os.WriteFile opens with O_WRONLY|O_CREATE|O_TRUNC like the previous manual
+	// sequence and additionally reports an error from closing the file.
+	if err = os.WriteFile(FaultEventFileAbsPath, configBytes, os.ModePerm); err != nil {
+		return fmt.Errorf("write fault file failed, reason: %w", err)
+	}
+	return nil
+}
+
+// parseFaultDebugConfigJson decodes the FaultEventFileKey entry of configMap.
+// It returns an error when configMap is nil, the key is missing or the value is
+// not valid JSON.
+func parseFaultDebugConfigJson(configMap *v1.ConfigMap) (*FaultDebugConfig, error) {
+	if configMap == nil {
+		return nil, errors.New("configmap is nil")
+	}
+	jsonStr, ok := configMap.Data[FaultEventFileKey]
+	if !ok {
+		return nil, fmt.Errorf("cannot find data '%s' in CM", FaultEventFileKey)
+	}
+	return convertByteToFaultDebugConfig([]byte(jsonStr))
+}
+
+// readFaultDebugFileJson loads and decodes FaultEventFileAbsPath.
+func readFaultDebugFileJson() (*FaultDebugConfig, error) {
+	faultCodeBytes, err := utils.LoadFile(FaultEventFileAbsPath)
+	if err != nil {
+		return nil, fmt.Errorf("load fault event json file failed, path: %v, reason: %w", FaultEventFileAbsPath, err)
+	}
+	if len(faultCodeBytes) == 0 {
+		return nil, errors.New("the file does not exist or for other reasons, the read data is empty")
+	}
+	return convertByteToFaultDebugConfig(faultCodeBytes)
+}
+
+// convertByteToFaultDebugConfig decodes bytes as a FaultDebugConfig. PollInterval
+// defaults to FaultEventCMPollSecInterval when the JSON does not set it.
+func convertByteToFaultDebugConfig(bytes []byte) (*FaultDebugConfig, error) {
+	configInfo := &FaultDebugConfig{
+		PollInterval: FaultEventCMPollSecInterval,
+	}
+	if err := json.Unmarshal(bytes, configInfo); err != nil {
+		return nil, fmt.Errorf("cannot unmarshal json data, data: %s, reason: %w", string(bytes), err)
+	}
+	return configInfo, nil
+}
+
+// convertFaultCodeHexToInt converts a hex fault code such as "0x80E21007" to its
+// integer value. Surrounding whitespace and an optional "0x"/"0X" prefix are
+// ignored. It returns -1 and an error when the conversion yields no value.
+func convertFaultCodeHexToInt(hexStr string) (int64, error) {
+	digits := strings.TrimSpace(hexStr)
+	if len(digits) >= 2 && digits[0] == '0' && (digits[1] == 'x' || digits[1] == 'X') {
+		digits = digits[2:]
+	}
+	codes := common.StringTool.HexStringToInt([]string{digits})
+	if len(codes) == 0 {
+		return -1, fmt.Errorf("convert fault code hex string '%s' to int failed", hexStr)
+	}
+	return codes[0], nil
+}
diff --git a/component/ascend-device-plugin/pkg/server/fault_writer.py
b/component/ascend-device-plugin/pkg/server/fault_writer.py new file mode 100644 index 0000000000000000000000000000000000000000..fc5592f623b192b59bd06115b86597d84342d52e --- /dev/null +++ b/component/ascend-device-plugin/pkg/server/fault_writer.py @@ -0,0 +1,55 @@ +import os +import json + +class FaultInfo: + def __init__(self, EventID=None, LogicID=None, Severity=None, Assertion=None, TimeOffset=None): + self.EventID = EventID + self.LogicID = LogicID + self.Severity = Severity + self.Assertion = Assertion + self.TimeOffset = TimeOffset + def to_dict(self): + return { + 'EventID': self.EventID, + 'LogicID': self.LogicID, + 'Severity': self.Severity, + 'Assertion': self.Assertion, + 'TimeOffset': self.TimeOffset + } + +class FaultDebugConfig: + def __init__(self, Node=None, PollInterval=None, ReInject=None, Faults=None): + self.Node = Node + self.PollInterval = PollInterval + self.ReInject = ReInject + self.Faults = Faults + +def create_and_write_json_file(): + file_path = "/user/inject/fault/npuFaultFile.json" + Faults=[ + FaultInfo( + EventID="0x80E21007", + LogicID=1, + Severity=0, + Assertion=1, + TimeOffset=[0, 6] + ), + FaultInfo( + EventID="0x80E21007", + LogicID=1, + Severity=0, + Assertion=0, + TimeOffset=[12] + ) + ] + + json_data = FaultDebugConfig( + Node="XXX", + PollInterval=1, + ReInject=1, + Faults=[fault.to_dict() for fault in Faults] + ) + with open(file_path, 'w') as f: + json.dump(json_data.__dict__, f, indent=4) + +# create_and_write_json_file() \ No newline at end of file diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index 2fcc20528a7a3835f36609f3132c778a3fbbd945..ac0d7240d9a9898e02484c6605fd69c88ecf6af9 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -378,6 +378,9 @@ func (hdm *HwDevManager) ListenDevice(ctx context.Context) { hdm.separateNPUIDFromDeviceInfoIntoCache() go 
hdm.pollFaultCodeCM(ctx) go hdm.Serve(ctx) + + hdm.constructNpuFaultByCm(ctx) + if common.ParamOption.CheckCachedPods { go hdm.manager.GetKubeClient().PodInformerInspector(ctx) }