diff --git a/component/ascend-device-plugin/main.go b/component/ascend-device-plugin/main.go index 84daea3f4197b7cb4a92715227e50b361e35ad6d..1f190ada566c98a97507e7d7e8622c21ff38f295 100644 --- a/component/ascend-device-plugin/main.go +++ b/component/ascend-device-plugin/main.go @@ -33,7 +33,7 @@ const ( defaultLogPath = "/var/log/mindx-dl/devicePlugin/devicePlugin.log" // defaultListWatchPeriod is the default listening device state's period - defaultListWatchPeriod = 5 + defaultListWatchPeriod = 60 // maxListWatchPeriod is the max listening device state's period maxListWatchPeriod = 60 @@ -59,7 +59,7 @@ var ( edgeLogFile = flag.String("edgeLogFile", "/var/alog/AtlasEdge_log/devicePlugin.log", "Log file path in edge scene") listWatchPeriod = flag.Int("listWatchPeriod", defaultListWatchPeriod, - "Listen and watch device state's period, unit second, range [3, 60]") + "Listen and watch device state's period, unit second, range [3, 1800]") autoStowing = flag.Bool("autoStowing", true, "Whether to automatically stow the fixed device") logLevel = flag.Int("logLevel", 0, "Log level, -1-debug, 0-info, 1-warning, 2-error, 3-critical(default 0)") diff --git a/component/ascend-device-plugin/pkg/common/common.go b/component/ascend-device-plugin/pkg/common/common.go index 2af5770f5929b7969dda546ae2a31cd174260f1b..02e02760ce9a5ee5a3f9ca66700265ff3f5871e1 100644 --- a/component/ascend-device-plugin/pkg/common/common.go +++ b/component/ascend-device-plugin/pkg/common/common.go @@ -52,6 +52,8 @@ var ( "ascend310": regexp.MustCompile(`^Ascend310-\d+`), "ascend310P": regexp.MustCompile(`^Ascend310P-\d+`), } + // UpdateTriggerChan is a channel to trigger device info update + UpdateTriggerChan = make(chan struct{}, 1) ) // ServerInfo used for pass parameters diff --git a/component/ascend-device-plugin/pkg/common/fault_code.go b/component/ascend-device-plugin/pkg/common/fault_code.go index d2a8463d1313a422109288be43ec70fef1d90aae..5eb8aa585228dce9a154bf53a6626298bd8c223f 100644 --- 
a/component/ascend-device-plugin/pkg/common/fault_code.go
+++ b/component/ascend-device-plugin/pkg/common/fault_code.go
@@ -1184,6 +1184,16 @@ func SaveDevFaultInfo(devFaultInfo common.DevFaultInfo) {
 	devFaultInfoMapLock.Lock()
 	devFaultInfoMap[devFaultInfo.LogicID] = append(devFaultInfoMap[devFaultInfo.LogicID], devFaultInfo)
 	devFaultInfoMapLock.Unlock()
+
+	// Request a device info update. The send never blocks: the channel buffers a single trigger, so a
+	// burst of faults collapses into one pending update. There is deliberately no time based throttle
+	// here, a fault saved right after the previous trigger was consumed would otherwise stay unprocessed.
+	select {
+	case UpdateTriggerChan <- struct{}{}:
+		hwlog.RunLog.Debug("Triggered fault processing")
+	default:
+		hwlog.RunLog.Debug("Fault processing is already in progress")
+	}
 }
 
 // GetAndCleanFaultInfo get device fault info and clean cache
diff --git a/component/ascend-device-plugin/pkg/kubeclient/pod_informer.go b/component/ascend-device-plugin/pkg/kubeclient/pod_informer.go
new file mode 100644
index 0000000000000000000000000000000000000000..049be2802a253a1383ebd7772ad0610131776931
--- /dev/null
+++ b/component/ascend-device-plugin/pkg/kubeclient/pod_informer.go
@@ -0,0 +1,108 @@
+package kubeclient
+
+import (
+	"context"
+	"time"
+
+	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/fields"
+	"k8s.io/client-go/tools/cache"
+
+	"Ascend-device-plugin/pkg/common"
+	"ascend-common/common-utils/hwlog"
+)
+
+const (
+	// podInformerResyncPeriod is the resync period handed to the pod informer
+	podInformerResyncPeriod = time.Minute
+	// podCacheSyncTimeout is the longest time StartPodCreationWatcher waits for the first pod list
+	podCacheSyncTimeout = 30 * time.Second
+)
+
+// StartPodCreationWatcher starts an informer on the pods of this node and requests a device info update
+// whenever one of them is added, changed or deleted. It returns once the informer cache is synced or after
+// podCacheSyncTimeout. The informer has no stop signal and keeps running for the lifetime of the process.
+func (ki *ClientK8s) StartPodCreationWatcher() {
+	// Create Pod ListWatch object, only watching Pods on this node
+	listWatch := cache.NewListWatchFromClient(ki.Clientset.CoreV1().RESTClient(), "pods", v1.NamespaceAll,
+		fields.OneTermEqualSelector("spec.nodeName", ki.NodeName))
+
+	_, controller := cache.NewInformer(listWatch, &v1.Pod{}, podInformerResyncPeriod,
+		cache.ResourceEventHandlerFuncs{
+			AddFunc:    ki.handlePodAdd,
+			UpdateFunc: ki.handlePodUpdate,
+			DeleteFunc: ki.handlePodDelete,
+		})
+
+	// the informer gets a channel that is never closed because there is nothing to stop it
+	go controller.Run(make(chan struct{}))
+
+	// bound the wait, without a deadline an unreachable API server would block the caller forever
+	syncCtx, cancel := context.WithTimeout(context.Background(), podCacheSyncTimeout)
+	defer cancel()
+	if !cache.WaitForCacheSync(syncCtx.Done(), controller.HasSynced) {
+		hwlog.RunLog.Error("Timed out waiting for caches to sync")
+		return
+	}
+
+	hwlog.RunLog.Info("Pod creation watcher started")
+}
+
+// podFromEvent returns the pod carried by an informer event. A delete event can deliver a
+// cache.DeletedFinalStateUnknown tombstone instead of the pod, which is unwrapped first.
+func podFromEvent(obj interface{}) (*v1.Pod, bool) {
+	if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok {
+		obj = tombstone.Obj
+	}
+	pod, ok := obj.(*v1.Pod)
+	return pod, ok && pod != nil
+}
+
+// handlePodAdd requests a device info update for a pod reported as added on this node
+func (ki *ClientK8s) handlePodAdd(obj interface{}) {
+	pod, ok := podFromEvent(obj)
+	if !ok {
+		hwlog.RunLog.Errorf("unexpected object %T in pod add event", obj)
+		return
+	}
+	hwlog.RunLog.Infof("New Pod created on node %s: %s", ki.NodeName, pod.Name)
+	triggerUpdate()
+}
+
+// handlePodUpdate requests a device info update for a pod of this node whose object changed
+func (ki *ClientK8s) handlePodUpdate(oldObj, newObj interface{}) {
+	oldPod, oldOk := podFromEvent(oldObj)
+	newPod, newOk := podFromEvent(newObj)
+	if !oldOk || !newOk {
+		hwlog.RunLog.Errorf("unexpected objects %T and %T in pod update event", oldObj, newObj)
+		return
+	}
+	// a resync replays every cached pod as an update with an unchanged resource version, nothing to do then
+	if oldPod.ResourceVersion == newPod.ResourceVersion {
+		return
+	}
+	hwlog.RunLog.Infof("Pod updated on node %s: %s (old phase: %s, new phase: %s)",
+		ki.NodeName, newPod.Name, oldPod.Status.Phase, newPod.Status.Phase)
+	triggerUpdate()
+}
+
+// handlePodDelete requests a device info update for a pod removed from this node
+func (ki *ClientK8s) handlePodDelete(obj interface{}) {
+	pod, ok := podFromEvent(obj)
+	if !ok {
+		hwlog.RunLog.Errorf("unexpected object %T in pod delete event", obj)
+		return
+	}
+	hwlog.RunLog.Infof("Pod deleted from node %s: %s", ki.NodeName, pod.Name)
+	triggerUpdate()
+}
+
+// triggerUpdate requests a device info update without blocking, a trigger that is already pending is reused
+func triggerUpdate() {
+	select {
+	case common.UpdateTriggerChan <- struct{}{}:
+		hwlog.RunLog.Debug("Triggered fault processing")
+	default:
+		hwlog.RunLog.Debug("Fault processing is already in progress")
+	}
+}
diff --git a/component/ascend-device-plugin/pkg/server/fault_constructor.go b/component/ascend-device-plugin/pkg/server/fault_constructor.go
new file mode 100644
index 0000000000000000000000000000000000000000..df03dcc7eaadd54c3337e2007956cbb6720012e3
--- /dev/null
+++ b/component/ascend-device-plugin/pkg/server/fault_constructor.go
@@ -0,0 +1,368 @@
+/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+// Package server holds the implementation of registration to kubelet, k8s pod resource interface.
+package server
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"k8s.io/api/core/v1"
+
+	"Ascend-device-plugin/pkg/common"
+	"Ascend-device-plugin/pkg/kubeclient"
+	"ascend-common/common-utils/hwlog"
+	"ascend-common/common-utils/utils"
+	npuCommon "ascend-common/devmanager/common"
+)
+
+const (
+	// FaultEventCMName name of npu fault event configmap
+	FaultEventCMName = "mindx-dl-npu-fault-event"
+	// FaultEventCMNameSpace namespace of npu fault event configmap
+	FaultEventCMNameSpace = "kube-system"
+	// FaultEventFileKey key of loading npu faults
+	FaultEventFileKey = "npuFaultCM.json"
+	// FaultEventCMPollSecInterval interval of polling npu fault event configmap, unit:second
+	FaultEventCMPollSecInterval = 1
+	// FaultCacheSaveToDPMillInterval interval of saving cached npu fault to DP, unit:millisecond
+	FaultCacheSaveToDPMillInterval = 500
+	// ReInjectAllFaultsDefaultValue default value of re-injecting all faults in configmap
+	ReInjectAllFaultsDefaultValue = 1
+	// FaultEventFileAbsPath file absolute path of injecting fault event with file
+	FaultEventFileAbsPath = "/user/inject/fault/npuFaultFile.json"
+)
+
+var (
+	// faultCacheLock guards devFaultCache, which is used by several goroutines
+	faultCacheLock sync.Mutex
+	// devFaultCache holds injected faults until their raise time is reached
+	devFaultCache []npuCommon.DevFaultInfo
+)
+
+// FaultInfo is one fault entry of the injection config
+type FaultInfo struct {
+	EventID   string
+	LogicID   int32
+	Severity  int8
+	Assertion int8
+	// TimeOffset holds delays in seconds from the injection time, one fault is cached per entry
+	TimeOffset []int64
+}
+
+// FaultDebugConfig is the json document stored in the configmap or in the local file
+type FaultDebugConfig struct {
+	Node string // When injecting faults through local files, this field does not work
+	// PollInterval is the polling period in seconds, non-positive values fall back to the default
+	PollInterval int64
+	// ReInject set to ReInjectAllFaultsDefaultValue requests an injection and is written back as 0
+	ReInject int
+	Faults   []FaultInfo
+}
+
+// constructNpuFaultByCm starts the goroutines that inject debug faults from the local file and the configmap
+func (hdm *HwDevManager) constructNpuFaultByCm(ctx context.Context) {
+	hwlog.RunLog.Infof("start construct npu fault from cm or file")
+	if err := hdm.createFaultFile(); err != nil {
+		hwlog.RunLog.Errorf("create fault file fail, err: %v", err)
+	} else {
+		go hdm.loadFaultEventFromFile(ctx)
+	}
+	go hdm.pollFaultEventFromCm(ctx)
+	go hdm.saveCachedFaultToDP(ctx)
+}
+
+// createFaultFile creates the directory of the fault file if needed and writes a default config into the file
+func (hdm *HwDevManager) createFaultFile() error {
+	dir := filepath.Dir(FaultEventFileAbsPath)
+	if !utils.IsExist(dir) {
+		if err := os.MkdirAll(dir, os.ModePerm); err != nil {
+			return fmt.Errorf("mkdir fail, err: %v", err)
+		}
+	}
+	defaultConfig := &FaultDebugConfig{
+		PollInterval: FaultEventCMPollSecInterval,
+		ReInject:     0,
+	}
+	return hdm.updateFaultInjectFile(defaultConfig)
+}
+
+// runFaultLoop calls job at once and then again after each delay it returns, until ctx is done.
+// The delay is a timer inside the select, so a long poll interval never postpones the shutdown.
+func runFaultLoop(ctx context.Context, stopMsg string, job func() time.Duration) {
+	timer := time.NewTimer(0)
+	defer timer.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			hwlog.RunLog.Info("stop signal channel closed")
+			hwlog.RunLog.Info(stopMsg)
+			return
+		case <-timer.C:
+			timer.Reset(job())
+		}
+	}
+}
+
+// pollDelay returns the poll interval of config, or the default one when config is nil or has no valid interval
+func pollDelay(config *FaultDebugConfig) time.Duration {
+	if config == nil || config.PollInterval <= 0 {
+		return FaultEventCMPollSecInterval * time.Second
+	}
+	return time.Duration(config.PollInterval) * time.Second
+}
+
+// loadFaultEventFromFile polls the local fault file until ctx is done
+func (hdm *HwDevManager) loadFaultEventFromFile(ctx context.Context) {
+	runFaultLoop(ctx, "load fault event from file stop", func() time.Duration {
+		return pollDelay(hdm.readAndInjectFaultFromFile())
+	})
+}
+
+// pollFaultEventFromCm polls the fault event configmap until ctx is done
+func (hdm *HwDevManager) pollFaultEventFromCm(ctx context.Context) {
+	runFaultLoop(ctx, "poll fault event from cm stop", func() time.Duration {
+		return pollDelay(hdm.pollAndInjectFaultFromCm())
+	})
+}
+
+// saveCachedFaultToDP periodically hands the cached faults that are due to the device plugin until ctx is done
+func (hdm *HwDevManager) saveCachedFaultToDP(ctx context.Context) {
+	runFaultLoop(ctx, "save cached fault to dp stop", func() time.Duration {
+		hdm.injectDevFaultToDp()
+		return FaultCacheSaveToDPMillInterval * time.Millisecond
+	})
+}
+
+// readAndInjectFaultFromFile reads the fault file and, when ReInject is set, caches its faults and clears
+// the flag in the file. It returns nil when the file cannot be loaded.
+func (hdm *HwDevManager) readAndInjectFaultFromFile() *FaultDebugConfig {
+	config, err := readFaultDebugFileJson()
+	if err != nil {
+		hwlog.RunLog.ErrorfWithLimit(FaultEventFileAbsPath, 1, "cannot load fault from '%s' file, reason: %v",
+			FaultEventFileAbsPath, err)
+		return nil
+	}
+	if config.ReInject != ReInjectAllFaultsDefaultValue {
+		return config
+	}
+
+	hwlog.RunLog.Infof("ReInject value is '%d' in file, start saving to DP", config.ReInject)
+	// reset devFaultCache
+	hdm.updateDevFaultCache(config.Faults)
+	config.ReInject = 0
+
+	// if the flag is not cleared in the file, the same faults are injected again on the next poll
+	if err = hdm.updateFaultInjectFile(config); err != nil {
+		hwlog.RunLog.Errorf("reset ReInject in fault file fail, err: %v", err)
+	}
+	return config
+}
+
+// pollAndInjectFaultFromCm reads the fault event configmap and, when ReInject is set and the configured node
+// is this node, caches its faults and clears the flag in the configmap. It returns nil when no config is read.
+func (hdm *HwDevManager) pollAndInjectFaultFromCm() *FaultDebugConfig {
+	client := hdm.manager.GetKubeClient()
+	if client == nil {
+		hwlog.RunLog.ErrorfWithLimit(FaultEventCMName, 2, "kube client is nil, cannot get '%s' configmap",
+			FaultEventCMName)
+		return nil
+	}
+	configMap, err := client.GetConfigMap(FaultEventCMName, FaultEventCMNameSpace)
+	if err != nil {
+		hwlog.RunLog.ErrorfWithLimit(FaultEventCMName, 2, "cannot find '%s' configmap, reason: %v",
+			FaultEventCMName, err)
+		return nil
+	}
+
+	config, err := parseFaultDebugConfigJson(configMap)
+	if err != nil || config == nil {
+		hwlog.RunLog.Error(err)
+		return nil
+	}
+
+	if config.ReInject != ReInjectAllFaultsDefaultValue {
+		return config
+	}
+	hwlog.RunLog.Infof("ReInject value is '%d' in CM, start saving to DP", config.ReInject)
+
+	node, err := kubeclient.GetNodeNameFromEnv()
+	if err != nil || node == "" {
+		hwlog.RunLog.Errorf("cannot get node from env, reason: %v", err)
+		return config
+	}
+
+	if node != config.Node {
+		hwlog.RunLog.Infof("dont have node '%s' in configmap, target nodes: %s", node, config.Node)
+		return config
+	}
+
+	// reset devFaultCache
+	hdm.updateDevFaultCache(config.Faults)
+	config.ReInject = 0
+
+	hdm.updateConfigMap(config, configMap)
+
+	return config
+}
+
+// updateDevFaultCache replaces the cached faults with the ones described by faultInfos.
+// Every TimeOffset entry of a fault yields one cached fault raised at the current time plus that offset.
+func (hdm *HwDevManager) updateDevFaultCache(faultInfos []FaultInfo) {
+	tempDevFaultCache := make([]npuCommon.DevFaultInfo, 0, len(faultInfos))
+	now := time.Now()
+
+	// save npu device fault
+	for _, fault := range faultInfos {
+		eventID, err := convertFaultCodeHexToInt(fault.EventID)
+		if err != nil {
+			hwlog.RunLog.Errorf("get fault code fail, reason: %v", err)
+			continue
+		}
+		offsets := fault.TimeOffset
+		if len(offsets) == 0 {
+			// without a configured offset the fault is raised immediately
+			offsets = []int64{0}
+		}
+		for _, offset := range offsets {
+			devFault := npuCommon.DevFaultInfo{
+				EventID:         eventID,
+				LogicID:         fault.LogicID,
+				Severity:        fault.Severity,
+				Assertion:       fault.Assertion,
+				AlarmRaisedTime: now.Add(time.Duration(offset) * time.Second).UnixMilli(),
+			}
+			tempDevFaultCache = append(tempDevFaultCache, devFault)
+			hwlog.RunLog.Infof("add npu fault to dp cache, devFaultInfo: %v, hex code: %v",
+				devFault, strconv.FormatInt(devFault.EventID, common.Hex))
+		}
+	}
+
+	faultCacheLock.Lock()
+	hwlog.RunLog.Infof("update cache fault data finished, pre fault cnt: %d, latest fault count: %d",
+		len(devFaultCache), len(tempDevFaultCache))
+	devFaultCache = tempDevFaultCache
+	faultCacheLock.Unlock()
+}
+
+// injectDevFaultToDp passes every cached fault whose raise time has been reached to common.SaveDevFaultInfo
+// and keeps the remaining ones in the cache
+func (hdm *HwDevManager) injectDevFaultToDp() {
+	faultCacheLock.Lock()
+	defer faultCacheLock.Unlock()
+
+	nowTime := time.Now().UnixMilli()
+	newDevFaultCache := make([]npuCommon.DevFaultInfo, 0)
+	for _, devFault := range devFaultCache {
+		if nowTime >= devFault.AlarmRaisedTime {
+			common.SaveDevFaultInfo(devFault)
+			continue
+		}
+		newDevFaultCache = append(newDevFaultCache, devFault)
+	}
+	devFaultCache = newDevFaultCache
+}
+
+// updateConfigMap stores config under FaultEventFileKey in configMap and updates the configmap in the cluster
+func (hdm *HwDevManager) updateConfigMap(config *FaultDebugConfig, configMap *v1.ConfigMap) {
+	client := hdm.manager.GetKubeClient()
+	if client == nil {
+		hwlog.RunLog.Errorf("kube client is nil, cannot update '%s' configmap", FaultEventCMName)
+		return
+	}
+	configBytes, err := json.Marshal(*config)
+	if err != nil {
+		hwlog.RunLog.Errorf("marshal FaultDebugConfig fail, data: %v reason: %v", config, err)
+		return
+	}
+	if configMap.Data == nil {
+		// assigning to a nil map panics
+		configMap.Data = make(map[string]string, 1)
+	}
+	configMap.Data[FaultEventFileKey] = string(configBytes)
+	if _, err = client.UpdateConfigMap(configMap); err != nil {
+		hwlog.RunLog.Errorf("update '%s' configmap fail, reason: %v", FaultEventCMName, err)
+	}
+}
+
+// updateFaultInjectFile overwrites the fault file with config serialized as json.
+// Errors are returned without being logged here, so that the caller reports them exactly once.
+func (hdm *HwDevManager) updateFaultInjectFile(config *FaultDebugConfig) error {
+	configBytes, err := json.Marshal(*config)
+	if err != nil {
+		return fmt.Errorf("marshal FaultDebugConfig fail, data: %v err: %w", config, err)
+	}
+	// NOTE(review): os.ModePerm requests a world-writable file, consider a stricter mode for this file
+	// os.WriteFile creates or truncates the file and also reports a failure to close it
+	if err = os.WriteFile(FaultEventFileAbsPath, configBytes, os.ModePerm); err != nil {
+		return fmt.Errorf("write fault file failed, reason: %w", err)
+	}
+	return nil
+}
+
+// parseFaultDebugConfigJson extracts the fault config stored under FaultEventFileKey in configMap
+func parseFaultDebugConfigJson(configMap *v1.ConfigMap) (*FaultDebugConfig, error) {
+	if configMap == nil {
+		return nil, errors.New("configmap is nil")
+	}
+	jsonStr, ok := configMap.Data[FaultEventFileKey]
+	if !ok {
+		return nil, fmt.Errorf("cannot find data '%s' in CM", FaultEventFileKey)
+	}
+	return convertByteToFaultDebugConfig([]byte(jsonStr))
+}
+
+// readFaultDebugFileJson loads the fault config from the local fault file
+func readFaultDebugFileJson() (*FaultDebugConfig, error) {
+	faultCodeBytes, err := utils.LoadFile(FaultEventFileAbsPath)
+	if err != nil {
+		return nil, fmt.Errorf("load fault event json file failed, path: %v, reason: %v", FaultEventFileAbsPath, err)
+	}
+	if faultCodeBytes == nil {
+		return nil, errors.New("the file does not exist or for other reasons, the read data is empty")
+	}
+	return convertByteToFaultDebugConfig(faultCodeBytes)
+}
+
+// convertByteToFaultDebugConfig unmarshals json data, PollInterval keeps its default when the data omits it
+func convertByteToFaultDebugConfig(bytes []byte) (*FaultDebugConfig, error) {
+	configInfo := &FaultDebugConfig{
+		PollInterval: FaultEventCMPollSecInterval,
+	}
+	if err := json.Unmarshal(bytes, configInfo); err != nil {
+		return nil, fmt.Errorf("cannot unmarshal json data, data: %s, reason: %v", string(bytes), err)
+	}
+	return configInfo, nil
+}
+
+// convertFaultCodeHexToInt converts a hex fault code string to its integer value.
+// Surrounding spaces and a leading "0x" or "0X" are removed before the conversion.
+func convertFaultCodeHexToInt(hexStr string) (int64, error) {
+	hexStr = strings.TrimSpace(hexStr)
+	hexStr = strings.TrimPrefix(strings.TrimPrefix(hexStr, "0x"), "0X")
+	codes := common.StringTool.HexStringToInt([]string{hexStr})
+	if len(codes) == 0 {
+		return -1, fmt.Errorf("convert fault code hex string '%s' to int failed", hexStr)
+	}
+	return codes[0], nil
+}
diff --git a/component/ascend-device-plugin/pkg/server/fault_writer.py
b/component/ascend-device-plugin/pkg/server/fault_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc5592f623b192b59bd06115b86597d84342d52e
--- /dev/null
+++ b/component/ascend-device-plugin/pkg/server/fault_writer.py
@@ -0,0 +1,76 @@
+"""Sample writer of the fault injection file that the device plugin polls.
+
+Adapt the faults in create_and_write_json_file and run this script on the node.
+"""
+import json
+import os
+
+# default target, the same path as FaultEventFileAbsPath in fault_constructor.go
+DEFAULT_FAULT_FILE = "/user/inject/fault/npuFaultFile.json"
+
+
+class FaultInfo:
+    """One fault to inject, the attribute names are the json keys of the fault file."""
+
+    def __init__(self, EventID=None, LogicID=None, Severity=None, Assertion=None, TimeOffset=None):
+        self.EventID = EventID
+        self.LogicID = LogicID
+        self.Severity = Severity
+        self.Assertion = Assertion
+        self.TimeOffset = TimeOffset
+
+    def to_dict(self):
+        """Return the fault as a dict that json can serialize."""
+        return {
+            'EventID': self.EventID,
+            'LogicID': self.LogicID,
+            'Severity': self.Severity,
+            'Assertion': self.Assertion,
+            'TimeOffset': self.TimeOffset
+        }
+
+
+class FaultDebugConfig:
+    """The fault file document, the attribute names are its json keys."""
+
+    def __init__(self, Node=None, PollInterval=None, ReInject=None, Faults=None):
+        self.Node = Node
+        self.PollInterval = PollInterval
+        self.ReInject = ReInject
+        self.Faults = Faults
+
+
+def create_and_write_json_file(file_path=DEFAULT_FAULT_FILE):
+    """Write the sample fault config to file_path, creating the parent directory if needed."""
+    faults = [
+        FaultInfo(
+            EventID="0x80E21007",
+            LogicID=1,
+            Severity=0,
+            Assertion=1,
+            TimeOffset=[0, 6]
+        ),
+        FaultInfo(
+            EventID="0x80E21007",
+            LogicID=1,
+            Severity=0,
+            Assertion=0,
+            TimeOffset=[12]
+        )
+    ]
+
+    json_data = FaultDebugConfig(
+        Node="XXX",
+        PollInterval=1,
+        ReInject=1,
+        Faults=[fault.to_dict() for fault in faults]
+    )
+    directory = os.path.dirname(file_path)
+    if directory:
+        os.makedirs(directory, exist_ok=True)
+    with open(file_path, 'w') as f:
+        json.dump(json_data.__dict__, f, indent=4)
+
+
+if __name__ == "__main__":
+    create_and_write_json_file()
diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go
index e07eccd59720afcaefc40b43700484b4f6e90787..a9f0847210a1b5786f152df0985e3f732a6cfc38 100644
--- a/component/ascend-device-plugin/pkg/server/manager.go
+++ b/component/ascend-device-plugin/pkg/server/manager.go
@@ -28,7 +28,7 @@ import (
 
 	"github.com/containerd/containerd"
 	"github.com/fsnotify/fsnotify"
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
 
@@ -398,6 +398,31 @@ func (hdm *HwDevManager) separateNPUIDFromDeviceInfoIntoCache() {
 	}
 }
 
+// handleDeviceInfoUpdate refreshes all device info while holding the device info lock and reports the result.
+// initTime is passed through to notifyToK8s.
+func (hdm *HwDevManager) handleDeviceInfoUpdate(initTime *time.Time) {
+	common.LockAllDeviceInfo()
+	defer common.UnlockAllDeviceInfo()
+
+	if err := hdm.updateAllInfo(); err != nil {
+		hwlog.RunLog.Error(err)
+		return
+	}
+
+	// complete the fault codes that cannot be reported by the event subscribe interface
+	hdm.mendSubscribeFaultEvents()
+	hdm.updateDeviceUsedInfo(hdm.groupDevice)
+	hdm.notifyToK8s(initTime)
+
+	// if node annotation has reset fail devices but all devices are healthy, clear node annotation
+	hdm.checkNodeResetInfo()
+	hdm.useVolcanoNotify()
+	hdm.chipHotReset()
+	common.DelOnceRecoverFault(hdm.groupDevice)
+	common.DelOnceFrequencyFault()
+	common.Synchronize = true
+}
+
 // ListenDevice ListenDevice coroutine
 func (hdm *HwDevManager) ListenDevice(ctx context.Context) {
 	hwlog.RunLog.Info("starting the listen device")
@@ -411,10 +436,27 @@ func (hdm *HwDevManager) ListenDevice(ctx context.Context) {
 	hdm.separateNPUIDFromDeviceInfoIntoCache()
 	go hdm.pollFaultCodeCM(ctx)
 	go hdm.Serve(ctx)
+
+	// NOTE(review): this starts the debug fault injection unconditionally, consider guarding it with an option
+	hdm.constructNpuFaultByCm(ctx)
+
 	if common.ParamOption.CheckCachedPods {
 		go hdm.manager.GetKubeClient().PodInformerInspector(ctx)
 	}
+
+	if client := hdm.manager.GetKubeClient(); client != nil {
+		// started asynchronously, waiting for the pod cache must not delay the device listening loop
+		go client.StartPodCreationWatcher()
+	} else {
+		hwlog.RunLog.Errorf("kube client is nil")
+	}
+
 	initTime := time.Now()
+	ticker := time.NewTicker(time.Duration(common.ParamOption.ListAndWatchPeriod) * time.Second)
+	defer ticker.Stop()
+	triggerTicker := time.NewTicker(time.Second)
+	defer triggerTicker.Stop()
+
 	for {
 		select {
 		case _, ok := <-ctx.Done():
@@ -423,33 +465,27 @@ func (hdm *HwDevManager) ListenDevice(ctx context.Context) {
 			}
 			hwlog.RunLog.Info("listen device stop")
 			return
-		default:
-			time.Sleep(time.Duration(common.ParamOption.ListAndWatchPeriod) * time.Second)
-			common.LockAllDeviceInfo()
-			if err := hdm.updateAllInfo(); err != nil {
-				hwlog.RunLog.Error(err)
-				common.UnlockAllDeviceInfo()
-				continue
-			}
-			// complete the fault codes that cannot be reported by the event subscribe interface
-			hdm.mendSubscribeFaultEvents()
-
-			hdm.updateDeviceUsedInfo(hdm.groupDevice)
-			hdm.notifyToK8s(&initTime)
-
-			// if node annotation has reset fail devices but all devices are healthy, clear node annotation
-			hdm.checkNodeResetInfo()
-
-			hdm.useVolcanoNotify()
-			hdm.chipHotReset()
-			common.DelOnceRecoverFault(hdm.groupDevice)
-			common.DelOnceFrequencyFault()
-			common.UnlockAllDeviceInfo()
-			common.Synchronize = true
+		case <-triggerTicker.C: // fires once per second
+			hdm.parseTriggers(&initTime)
+		case <-ticker.C:
+			hwlog.RunLog.Debug("Periodic device info update")
+			hdm.handleDeviceInfoUpdate(&initTime)
 		}
 	}
 }
 
+// parseTriggers runs one device info update if an update trigger is pending, otherwise it returns at once.
+// initTime has to point to the variable owned by ListenDevice, like in the periodic update.
+func (hdm *HwDevManager) parseTriggers(initTime *time.Time) {
+	select {
+	case <-common.UpdateTriggerChan:
+		hwlog.RunLog.Info("Received fault trigger, processing device info update")
+		hdm.handleDeviceInfoUpdate(initTime)
+	default:
+		hwlog.RunLog.Debug("No fault trigger, skipping device info update")
+	}
+}
+
 func deepCopyGroupDevice(groupDevice map[string][]*common.NpuDevice) map[string][]*common.NpuDevice {
 	newGroupDevice := make(map[string][]*common.NpuDevice, len(groupDevice))
 	for deviceType, npuDevices := range groupDevice {
diff --git a/component/noded/main.go b/component/noded/main.go
index 9a635f8f33680976634fc7ba4f6dfefd910fa48a..73fcb8de7b4eb7583f30dbbcefd17c6fec70f834 100644
--- a/component/noded/main.go
+++ b/component/noded/main.go
@@ -35,7 +35,7 @@ import (
 const (
 	defaultLogFile = "/var/log/mindx-dl/noded/noded.log"
 	// defaultHeatBeatInterval is the default report interval
-	defaultReportInterval = 5
+	defaultReportInterval = 1
 	// defaultMonitorPeriod is the default plugin monitor period
 	defaultMonitorPeriod = 60
 	// maxReportInterval is the max report interval
diff --git
a/component/noded/pkg/reporter/cmreporter/configmap_reporter.go b/component/noded/pkg/reporter/cmreporter/configmap_reporter.go index 0daa64423c67daa555594d6a835bd9a42045e266..ea9340d628879a347ec2a03fafa994ed32a85f7a 100644 --- a/component/noded/pkg/reporter/cmreporter/configmap_reporter.go +++ b/component/noded/pkg/reporter/cmreporter/configmap_reporter.go @@ -19,7 +19,7 @@ import ( "encoding/json" "time" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "ascend-common/api"