From 5e6abbf1d7bbc77ff716fdf4d931b80c9cda90fb Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Wed, 23 Apr 2025 10:45:55 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E8=BD=AE=E8=AF=A2ip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/server/manager.go | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index 8120afa3c..a99b92d6e 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -44,6 +44,8 @@ import ( ) var resourceVersion = "" +var lastUpdateIpTime time.Time +var currentBaseDeviceInfo = "" const memoryRadix = 1024 @@ -421,6 +423,28 @@ func (hdm *HwDevManager) handleDeviceInfoUpdate(initTime *time.Time) { common.Synchronize = true } +func (hdm *HwDevManager) updateNodeAnnotation() { + if time.Now().Sub(lastUpdateIpTime) > time.Minute { + mashaledNpuInfo, err := json.Marshal(hdm.getNpuBaseInfo()) + if err != nil { + hwlog.RunLog.Errorf("failed to marshal info, err: %v", err) + return + } + if string(mashaledNpuInfo) == currentBaseDeviceInfo { + hwlog.RunLog.Infof("there is no changes will not do anything") + return + } + err = hdm.updateNode() + if err != nil { + hwlog.RunLog.Errorf("failed to update node annotation, err: %v", err) + return + } + hwlog.RunLog.Infof("node annotation has been updated") + lastUpdateIpTime = time.Now() + currentBaseDeviceInfo = string(mashaledNpuInfo) + } +} + // ListenDevice ListenDevice coroutine func (hdm *HwDevManager) ListenDevice(ctx context.Context) { hwlog.RunLog.Info("starting the listen device") -- Gitee From ff490968dfcb84eceb5681629f8411e6761959b8 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Wed, 23 Apr 2025 10:50:48 +0800 Subject: [PATCH 2/5] =?UTF-8?q?=E8=BD=AE=E8=AF=A2ip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/ascend-device-plugin/pkg/server/manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index a99b92d6e..5b2d41e0a 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -408,7 +408,7 @@ func (hdm *HwDevManager) handleDeviceInfoUpdate(initTime *time.Time) { hwlog.RunLog.Error(err) return } - + hdm.updateNodeAnnotation() // complete the fault codes that cannot be reported by the event subscribe interface hdm.mendSubscribeFaultEvents() hdm.updateDeviceUsedInfo(hdm.groupDevice) -- Gitee From 85a945aa248c5904cc7f4d046e19f74c6d740db4 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Thu, 24 Apr 2025 09:40:50 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E8=BD=AE=E8=AF=A2ip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/common/constants.go | 7 ++++++ .../pkg/device/ascend_used_chips.go | 4 ++-- .../pkg/device/ascendcommon.go | 22 ++++++++++++++++++- .../pkg/server/manager.go | 4 +++- 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/component/ascend-device-plugin/pkg/common/constants.go b/component/ascend-device-plugin/pkg/common/constants.go index 8447a7f91..2dca2c39f 100644 --- a/component/ascend-device-plugin/pkg/common/constants.go +++ b/component/ascend-device-plugin/pkg/common/constants.go @@ -891,3 +891,10 @@ const ( // MaxPodEventRetryTimes max try time for pod add event while cache none MaxPodEventRetryTimes = 4 ) + +const ( + // GetIpRetryTimes the upper limit of get device ip + GetIpRetryTimes = 18 + // GetIpRetryInterval the interval between each retry in seconds + GetIpRetryInterval = 10 +) diff --git a/component/ascend-device-plugin/pkg/device/ascend_used_chips.go b/component/ascend-device-plugin/pkg/device/ascend_used_chips.go index fc2d43a59..4ae661ac8 100644 --- a/component/ascend-device-plugin/pkg/device/ascend_used_chips.go +++ b/component/ascend-device-plugin/pkg/device/ascend_used_chips.go @@ -61,12 +61,12 @@ func (tool *AscendTools) getChipsUsedByProcess() sets.String { } if chipInfo.ProcNum != 0 { hwlog.RunLog.Debugf("the card logicID:[%d] is used, chipInfo: %#v", logicID, chipInfo) - davinCidev, err := tool.getDavinCiDev(logicID) + phyID, err := tool.dmgr.GetPhysicIDFromLogicID(logicID) if err != nil { hwlog.RunLog.Errorf("get davinci dev by logicID:[%d] failed, err: %v", logicID, err) continue } - chipName := fmt.Sprintf("%s-%d", tool.name, davinCidev.PhyID) + chipName := fmt.Sprintf("%s-%d", tool.name, phyID) usedChips = append(usedChips, chipName) } } diff --git a/component/ascend-device-plugin/pkg/device/ascendcommon.go b/component/ascend-device-plugin/pkg/device/ascendcommon.go index 18dfd4b3a..a90643638 100644 --- a/component/ascend-device-plugin/pkg/device/ascendcommon.go +++ b/component/ascend-device-plugin/pkg/device/ascendcommon.go @@ -50,6 +50,10 @@ var ( lastCheckNodeLabel int64 useIpv4 = true re = regexp.MustCompile(`"fault_time":\d+,`) + // counter to record all card get ip times + counter = 0 + // FirstStartup to indicate whether it is the first time + FirstStartup = true ) const ( @@ -633,7 +637,23 @@ func (tool *AscendTools) getDavinCiDev(logicID int32) (common.DavinCiDev, error) if err != nil { return common.DavinCiDev{}, err } - ip, err := tool.getDeviceIP("", int(phyID)) + ip := "" + ip, err = tool.getDeviceIP("", int(phyID)) + devType := tool.dmgr.GetDevType() + needRetry := false + if devType == common.Ascend910 || devType == common.Ascend910B || devType == common.Ascend910A3 { + needRetry = true + } + // to avoid os system first startup, some card may not have ip on it + for counter < common.GetIpRetryTimes && err != nil && FirstStartup && needRetry { + time.Sleep(common.GetIpRetryInterval * time.Second) + ip, err = tool.getDeviceIP("", int(phyID)) + if err == nil { + break + } + hwlog.RunLog.Warnf("get device ip failed, err: %v, will retry", err) + counter++ + } if err != nil { hwlog.RunLog.Warnf("get device ip failed, err: %v", err) ip = "" diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index 5b2d41e0a..dc9ad6d35 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -62,6 +62,9 @@ type HwDevManager struct { // NewHwDevManager function is used to new a dev manager. func NewHwDevManager(devM devmanager.DeviceInterface) *HwDevManager { + defer func() { + device.FirstStartup = false + }() var hdm HwDevManager if err := hdm.setAscendManager(devM); err != nil { hwlog.RunLog.Errorf("init hw dev manager failed, err: %v", err) @@ -246,7 +249,6 @@ func (hdm *HwDevManager) setAllDeviceAndType() error { return err } hdm.manager.SetKubeClient(kubeClient) - if hdm.allInfo, err = hdm.manager.GetNPUs(); err != nil { return err } -- Gitee From 1f798db8d156df1950cbdbcd9482d62b206be7cd Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Thu, 24 Apr 2025 09:57:10 +0800 Subject: [PATCH 4/5] =?UTF-8?q?=E8=BD=AE=E8=AF=A2ip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascend-device-plugin/pkg/device/ascend910.go | 1 + .../pkg/device/ascendcommon.go | 15 ++++++++++++++- .../ascend-device-plugin/pkg/server/manager.go | 4 +--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/component/ascend-device-plugin/pkg/device/ascend910.go b/component/ascend-device-plugin/pkg/device/ascend910.go index fd56bd064..dbf713d00 100644 --- a/component/ascend-device-plugin/pkg/device/ascend910.go +++ b/component/ascend-device-plugin/pkg/device/ascend910.go @@ -70,6 +70,7 @@ type HwAscend910Manager struct { func NewHwAscend910Manager() *HwAscend910Manager { return &HwAscend910Manager{ AscendTools: AscendTools{ + firstStartUp: true, name: common.Ascend910, unHealthyKey: common.HuaweiUnHealthAscend910, devCount: common.MaxDevicesNum, diff --git a/component/ascend-device-plugin/pkg/device/ascendcommon.go b/component/ascend-device-plugin/pkg/device/ascendcommon.go index a90643638..6ef840327 100644 --- a/component/ascend-device-plugin/pkg/device/ascendcommon.go +++ b/component/ascend-device-plugin/pkg/device/ascendcommon.go @@ -70,6 +70,7 @@ type AscendTools struct { client *kubeclient.ClientK8s containerdClient *containerd.Client dmgr devmanager.DeviceInterface + firstStartUp bool name string deviceUsage string unHealthyKey string @@ -99,6 +100,8 @@ type DevManager interface { SetDmgr(devmanager.DeviceInterface) GetDmgr() devmanager.DeviceInterface GetChipAICore() int32 + GetFirstStartUp() bool + SetFirstStartUp(bool) GetName() string SetKubeClient(*kubeclient.ClientK8s) GetKubeClient() *kubeclient.ClientK8s @@ -171,6 +174,16 @@ func (tool *AscendTools) GetName() string { return tool.name } +// GetFirstStartUp get param first start up +func (tool *AscendTools) GetFirstStartUp() bool { + return tool.firstStartUp +} + +// SetFirstStartUp set param first start up +func (tool *AscendTools) SetFirstStartUp(first bool) { + tool.firstStartUp = first +} + func (tool *AscendTools) convertLogicIDsToDeviceNames(logicIds []int32) string { deviceRunMode, err := common.GetDeviceRunMode() if err != nil { @@ -645,7 +658,7 @@ func (tool *AscendTools) getDavinCiDev(logicID int32) (common.DavinCiDev, error) needRetry = true } // to avoid os system first startup, some card may not have ip on it - for counter < common.GetIpRetryTimes && err != nil && FirstStartup && needRetry { + for counter < common.GetIpRetryTimes && err != nil && tool.firstStartUp && needRetry { time.Sleep(common.GetIpRetryInterval * time.Second) ip, err = tool.getDeviceIP("", int(phyID)) if err == nil { diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index dc9ad6d35..707a89938 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -62,9 +62,6 @@ type HwDevManager struct { // NewHwDevManager function is used to new a dev manager. func NewHwDevManager(devM devmanager.DeviceInterface) *HwDevManager { - defer func() { - device.FirstStartup = false - }() var hdm HwDevManager if err := hdm.setAscendManager(devM); err != nil { hwlog.RunLog.Errorf("init hw dev manager failed, err: %v", err) @@ -74,6 +71,7 @@ func NewHwDevManager(devM devmanager.DeviceInterface) *HwDevManager { hwlog.RunLog.Errorf("set all device and type failed, err: %v", err) return nil } + hdm.manager.SetFirstStartUp(false) device.InitResetInfoMgr(hdm.manager.GetKubeClient()) if err := hdm.setContainerdClient(); err != nil { hwlog.RunLog.Warnf("set containerd client failed, "+ -- Gitee From c62932da7a8a2bd6e3f8dae9ab2e7581feb9eea8 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Thu, 24 Apr 2025 10:02:15 +0800 Subject: [PATCH 5/5] =?UTF-8?q?=E8=BD=AE=E8=AF=A2ip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/ascend-device-plugin/pkg/device/ascendcommon.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/component/ascend-device-plugin/pkg/device/ascendcommon.go b/component/ascend-device-plugin/pkg/device/ascendcommon.go index 6ef840327..f117b4993 100644 --- a/component/ascend-device-plugin/pkg/device/ascendcommon.go +++ b/component/ascend-device-plugin/pkg/device/ascendcommon.go @@ -52,8 +52,6 @@ var ( re = regexp.MustCompile(`"fault_time":\d+,`) // counter to record all card get ip times counter = 0 - // FirstStartup to indicate whether it is the first time - FirstStartup = true ) const ( -- Gitee