From 8f4f5edb48181047ec24e041415580971613289a Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Tue, 22 Apr 2025 16:06:06 +0800 Subject: [PATCH] =?UTF-8?q?dp=E8=BD=AE=E8=AF=A2ip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/device/ascend310.go | 2 +- .../pkg/device/ascend310p.go | 2 +- .../pkg/device/ascend910.go | 3 +- .../pkg/device/ascend_used_chips.go | 4 +-- .../pkg/device/ascendcommon.go | 3 +- .../pkg/server/manager.go | 35 +++++++++++++++++++ 6 files changed, 43 insertions(+), 6 deletions(-) diff --git a/component/ascend-device-plugin/pkg/device/ascend310.go b/component/ascend-device-plugin/pkg/device/ascend310.go index 60c60310f..34218a059 100644 --- a/component/ascend-device-plugin/pkg/device/ascend310.go +++ b/component/ascend-device-plugin/pkg/device/ascend310.go @@ -58,7 +58,7 @@ func (hnm *HwAscend310Manager) GetNPUs() (common.NpuAllInfo, error) { } var allDevices = make([]common.NpuDevice, 0) for logicIDIdx := 0; logicIDIdx < len(devList); logicIDIdx++ { - davinCiDev, err := hnm.getDavinCiDev(devList[logicIDIdx]) + davinCiDev, err := hnm.GetDavinCiDev(devList[logicIDIdx]) if err != nil { return common.NpuAllInfo{}, err } diff --git a/component/ascend-device-plugin/pkg/device/ascend310p.go b/component/ascend-device-plugin/pkg/device/ascend310p.go index d01e15c05..cd4431389 100644 --- a/component/ascend-device-plugin/pkg/device/ascend310p.go +++ b/component/ascend-device-plugin/pkg/device/ascend310p.go @@ -56,7 +56,7 @@ func (hnm *HwAscend310PManager) GetNPUs() (common.NpuAllInfo, error) { var aiCoreDevices []*common.NpuDevice var allDeviceTypes = make([]string, 0) for i := int32(0); i < devNum; i++ { - davinCiDev, err := hnm.getDavinCiDev(devList[i]) + davinCiDev, err := hnm.GetDavinCiDev(devList[i]) if err != nil { return common.NpuAllInfo{}, err } diff --git a/component/ascend-device-plugin/pkg/device/ascend910.go b/component/ascend-device-plugin/pkg/device/ascend910.go index fd56bd064..ee0c43625 100644 --- a/component/ascend-device-plugin/pkg/device/ascend910.go +++ b/component/ascend-device-plugin/pkg/device/ascend910.go @@ -99,7 +99,7 @@ func (hnm *HwAscend910Manager) GetNPUs() (common.NpuAllInfo, error) { var aiCoreDevices []*common.NpuDevice var allDeviceTypes = make([]string, 0) for i := int32(0); i < devNum; i++ { - davinCiDev, err := hnm.getDavinCiDev(devList[i]) + davinCiDev, err := hnm.GetDavinCiDev(devList[i]) if err != nil { return common.NpuAllInfo{}, err } @@ -120,6 +120,7 @@ func (hnm *HwAscend910Manager) GetNPUs() (common.NpuAllInfo, error) { hnm.assembleVirtualDevices(davinCiDev, vDevInfos, &allDevices, &allDeviceTypes) } allDeviceTypes = hnm.removeDuplicate(&allDeviceTypes) + hwlog.RunLog.Warnf("get npus got all info:%#v", common.NpuAllInfo{AllDevs: allDevices, AICoreDevs: aiCoreDevices, AllDevTypes: allDeviceTypes}) return common.NpuAllInfo{AllDevs: allDevices, AICoreDevs: aiCoreDevices, AllDevTypes: allDeviceTypes}, nil } diff --git a/component/ascend-device-plugin/pkg/device/ascend_used_chips.go b/component/ascend-device-plugin/pkg/device/ascend_used_chips.go index fc2d43a59..4ae661ac8 100644 --- a/component/ascend-device-plugin/pkg/device/ascend_used_chips.go +++ b/component/ascend-device-plugin/pkg/device/ascend_used_chips.go @@ -61,12 +61,12 @@ func (tool *AscendTools) getChipsUsedByProcess() sets.String { } if chipInfo.ProcNum != 0 { hwlog.RunLog.Debugf("the card logicID:[%d] is used, chipInfo: %#v", logicID, chipInfo) - davinCidev, err := tool.getDavinCiDev(logicID) + phyID, err := tool.dmgr.GetPhysicIDFromLogicID(logicID) if err != nil { hwlog.RunLog.Errorf("get davinci dev by logicID:[%d] failed, err: %v", logicID, err) continue } - chipName := fmt.Sprintf("%s-%d", tool.name, davinCidev.PhyID) + chipName := fmt.Sprintf("%s-%d", tool.name, phyID) usedChips = append(usedChips, chipName) } } diff --git a/component/ascend-device-plugin/pkg/device/ascendcommon.go b/component/ascend-device-plugin/pkg/device/ascendcommon.go index 18dfd4b3a..8ead3c3c3 100644 --- a/component/ascend-device-plugin/pkg/device/ascendcommon.go +++ b/component/ascend-device-plugin/pkg/device/ascendcommon.go @@ -89,6 +89,7 @@ type AscendTools struct { // DevManager interface for manager device type DevManager interface { + GetDavinCiDev(logicID int32) (common.DavinCiDev, error) GetNPUs() (common.NpuAllInfo, error) DoWithVolcanoListAndWatch(map[string][]*common.NpuDevice) GraceTolerance(map[string][]*common.NpuDevice) @@ -624,7 +625,7 @@ func (tool *AscendTools) removeDuplicateErr(faultCodes []int64) []int64 { return newCode } -func (tool *AscendTools) getDavinCiDev(logicID int32) (common.DavinCiDev, error) { +func (tool *AscendTools) GetDavinCiDev(logicID int32) (common.DavinCiDev, error) { phyID, err := tool.dmgr.GetPhysicIDFromLogicID(logicID) if err != nil { return common.DavinCiDev{}, err diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index 8120afa3c..46db574e8 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -141,6 +141,40 @@ func (hdm *HwDevManager) UpdateNode() error { hdm.manager.GetKubeClient().InitPodInformer() hwlog.RunLog.Info("init kube client success") + go func() { + for { + time.Sleep(time.Minute) + var mashaledNpuInfoStr string + allInfo, err := hdm.manager.GetNPUs() + if err != nil { + continue + } + allValid := true + for _, npuInfo := range allInfo.AllDevs { + allValid = allValid && npuInfo.IP != "" + } + ipMap := make(map[string]*common.NpuBaseInfo, len(allInfo.AllDevs)) + for _, dev := range allInfo.AllDevs { + ipMap[dev.DeviceName] = &common.NpuBaseInfo{ + IP: dev.IP, + SuperDeviceID: dev.SuperDeviceID, + } + } + mashaledNpuInfo, err := json.Marshal(ipMap) + if err != nil { + hwlog.RunLog.Errorf("failed to marshal device ip map, err: %v", err) + } + if allValid && string(mashaledNpuInfo) == mashaledNpuInfoStr { + continue + } + err = hdm.updateNode() + if err != nil { + continue + } + mashaledNpuInfoStr = string(mashaledNpuInfo) + } + }() + return hdm.updateNode() } @@ -168,6 +202,7 @@ func (hdm *HwDevManager) updateNode() error { return err } newNode.Annotations[api.BaseDevInfoAnno] = string(mashaledNpuInfo) + hwlog.RunLog.Warnf("before writing annotation:%v", string(mashaledNpuInfo)) newNode.Annotations[common.SuperPodIDKey] = strconv.Itoa(int(hdm.getSuperPodInfo().SuperPodId)) for i := 0; i < common.RetryUpdateCount; i++ { if _, _, err = hdm.manager.GetKubeClient().PatchNodeState(oldNode, newNode); err == nil { -- Gitee