diff --git a/component/ascend-device-plugin/pkg/device/ascend310.go b/component/ascend-device-plugin/pkg/device/ascend310.go index 60c60310f37b11a114d67488f2a2a443aa5e6e13..34218a0591c08ddf3f2b0ad6868bc1ff734f71d7 100644 --- a/component/ascend-device-plugin/pkg/device/ascend310.go +++ b/component/ascend-device-plugin/pkg/device/ascend310.go @@ -58,7 +58,7 @@ func (hnm *HwAscend310Manager) GetNPUs() (common.NpuAllInfo, error) { } var allDevices = make([]common.NpuDevice, 0) for logicIDIdx := 0; logicIDIdx < len(devList); logicIDIdx++ { - davinCiDev, err := hnm.getDavinCiDev(devList[logicIDIdx]) + davinCiDev, err := hnm.GetDavinCiDev(devList[logicIDIdx]) if err != nil { return common.NpuAllInfo{}, err } diff --git a/component/ascend-device-plugin/pkg/device/ascend310p.go b/component/ascend-device-plugin/pkg/device/ascend310p.go index d01e15c0556e5a2af9a14cfe99365cf9fc401950..cd4431389857fc644009ea20ed01c11d41854bf3 100644 --- a/component/ascend-device-plugin/pkg/device/ascend310p.go +++ b/component/ascend-device-plugin/pkg/device/ascend310p.go @@ -56,7 +56,7 @@ func (hnm *HwAscend310PManager) GetNPUs() (common.NpuAllInfo, error) { var aiCoreDevices []*common.NpuDevice var allDeviceTypes = make([]string, 0) for i := int32(0); i < devNum; i++ { - davinCiDev, err := hnm.getDavinCiDev(devList[i]) + davinCiDev, err := hnm.GetDavinCiDev(devList[i]) if err != nil { return common.NpuAllInfo{}, err } diff --git a/component/ascend-device-plugin/pkg/device/ascend910.go b/component/ascend-device-plugin/pkg/device/ascend910.go index fd56bd0640122108f40d7bebe85e31703cd66f4c..ee0c43625f99a945f2ad6b046736d2c63c527fcd 100644 --- a/component/ascend-device-plugin/pkg/device/ascend910.go +++ b/component/ascend-device-plugin/pkg/device/ascend910.go @@ -99,7 +99,7 @@ func (hnm *HwAscend910Manager) GetNPUs() (common.NpuAllInfo, error) { var aiCoreDevices []*common.NpuDevice var allDeviceTypes = make([]string, 0) for i := int32(0); i < devNum; i++ { - davinCiDev, err := hnm.getDavinCiDev(devList[i]) + davinCiDev, err := hnm.GetDavinCiDev(devList[i]) if err != nil { return common.NpuAllInfo{}, err } @@ -120,6 +120,7 @@ func (hnm *HwAscend910Manager) GetNPUs() (common.NpuAllInfo, error) { hnm.assembleVirtualDevices(davinCiDev, vDevInfos, &allDevices, &allDeviceTypes) } allDeviceTypes = hnm.removeDuplicate(&allDeviceTypes) + hwlog.RunLog.Warnf("get npus got all info:%#v", common.NpuAllInfo{AllDevs: allDevices, AICoreDevs: aiCoreDevices, AllDevTypes: allDeviceTypes}) return common.NpuAllInfo{AllDevs: allDevices, AICoreDevs: aiCoreDevices, AllDevTypes: allDeviceTypes}, nil } diff --git a/component/ascend-device-plugin/pkg/device/ascend_used_chips.go b/component/ascend-device-plugin/pkg/device/ascend_used_chips.go index fc2d43a593ea89f691adf5dfe471a4f2727fc0aa..4ae661ac8ed5824a289866382f148208227d5b05 100644 --- a/component/ascend-device-plugin/pkg/device/ascend_used_chips.go +++ b/component/ascend-device-plugin/pkg/device/ascend_used_chips.go @@ -61,12 +61,12 @@ func (tool *AscendTools) getChipsUsedByProcess() sets.String { } if chipInfo.ProcNum != 0 { hwlog.RunLog.Debugf("the card logicID:[%d] is used, chipInfo: %#v", logicID, chipInfo) - davinCidev, err := tool.getDavinCiDev(logicID) + phyID, err := tool.dmgr.GetPhysicIDFromLogicID(logicID) if err != nil { hwlog.RunLog.Errorf("get davinci dev by logicID:[%d] failed, err: %v", logicID, err) continue } - chipName := fmt.Sprintf("%s-%d", tool.name, davinCidev.PhyID) + chipName := fmt.Sprintf("%s-%d", tool.name, phyID) usedChips = append(usedChips, chipName) } } diff --git a/component/ascend-device-plugin/pkg/device/ascendcommon.go b/component/ascend-device-plugin/pkg/device/ascendcommon.go index 18dfd4b3afad385687baa901c0c75f9d8a3f85ed..8ead3c3c3a8846928dc32e90fd79059e15382e74 100644 --- a/component/ascend-device-plugin/pkg/device/ascendcommon.go +++ b/component/ascend-device-plugin/pkg/device/ascendcommon.go @@ -89,6 +89,7 @@ type AscendTools struct { // DevManager interface for manager device type DevManager interface { + GetDavinCiDev(logicID int32) (common.DavinCiDev, error) GetNPUs() (common.NpuAllInfo, error) DoWithVolcanoListAndWatch(map[string][]*common.NpuDevice) GraceTolerance(map[string][]*common.NpuDevice) @@ -624,7 +625,7 @@ func (tool *AscendTools) removeDuplicateErr(faultCodes []int64) []int64 { return newCode } -func (tool *AscendTools) getDavinCiDev(logicID int32) (common.DavinCiDev, error) { +func (tool *AscendTools) GetDavinCiDev(logicID int32) (common.DavinCiDev, error) { phyID, err := tool.dmgr.GetPhysicIDFromLogicID(logicID) if err != nil { return common.DavinCiDev{}, err diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index 8120afa3cabf9b9c6b639d7c67c473a3d7c12449..46db574e82ff7b6aad8a34af3742de7fb4eca428 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -141,6 +141,40 @@ func (hdm *HwDevManager) UpdateNode() error { hdm.manager.GetKubeClient().InitPodInformer() hwlog.RunLog.Info("init kube client success") + go func() { + for { + time.Sleep(time.Minute) + var mashaledNpuInfoStr string + allInfo, err := hdm.manager.GetNPUs() + if err != nil { + continue + } + allValid := true + for _, npuInfo := range allInfo.AllDevs { + allValid = allValid && npuInfo.IP != "" + } + ipMap := make(map[string]*common.NpuBaseInfo, len(allInfo.AllDevs)) + for _, dev := range allInfo.AllDevs { + ipMap[dev.DeviceName] = &common.NpuBaseInfo{ + IP: dev.IP, + SuperDeviceID: dev.SuperDeviceID, + } + } + mashaledNpuInfo, err := json.Marshal(ipMap) + if err != nil { + hwlog.RunLog.Errorf("failed to marshal device ip map, err: %v", err) + } + if allValid && string(mashaledNpuInfo) == mashaledNpuInfoStr { + continue + } + err = hdm.updateNode() + if err != nil { + continue + } + mashaledNpuInfoStr = string(mashaledNpuInfo) + } + }() + return hdm.updateNode() } @@ -168,6 +202,7 @@ func (hdm *HwDevManager) updateNode() error { return err } newNode.Annotations[api.BaseDevInfoAnno] = string(mashaledNpuInfo) + hwlog.RunLog.Warnf("before writing annotation:%v", string(mashaledNpuInfo)) newNode.Annotations[common.SuperPodIDKey] = strconv.Itoa(int(hdm.getSuperPodInfo().SuperPodId)) for i := 0; i < common.RetryUpdateCount; i++ { if _, _, err = hdm.manager.GetKubeClient().PatchNodeState(oldNode, newNode); err == nil {