From f1da0404066017f5fdc052011afb1ee60d4c776b Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Tue, 19 Aug 2025 15:24:44 +0800 Subject: [PATCH 01/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/device/ascend310.go | 5 ++ .../pkg/device/ascend310p.go | 5 ++ .../pkg/device/ascend910.go | 12 ++--- .../pkg/device/ascend910_test.go | 32 ++++++------ .../pkg/device/ascendcommon.go | 1 + .../pkg/server/manager.go | 51 ++++++++++++++++++- 6 files changed, 83 insertions(+), 23 deletions(-) diff --git a/component/ascend-device-plugin/pkg/device/ascend310.go b/component/ascend-device-plugin/pkg/device/ascend310.go index b3f59fbd2..d9b028509 100644 --- a/component/ascend-device-plugin/pkg/device/ascend310.go +++ b/component/ascend-device-plugin/pkg/device/ascend310.go @@ -105,3 +105,8 @@ func (hnm *HwAscend310Manager) updateDeviceInfo(_, newDeviceInfo map[string]stri func (hnm *HwAscend310Manager) GraceTolerance(context.Context, map[string][]*common.NpuDevice) { return } + +// GetAssociatedLogicIDs get associated logic id list, not supported currently +func (hnm *HwAscend310Manager) GetAssociatedLogicIDs(logicID, cardID, deviceID int32) ([]int32, error) { + return nil, nil +} diff --git a/component/ascend-device-plugin/pkg/device/ascend310p.go b/component/ascend-device-plugin/pkg/device/ascend310p.go index 74300c9cf..7ad84744d 100644 --- a/component/ascend-device-plugin/pkg/device/ascend310p.go +++ b/component/ascend-device-plugin/pkg/device/ascend310p.go @@ -123,3 +123,8 @@ func (hnm *HwAscend310PManager) updateDeviceInfo(_, newDevInfo map[string]string func (hnm *HwAscend310PManager) GraceTolerance(context.Context, map[string][]*common.NpuDevice) { return } + +// GetAssociatedLogicIDs get associated logic id list, not supported currently +func (hnm *HwAscend310PManager) GetAssociatedLogicIDs(logicID, cardID, deviceID int32) ([]int32, error) { + return nil, nil +} diff --git a/component/ascend-device-plugin/pkg/device/ascend910.go b/component/ascend-device-plugin/pkg/device/ascend910.go index 8ef813d96..e4b55fc8d 100644 --- a/component/ascend-device-plugin/pkg/device/ascend910.go +++ b/component/ascend-device-plugin/pkg/device/ascend910.go @@ -357,7 +357,7 @@ func (hnm *HwAscend910Manager) setUnhealthyForA3(devStatusList []*common.NpuDevi return fmt.Errorf("invalid in reset dev id %v", inResetDev) } dev := devStatusList[inResetDev] - logicIdArr, err := hnm.getAssociatedLogicIDs(dev.LogicID, dev.CardID, dev.DeviceID) + logicIdArr, err := hnm.GetAssociatedLogicIDs(dev.LogicID, dev.CardID, dev.DeviceID) if err != nil { return err } @@ -374,7 +374,7 @@ func (hnm *HwAscend910Manager) setUnhealthyForA3(devStatusList []*common.NpuDevi return nil } -func (hnm *HwAscend910Manager) getAssociatedLogicIDs(logicID, cardID, deviceID int32) ([]int32, error) { +func (hnm *HwAscend910Manager) GetAssociatedLogicIDs(logicID, cardID, deviceID int32) ([]int32, error) { associatedCardID, err := hnm.GetDmgr().GetBrotherCardID(cardID, deviceID) if err != nil { hwlog.RunLog.Debugf("get brother card failed, cardID %v deviceID %v, err: %v", @@ -569,7 +569,7 @@ func (hnm *HwAscend910Manager) canA3BeReset(dev *common.DevFaultInfo) bool { hwlog.RunLog.Errorf("get cardID deviceID by logicID %v faild: %v", dev.LogicId, err) return false } - logicIdArr, err := hnm.getAssociatedLogicIDs(dev.LogicId, cardID, deviceID) + logicIdArr, err := hnm.GetAssociatedLogicIDs(dev.LogicId, cardID, deviceID) if err != nil { return false } @@ -1056,7 +1056,7 @@ func (hnm *HwAscend910Manager) filterDevStatusForA3(devStatusList []*common.NpuD if _, exist := devToBeSet[dev.LogicID]; exist { continue } - logicIdArr, err := hnm.getAssociatedLogicIDs(dev.LogicID, dev.CardID, dev.DeviceID) + logicIdArr, err := hnm.GetAssociatedLogicIDs(dev.LogicID, dev.CardID, dev.DeviceID) if err != nil { return err } @@ -1546,7 +1546,7 @@ func (hnm *HwAscend910Manager) getA3LogicMapByAssociation(devFaultInfoList []*co hwlog.RunLog.Errorf("get cardID deviceID by logicID %v failed: %v", devFault.LogicId, err) return nil, err } - logicIDs, err := hnm.getAssociatedLogicIDs(devFault.LogicId, cardID, deviceID) + logicIDs, err := hnm.GetAssociatedLogicIDs(devFault.LogicId, cardID, deviceID) if err != nil { return nil, err } @@ -1760,7 +1760,7 @@ func (hnm *HwAscend910Manager) getResetIndexForA3(logicID int32) (int32, error) hwlog.RunLog.Errorf("get cardID deviceID by logicID %v failed: %v", logicID, err) return errorId, err } - logicIDs, err := hnm.getAssociatedLogicIDs(logicID, cardID, deviceID) + logicIDs, err := hnm.GetAssociatedLogicIDs(logicID, cardID, deviceID) if err != nil { return errorId, err } diff --git a/component/ascend-device-plugin/pkg/device/ascend910_test.go b/component/ascend-device-plugin/pkg/device/ascend910_test.go index dbf1c4d3b..4d35f569e 100644 --- a/component/ascend-device-plugin/pkg/device/ascend910_test.go +++ b/component/ascend-device-plugin/pkg/device/ascend910_test.go @@ -354,7 +354,7 @@ func TestCanA3BeReset(t *testing.T) { int32(id1), int32(id1), nil) defer patch.Reset() convey.Convey("02-get associated card error, should return false", func() { - patch1 := gomonkey.ApplyPrivateMethod(manager, "getAssociatedLogicIDs", + patch1 := gomonkey.ApplyPrivateMethod(manager, "GetAssociatedLogicIDs", func(logicID, cardID, deviceID int32) ([]int32, error) { return nil, testErr }) @@ -362,7 +362,7 @@ func TestCanA3BeReset(t *testing.T) { ret := manager.canA3BeReset(dev) convey.So(ret, convey.ShouldBeFalse) }) - patch.ApplyPrivateMethod(manager, "getAssociatedLogicIDs", + patch.ApplyPrivateMethod(manager, "GetAssociatedLogicIDs", func(logicID, cardID, deviceID int32) ([]int32, error) { return []int32{id1}, nil }) @@ -398,7 +398,7 @@ func TestCanA3BeResetPatch1(t *testing.T) { patch := gomonkey.ApplyMethodReturn(&devmanager.DeviceManagerMock{}, "GetCardIDDeviceID", int32(id1), int32(id1), nil) defer patch.Reset() - patch.ApplyPrivateMethod(manager, "getAssociatedLogicIDs", + patch.ApplyPrivateMethod(manager, "GetAssociatedLogicIDs", func(logicID, cardID, deviceID int32) ([]int32, error) { return []int32{id1}, nil }) @@ -543,7 +543,7 @@ func TestSetUnhealthyForA3(t *testing.T) { inResetDev = int32(id1) convey.Convey("02-get associated card error, should return error", func() { patch1 := gomonkey.ApplyFuncReturn(IsDevBusy, true) - patch1.ApplyPrivateMethod(manager, "getAssociatedLogicIDs", + patch1.ApplyPrivateMethod(manager, "GetAssociatedLogicIDs", func(logicID, cardID, deviceID int32) ([]int32, error) { return nil, testErr }) @@ -552,7 +552,7 @@ func TestSetUnhealthyForA3(t *testing.T) { convey.So(err, convey.ShouldBeError) }) convey.Convey("03-success, should return nil", func() { - patch1 := gomonkey.ApplyPrivateMethod(manager, "getAssociatedLogicIDs", + patch1 := gomonkey.ApplyPrivateMethod(manager, "GetAssociatedLogicIDs", func(logicID, cardID, deviceID int32) ([]int32, error) { return []int32{logicID}, nil }) @@ -563,16 +563,16 @@ func TestSetUnhealthyForA3(t *testing.T) { }) } -// TestGetAssociatedLogicIDs test the function getAssociatedLogicIDs +// TestGetAssociatedLogicIDs test the function GetAssociatedLogicIDs func TestGetAssociatedLogicIDs(t *testing.T) { manager := createFake910Manager() const id1int32 = int32(id1) - convey.Convey("test getAssociatedLogicIDs", t, func() { + convey.Convey("test GetAssociatedLogicIDs", t, func() { convey.Convey("01-get brother card error, should return error", func() { patch1 := gomonkey.ApplyMethodReturn(&devmanager.DeviceManagerMock{}, "GetBrotherCardID", id1int32, testErr) defer patch1.Reset() - _, err := manager.getAssociatedLogicIDs(id1int32, id1int32, id1int32) + _, err := manager.GetAssociatedLogicIDs(id1int32, id1int32, id1int32) convey.So(err, convey.ShouldBeError) }) patch := gomonkey.ApplyMethodReturn(&devmanager.DeviceManagerMock{}, "GetBrotherCardID", @@ -582,13 +582,13 @@ func TestGetAssociatedLogicIDs(t *testing.T) { patch1 := gomonkey.ApplyMethodReturn(&devmanager.DeviceManagerMock{}, "GetDeviceLogicID", id1int32, testErr) defer patch1.Reset() - _, err := manager.getAssociatedLogicIDs(id1int32, id1int32, id1int32) + _, err := manager.GetAssociatedLogicIDs(id1int32, id1int32, id1int32) convey.So(err, convey.ShouldBeError) }) patch.ApplyMethodReturn(&devmanager.DeviceManagerMock{}, "GetDeviceLogicID", id1int32, nil) convey.Convey("03-success, should return nil", func() { - _, err := manager.getAssociatedLogicIDs(id1int32, id1int32, id1int32) + _, err := manager.GetAssociatedLogicIDs(id1int32, id1int32, id1int32) convey.So(err, convey.ShouldBeNil) }) }) @@ -1029,7 +1029,7 @@ func TestGetA3LogicMapByAssociation(t *testing.T) { int32(id1), int32(id1), nil) defer patch.Reset() convey.Convey("03-get associated card failed, should return error", func() { - patch1 := gomonkey.ApplyPrivateMethod(manager, "getAssociatedLogicIDs", + patch1 := gomonkey.ApplyPrivateMethod(manager, "GetAssociatedLogicIDs", func(logicID, cardID, deviceID int32) ([]int32, error) { return nil, testErr }) @@ -1037,7 +1037,7 @@ func TestGetA3LogicMapByAssociation(t *testing.T) { _, err := manager.getA3LogicMapByAssociation(devs) convey.So(err, convey.ShouldBeError) }) - patch.ApplyPrivateMethod(manager, "getAssociatedLogicIDs", + patch.ApplyPrivateMethod(manager, "GetAssociatedLogicIDs", func(logicID, cardID, deviceID int32) ([]int32, error) { return []int32{int32(id1)}, nil }) @@ -1465,7 +1465,7 @@ func TestGetResetIndexForA3(t *testing.T) { int32(id1), int32(id1), nil) defer patch.Reset() convey.Convey("02-get card id device id failed, should return error", func() { - patch1 := gomonkey.ApplyFuncReturn((*HwAscend910Manager).getAssociatedLogicIDs, []int32{}, nil) + patch1 := gomonkey.ApplyFuncReturn((*HwAscend910Manager).GetAssociatedLogicIDs, []int32{}, nil) defer patch1.Reset() retId, err := manager.getResetIndexForA3(chipPhyID0) convey.So(err.Error(), convey.ShouldEqual, "sort logic ids failed, logic ids [], sorted ids []") @@ -1473,7 +1473,7 @@ func TestGetResetIndexForA3(t *testing.T) { }) convey.Convey("03-get reset index success, should return nil", func() { - patch1 := gomonkey.ApplyFuncReturn((*HwAscend910Manager).getAssociatedLogicIDs, []int32{id1, id2, id3}, nil) + patch1 := gomonkey.ApplyFuncReturn((*HwAscend910Manager).GetAssociatedLogicIDs, []int32{id1, id2, id3}, nil) defer patch1.Reset() retId, err := manager.getResetIndexForA3(chipPhyID0) convey.So(err, convey.ShouldBeNil) @@ -2890,7 +2890,7 @@ func TestFilterDevStatusForA3(t *testing.T) { convey.Convey("test TestFilterDevStatusForA3", t, func() { convey.Convey("01-get associated card error, should return error", func() { patch1 := gomonkey.ApplyFuncReturn(IsDevBusy, false) - patch1.ApplyPrivateMethod(manager, "getAssociatedLogicIDs", + patch1.ApplyPrivateMethod(manager, "GetAssociatedLogicIDs", func(logicID, cardID, deviceID int32) ([]int32, error) { return []int32{id1}, testErr }) @@ -2900,7 +2900,7 @@ func TestFilterDevStatusForA3(t *testing.T) { }) convey.Convey("02-success, should return nil", func() { patch1 := gomonkey.ApplyFuncReturn(IsDevBusy, false) - patch1.ApplyPrivateMethod(manager, "getAssociatedLogicIDs", + patch1.ApplyPrivateMethod(manager, "GetAssociatedLogicIDs", func(logicID, cardID, deviceID int32) ([]int32, error) { return []int32{id1}, nil }) diff --git a/component/ascend-device-plugin/pkg/device/ascendcommon.go b/component/ascend-device-plugin/pkg/device/ascendcommon.go index 8f7cabff6..f68bd0b6e 100644 --- a/component/ascend-device-plugin/pkg/device/ascendcommon.go +++ b/component/ascend-device-plugin/pkg/device/ascendcommon.go @@ -130,6 +130,7 @@ type DevManager interface { GetUsedChips() sets.String GetDeviceIP(deviceType string, phyID int) (string, error) WriteFaultToEvent(ctx context.Context) + GetAssociatedLogicIDs(logicID, cardID, deviceID int32) ([]int32, error) } // SetDmgr set devmanager diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index 038f15fa4..cae69a967 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -698,8 +698,12 @@ func (hdm *HwDevManager) resetCommonInferCard(devType string, devices []*common. return } - // A800IA2 server, node labeled with server-usage=infer + // A800IA2、A800IA3 server, node labeled with server-usage=infer if usage == common.Infer { + if common.ParamOption.RealCardType == common.Ascend910A3 { + hdm.ResetServerForA3(devType, devices, prClient) + return + } // server without hccs is 0x33 or 0x3c if boardId == common.A800IA2NoneHccsBoardId || boardId == common.A800IA2NoneHccsBoardIdOld { hdm.ResetWithoutHccsServer(devType, devices, prClient) @@ -793,6 +797,51 @@ func (hdm *HwDevManager) ResetHccsServer(devType string, devices []*common.NpuDe } } +func (hdm *HwDevManager) ResetServerForA3(devType string, devices []*common.NpuDevice, prClient *PodResource) { + coverIdSet := sets.NewInt32() + for _, npuDevice := range devices { + if npuDevice.Health == v1beta1.Healthy || coverIdSet.Has(npuDevice.LogicID) { + continue + } + cardID, deviceID, err := hdm.manager.GetDmgr().GetCardIDDeviceID(npuDevice.LogicID) + if err != nil { + hwlog.RunLog.Errorf("get card id and device id failed, logic id: %d err: %v", + npuDevice.LogicID, err) + continue + } + logicIDs, err := hdm.manager.GetAssociatedLogicIDs(npuDevice.LogicID, cardID, deviceID) + if err != nil { + hwlog.RunLog.Errorf("get associated logic id list failed, logic id: %d err: %v", + npuDevice.LogicID, err) + continue + } + idSet := sets.NewInt32(logicIDs...) + idx := int32(-1) + freeDeviceNum := 0 + for _, dev := range devices { + if !idSet.Has(dev.LogicID) { + continue + } + if dev.LogicID < idx { + idx = dev.LogicID + } + if !hdm.isPodRemove(devType, dev, prClient) { + break + } + freeDeviceNum++ + } + inReset := hdm.manager.GetIfCardsInResetting(idx) + resetFailedTimes := hdm.manager.GetResetFailedTimes(idx) + + if idx >= 0 && !inReset && resetFailedTimes < common.MaxResetTimes && freeDeviceNum == len(logicIDs) { + hdm.manager.SetCardsInResetting(idx, true) + // to avoid blocking for minutes + go hdm.hotReset(npuDevice) + } + coverIdSet.Insert(logicIDs...) + } +} + func (hdm *HwDevManager) resetDuoCard(devType string, devices []*common.NpuDevice, prClient *PodResource) { var cardResetOnce = make(map[int32][]*common.NpuDevice, 1) for _, device := range devices { -- Gitee From b970f37a13348344ebb258de51999253e5acb1cf Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Tue, 19 Aug 2025 22:45:18 +0800 Subject: [PATCH 02/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/server/manager.go | 22 ++++----- .../pkg/server/manager_test.go | 49 +++++++++++++++++++ 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index cae69a967..d2d877413 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -691,6 +691,10 @@ func (hdm *HwDevManager) resetCommonInferCard(devType string, devices []*common. hwlog.RunLog.Error("invalid params") return } + if common.ParamOption.RealCardType == common.Ascend910A3 { + hdm.ResetServerForA3(devType, devices, prClient) + return + } usage, boardId, err := hdm.getServerUsageAndBoardId() if err != nil { @@ -698,12 +702,8 @@ func (hdm *HwDevManager) resetCommonInferCard(devType string, devices []*common. return } - // A800IA2、A800IA3 server, node labeled with server-usage=infer + // A800IA2 server, node labeled with server-usage=infer if usage == common.Infer { - if common.ParamOption.RealCardType == common.Ascend910A3 { - hdm.ResetServerForA3(devType, devices, prClient) - return - } // server without hccs is 0x33 or 0x3c if boardId == common.A800IA2NoneHccsBoardId || boardId == common.A800IA2NoneHccsBoardIdOld { hdm.ResetWithoutHccsServer(devType, devices, prClient) @@ -810,13 +810,12 @@ func (hdm *HwDevManager) ResetServerForA3(devType string, devices []*common.NpuD continue } logicIDs, err := hdm.manager.GetAssociatedLogicIDs(npuDevice.LogicID, cardID, deviceID) - if err != nil { - hwlog.RunLog.Errorf("get associated logic id list failed, logic id: %d err: %v", - npuDevice.LogicID, err) + if err != nil || len(logicIDs) == 0 { + hwlog.RunLog.Errorf("invalid associated logic id list %v, err: %v", logicIDs, err) continue } idSet := sets.NewInt32(logicIDs...) - idx := int32(-1) + idx := logicIDs[0] freeDeviceNum := 0 for _, dev := range devices { if !idSet.Has(dev.LogicID) { @@ -832,8 +831,9 @@ func (hdm *HwDevManager) ResetServerForA3(devType string, devices []*common.NpuD } inReset := hdm.manager.GetIfCardsInResetting(idx) resetFailedTimes := hdm.manager.GetResetFailedTimes(idx) - - if idx >= 0 && !inReset && resetFailedTimes < common.MaxResetTimes && freeDeviceNum == len(logicIDs) { + hwlog.RunLog.Debugf("logicIDs:%v, idx:%v, inreset:%v reset failed times:%v freeDevNum:%v", + logicIDs, idx, inReset, resetFailedTimes, freeDeviceNum) + if !inReset && resetFailedTimes < common.MaxResetTimes && freeDeviceNum == len(logicIDs) { hdm.manager.SetCardsInResetting(idx, true) // to avoid blocking for minutes go hdm.hotReset(npuDevice) diff --git a/component/ascend-device-plugin/pkg/server/manager_test.go b/component/ascend-device-plugin/pkg/server/manager_test.go index 176df5dea..6f4849dbd 100644 --- a/component/ascend-device-plugin/pkg/server/manager_test.go +++ b/component/ascend-device-plugin/pkg/server/manager_test.go @@ -895,6 +895,13 @@ func TestResetCommonInferCard(t *testing.T) { devices := []*common.NpuDevice{{DeviceName: "device1", Health: v1beta1.Healthy}, {DeviceName: "device2", Health: v1beta1.Unhealthy}} convey.Convey("Test resetCommonInferCard", t, func() { + convey.Convey("card type is A3", func() { + patch := gomonkey.ApplyGlobalVar(&common.ParamOption.RealCardType, common.Ascend910A3) + defer patch.Reset() + npuDeviceList := []*common.NpuDevice{{LogicID: 0, Health: v1beta1.Healthy}} + hdm.resetCommonInferCard("devType", npuDeviceList, nil) + convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(0)) + }) convey.Convey("When hdm is nil or allInfo.AllDevs is empty, log error and return", func() { tmpAllDevs := hdm.allInfo.AllDevs hdm.allInfo.AllDevs = []common.NpuDevice{} @@ -933,6 +940,48 @@ func TestResetCommonInferCard(t *testing.T) { }) } +func TestResetServerForA3(t *testing.T) { + hdm := &HwDevManager{manager: device.NewHwAscend910Manager()} + convey.Convey("Test ResetServerForA3", t, func() { + logicList := []int32{0, 1, 8, 9} + npuDevices := []*common.NpuDevice{ + {LogicID: 9}, {LogicID: 8, Health: v1beta1.Healthy}, {LogicID: 1}, {LogicID: 0}} + patch := gomonkey.ApplyMethodReturn(hdm.manager, "GetDmgr", &devmanager.DeviceManager{}). + ApplyMethod(hdm.manager, "GetAssociatedLogicIDs", + func(_ *device.HwAscend910Manager, logicID, _, _ int32) ([]int32, error) { + return logicList, nil + }). + ApplyFunc(wait.PollImmediate, func(_, _ time.Duration, _ wait.ConditionFunc) error { + time.Sleep(time.Second) + return nil + }) + defer patch.Reset() + convey.Convey("get card id device id failed, should not reset device", func() { + patch.ApplyMethodReturn(hdm.manager.GetDmgr(), "GetCardIDDeviceID", + int32(0), int32(0), errors.New("getCardIDDeviceID error")) + hdm.ResetServerForA3("devType", npuDevices, nil) + convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(0)) + }) + convey.Convey("get logic id list success and pod already remove, should reset device", func() { + patch.ApplyMethodReturn(hdm.manager.GetDmgr(), "GetCardIDDeviceID", int32(0), int32(0), nil). + ApplyPrivateMethod(hdm, "isPodRemove", + func(*HwDevManager, string, *common.NpuDevice, *PodResource) bool { return true }) + hdm.ResetServerForA3("devType", npuDevices, nil) + convey.ShouldBeTrue(hdm.manager.GetIfCardsInResetting(0)) + convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(9)) + }) + convey.Convey("pod not remove, should not reset device", func() { + hdm.manager.SetCardsInResetting(0, false) + patch.ApplyMethodReturn(hdm.manager.GetDmgr(), "GetCardIDDeviceID", int32(0), int32(0), nil). + ApplyPrivateMethod(hdm, "isPodRemove", + func(*HwDevManager, string, *common.NpuDevice, *PodResource) bool { return false }) + hdm.ResetServerForA3("devType", npuDevices, nil) + convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(0)) + convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(9)) + }) + }) +} + // TestExecResetChip tests the execResetChip function. func TestExecResetChip(t *testing.T) { hdm := &HwDevManager{manager: &device.HwAscend310Manager{}} -- Gitee From fdb59a0fa96690ad53c9666a11b5020a29209085 Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Wed, 20 Aug 2025 09:43:25 +0800 Subject: [PATCH 03/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascend-device-plugin/pkg/server/manager_test.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/component/ascend-device-plugin/pkg/server/manager_test.go b/component/ascend-device-plugin/pkg/server/manager_test.go index 6f4849dbd..a965f33f1 100644 --- a/component/ascend-device-plugin/pkg/server/manager_test.go +++ b/component/ascend-device-plugin/pkg/server/manager_test.go @@ -948,13 +948,10 @@ func TestResetServerForA3(t *testing.T) { {LogicID: 9}, {LogicID: 8, Health: v1beta1.Healthy}, {LogicID: 1}, {LogicID: 0}} patch := gomonkey.ApplyMethodReturn(hdm.manager, "GetDmgr", &devmanager.DeviceManager{}). ApplyMethod(hdm.manager, "GetAssociatedLogicIDs", - func(_ *device.HwAscend910Manager, logicID, _, _ int32) ([]int32, error) { + func(_ *device.HwAscend910Manager, _, _, _ int32) ([]int32, error) { return logicList, nil }). - ApplyFunc(wait.PollImmediate, func(_, _ time.Duration, _ wait.ConditionFunc) error { - time.Sleep(time.Second) - return nil - }) + ApplyFuncReturn(wait.PollImmediate, nil) defer patch.Reset() convey.Convey("get card id device id failed, should not reset device", func() { patch.ApplyMethodReturn(hdm.manager.GetDmgr(), "GetCardIDDeviceID", @@ -963,12 +960,12 @@ func TestResetServerForA3(t *testing.T) { convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(0)) }) convey.Convey("get logic id list success and pod already remove, should reset device", func() { + hdm.manager.SetCardsInResetting(0, false) patch.ApplyMethodReturn(hdm.manager.GetDmgr(), "GetCardIDDeviceID", int32(0), int32(0), nil). ApplyPrivateMethod(hdm, "isPodRemove", func(*HwDevManager, string, *common.NpuDevice, *PodResource) bool { return true }) hdm.ResetServerForA3("devType", npuDevices, nil) convey.ShouldBeTrue(hdm.manager.GetIfCardsInResetting(0)) - convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(9)) }) convey.Convey("pod not remove, should not reset device", func() { hdm.manager.SetCardsInResetting(0, false) @@ -977,7 +974,6 @@ func TestResetServerForA3(t *testing.T) { func(*HwDevManager, string, *common.NpuDevice, *PodResource) bool { return false }) hdm.ResetServerForA3("devType", npuDevices, nil) convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(0)) - convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(9)) }) }) } -- Gitee From c6460fe18cca1fb13ce1e3350437ad2387592ca8 Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Wed, 20 Aug 2025 11:46:08 +0800 Subject: [PATCH 04/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/ascend-device-plugin/pkg/server/manager_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/component/ascend-device-plugin/pkg/server/manager_test.go b/component/ascend-device-plugin/pkg/server/manager_test.go index a965f33f1..8f7e6a2f0 100644 --- a/component/ascend-device-plugin/pkg/server/manager_test.go +++ b/component/ascend-device-plugin/pkg/server/manager_test.go @@ -951,7 +951,9 @@ func TestResetServerForA3(t *testing.T) { func(_ *device.HwAscend910Manager, _, _, _ int32) ([]int32, error) { return logicList, nil }). - ApplyFuncReturn(wait.PollImmediate, nil) + ApplyFuncReturn(wait.PollImmediate, nil). + ApplyPrivateMethod(hdm, "hotReset", func(*HwDevManager, *common.NpuDevice) {}) + defer patch.Reset() convey.Convey("get card id device id failed, should not reset device", func() { patch.ApplyMethodReturn(hdm.manager.GetDmgr(), "GetCardIDDeviceID", -- Gitee From f1810bd62716b83fad96534f4c10f61ac9bd6eef Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Wed, 20 Aug 2025 14:41:41 +0800 Subject: [PATCH 05/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/ascend-device-plugin/pkg/server/manager_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/component/ascend-device-plugin/pkg/server/manager_test.go b/component/ascend-device-plugin/pkg/server/manager_test.go index 8f7e6a2f0..65cfebab5 100644 --- a/component/ascend-device-plugin/pkg/server/manager_test.go +++ b/component/ascend-device-plugin/pkg/server/manager_test.go @@ -946,13 +946,13 @@ func TestResetServerForA3(t *testing.T) { logicList := []int32{0, 1, 8, 9} npuDevices := []*common.NpuDevice{ {LogicID: 9}, {LogicID: 8, Health: v1beta1.Healthy}, {LogicID: 1}, {LogicID: 0}} + sleepTime := 300 * time.Millisecond patch := gomonkey.ApplyMethodReturn(hdm.manager, "GetDmgr", &devmanager.DeviceManager{}). ApplyMethod(hdm.manager, "GetAssociatedLogicIDs", func(_ *device.HwAscend910Manager, _, _, _ int32) ([]int32, error) { return logicList, nil }). - ApplyFuncReturn(wait.PollImmediate, nil). - ApplyPrivateMethod(hdm, "hotReset", func(*HwDevManager, *common.NpuDevice) {}) + ApplyFuncReturn(wait.PollImmediate, nil) defer patch.Reset() convey.Convey("get card id device id failed, should not reset device", func() { @@ -977,6 +977,7 @@ func TestResetServerForA3(t *testing.T) { hdm.ResetServerForA3("devType", npuDevices, nil) convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(0)) }) + time.Sleep(sleepTime) }) } -- Gitee From a4628d11c53ddb88f340adc6a27ce4128985c25a Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Wed, 20 Aug 2025 14:56:36 +0800 Subject: [PATCH 06/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/device/ascendcommon.go | 12 +++++++----- .../pkg/device/ascendcommon_test.go | 14 ++++++++++---- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/component/ascend-device-plugin/pkg/device/ascendcommon.go b/component/ascend-device-plugin/pkg/device/ascendcommon.go index f68bd0b6e..b96715a43 100644 --- a/component/ascend-device-plugin/pkg/device/ascendcommon.go +++ b/component/ascend-device-plugin/pkg/device/ascendcommon.go @@ -1230,17 +1230,19 @@ func (tool *AscendTools) SetDeviceUsage(devLogicID int32) error { hwlog.RunLog.Errorf("failed to get node %s info, err: %s", tool.client.NodeName, err.Error()) return fmt.Errorf("failed to get node info") } - // A800IA2 with has to label the node as server-usage:infer to divide with A800T - if serverUsage, ok := node.Labels[common.ServerUsageLabelKey]; ok && serverUsage == common.Infer { - tool.deviceUsage = common.Infer - return nil - } boardId, err := tool.GetServerBoardId(devLogicID) if err != nil { hwlog.RunLog.Errorf("%v", err) return fmt.Errorf("set device usage error") } + + // A800IA2 with has to label the node as server-usage:infer to divide with A800T + if serverUsage, ok := node.Labels[common.ServerUsageLabelKey]; ok && serverUsage == common.Infer { + tool.deviceUsage = common.Infer + return nil + } + // A800IA2 without hccs can be auto set usage as infer if devType == common.Ascend910B && (boardId == common.A300IA2BoardId || boardId == common.A800IA2NoneHccsBoardId || boardId == common.A800IA2NoneHccsBoardIdOld) { diff --git a/component/ascend-device-plugin/pkg/device/ascendcommon_test.go b/component/ascend-device-plugin/pkg/device/ascendcommon_test.go index 9477fd0f9..01fa8e9a6 100644 --- a/component/ascend-device-plugin/pkg/device/ascendcommon_test.go +++ b/component/ascend-device-plugin/pkg/device/ascendcommon_test.go @@ -398,12 +398,18 @@ func TestSetDeviceUsage(t *testing.T) { defer mockGetServerBoardIdMethod.Reset() convey.So(tool.SetDeviceUsage(devLoginID), convey.ShouldNotBeNil) }) - convey.Convey("04-get board success, should return nil", func() { + convey.Convey("get board success", func() { boardID := uint32(0x3c) - mockGetServerBoardIdMethod := gomonkey.ApplyMethod(reflect.TypeOf(new(AscendTools)), + patch := gomonkey.ApplyMethod(reflect.TypeOf(new(AscendTools)), "GetServerBoardId", func(_ *AscendTools, devLogicID int32) (uint32, error) { return boardID, nil }) - defer mockGetServerBoardIdMethod.Reset() - convey.So(tool.SetDeviceUsage(devLoginID), convey.ShouldBeNil) + defer patch.Reset() + convey.Convey("04-devType is not Ascend910B, should return nil", func() { + convey.So(tool.SetDeviceUsage(devLoginID), convey.ShouldBeNil) + }) + convey.Convey("05-devType is not Ascend910B, should return nil", func() { + patch.ApplyMethodReturn(tool.dmgr, "GetDevType", common.Ascend910B) + convey.So(tool.SetDeviceUsage(devLoginID), convey.ShouldBeNil) + }) }) }) } -- Gitee From f1e9180c077c854faee3304400a0fa6a3d3742af Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Wed, 20 Aug 2025 15:10:29 +0800 Subject: [PATCH 07/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/ascend-device-plugin/pkg/server/manager_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/component/ascend-device-plugin/pkg/server/manager_test.go b/component/ascend-device-plugin/pkg/server/manager_test.go index 65cfebab5..41dc1bee6 100644 --- a/component/ascend-device-plugin/pkg/server/manager_test.go +++ b/component/ascend-device-plugin/pkg/server/manager_test.go @@ -946,7 +946,7 @@ func TestResetServerForA3(t *testing.T) { logicList := []int32{0, 1, 8, 9} npuDevices := []*common.NpuDevice{ {LogicID: 9}, {LogicID: 8, Health: v1beta1.Healthy}, {LogicID: 1}, {LogicID: 0}} - sleepTime := 300 * time.Millisecond + sleepTime := 300 patch := gomonkey.ApplyMethodReturn(hdm.manager, "GetDmgr", &devmanager.DeviceManager{}). ApplyMethod(hdm.manager, "GetAssociatedLogicIDs", func(_ *device.HwAscend910Manager, _, _, _ int32) ([]int32, error) { @@ -977,7 +977,7 @@ func TestResetServerForA3(t *testing.T) { hdm.ResetServerForA3("devType", npuDevices, nil) convey.ShouldBeFalse(hdm.manager.GetIfCardsInResetting(0)) }) - time.Sleep(sleepTime) + time.Sleep(time.Duration(sleepTime) * time.Millisecond) }) } -- Gitee From 95469ff6276fb9e4e44d1dada48cbec8b59ee9a4 Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Wed, 20 Aug 2025 16:29:18 +0800 Subject: [PATCH 08/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/server/manager.go | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index d2d877413..69f848d7e 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -815,30 +815,31 @@ func (hdm *HwDevManager) ResetServerForA3(devType string, devices []*common.NpuD continue } idSet := sets.NewInt32(logicIDs...) - idx := logicIDs[0] + var startDevice *common.NpuDevice = nil freeDeviceNum := 0 for _, dev := range devices { if !idSet.Has(dev.LogicID) { continue } - if dev.LogicID < idx { - idx = dev.LogicID + if startDevice == nil || dev.LogicID < startDevice.LogicID { + startDevice = dev } - if !hdm.isPodRemove(devType, dev, prClient) { - break + if hdm.isPodRemove(devType, dev, prClient) { + freeDeviceNum++ } - freeDeviceNum++ - } - inReset := hdm.manager.GetIfCardsInResetting(idx) - resetFailedTimes := hdm.manager.GetResetFailedTimes(idx) - hwlog.RunLog.Debugf("logicIDs:%v, idx:%v, inreset:%v reset failed times:%v freeDevNum:%v", - logicIDs, idx, inReset, resetFailedTimes, freeDeviceNum) - if !inReset && resetFailedTimes < common.MaxResetTimes && freeDeviceNum == len(logicIDs) { - hdm.manager.SetCardsInResetting(idx, true) - // to avoid blocking for minutes - go hdm.hotReset(npuDevice) } - coverIdSet.Insert(logicIDs...) + if startDevice != nil { + inReset := hdm.manager.GetIfCardsInResetting(startDevice.LogicID) + resetFailedTimes := hdm.manager.GetResetFailedTimes(startDevice.LogicID) + hwlog.RunLog.Infof("logicIDs:%v, idx:%v, inreset:%v reset failed times:%v freeDevNum:%v", + logicIDs, startDevice.LogicID, inReset, resetFailedTimes, freeDeviceNum) + if !inReset && resetFailedTimes < common.MaxResetTimes && freeDeviceNum == len(logicIDs) { + hdm.manager.SetCardsInResetting(startDevice.LogicID, true) + // to avoid blocking for minutes + go hdm.hotReset(startDevice) + } + coverIdSet.Insert(logicIDs...) + } } } -- Gitee From ee1f5596d4c5036f89f3ff87be6332e4808fe18e Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Fri, 22 Aug 2025 15:33:06 +0800 Subject: [PATCH 09/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/server/manager.go | 90 +++++++++++-------- .../pkg/server/manager_test.go | 4 +- 2 files changed, 53 insertions(+), 41 deletions(-) diff --git a/component/ascend-device-plugin/pkg/server/manager.go b/component/ascend-device-plugin/pkg/server/manager.go index 69f848d7e..8d17ed90c 100644 --- a/component/ascend-device-plugin/pkg/server/manager.go +++ b/component/ascend-device-plugin/pkg/server/manager.go @@ -720,7 +720,7 @@ func (hdm *HwDevManager) resetCommonInferCard(devType string, devices []*common. if !hdm.isPodRemove(devType, device, prClient) { continue } - hdm.hotReset(device) + hdm.hotReset([]*common.NpuDevice{device}) } } @@ -751,9 +751,8 @@ func (hdm *HwDevManager) ResetWithoutHccsServer(devType string, devices []*commo resetFailedTimes := hdm.manager.GetResetFailedTimes(device.LogicID) if device.Health != v1beta1.Healthy && !inReset && resetFailedTimes < common.MaxResetTimes && hdm.isPodRemove(devType, device, prClient) { - hdm.manager.SetCardsInResetting(device.LogicID, true) // to avoid blocking for minutes - go hdm.hotReset(device) + go hdm.hotReset([]*common.NpuDevice{device}) } } } @@ -787,13 +786,12 @@ func (hdm *HwDevManager) ResetHccsServer(devType string, devices []*common.NpuDe } if needReset && freeDeviceNum == common.Ascend910BRingsNumTrain { - hdm.manager.SetCardsInResetting(common.FirstDevice, true) if common.FirstDevice >= len(devices) { hwlog.RunLog.Errorf("index out of range: giving devices index %d, "+ "real length %d", common.FirstDevice, len(devices)) return } - hdm.hotReset(devices[common.FirstDevice]) + hdm.hotReset(devices) } } @@ -815,31 +813,24 @@ func (hdm *HwDevManager) ResetServerForA3(devType string, devices []*common.NpuD continue } idSet := sets.NewInt32(logicIDs...) - var startDevice *common.NpuDevice = nil + deviceList := make([]*common.NpuDevice, 0, len(logicIDs)) freeDeviceNum := 0 for _, dev := range devices { if !idSet.Has(dev.LogicID) { continue } - if startDevice == nil || dev.LogicID < startDevice.LogicID { - startDevice = dev - } - if hdm.isPodRemove(devType, dev, prClient) { + deviceList = append(deviceList, dev) + inReset := hdm.manager.GetIfCardsInResetting(dev.LogicID) + resetFailedTimes := hdm.manager.GetResetFailedTimes(dev.LogicID) + if !inReset && resetFailedTimes < common.MaxResetTimes && hdm.isPodRemove(devType, dev, prClient) { freeDeviceNum++ } } - if startDevice != nil { - inReset := hdm.manager.GetIfCardsInResetting(startDevice.LogicID) - resetFailedTimes := hdm.manager.GetResetFailedTimes(startDevice.LogicID) - hwlog.RunLog.Infof("logicIDs:%v, idx:%v, inreset:%v reset failed times:%v freeDevNum:%v", - logicIDs, startDevice.LogicID, inReset, resetFailedTimes, freeDeviceNum) - if !inReset && resetFailedTimes < common.MaxResetTimes && freeDeviceNum == len(logicIDs) { - hdm.manager.SetCardsInResetting(startDevice.LogicID, true) - // to avoid blocking for minutes - go hdm.hotReset(startDevice) - } - coverIdSet.Insert(logicIDs...) + if freeDeviceNum == len(logicIDs) { + // to avoid blocking for minutes + go hdm.hotReset(deviceList) } + coverIdSet.Insert(logicIDs...) } } @@ -855,7 +846,7 @@ func (hdm *HwDevManager) resetDuoCard(devType string, devices []*common.NpuDevic if !hdm.isDuoRemove(devType, deviceChip, prClient) { continue } - hdm.hotReset(deviceChip[0]) + hdm.hotReset([]*common.NpuDevice{deviceChip[0]}) } } @@ -1122,33 +1113,54 @@ func (hdm *HwDevManager) updateSpecTypePodAnnotation(deviceType, serverID string return nil } -func (hdm *HwDevManager) hotReset(device *common.NpuDevice) { - hwlog.RunLog.Infof("will start to reset device %s", device.DeviceName) +func (hdm *HwDevManager) hotReset(devices []*common.NpuDevice) { + if len(devices) == 0 { + return + } + for _, dev := range devices { + hdm.manager.SetCardsInResetting(dev.LogicID, true) + hwlog.RunLog.Infof("will start to reset device %s", dev.DeviceName) + } var isResetExec = false + resetDoneSet := sets.NewInt32() if err := wait.PollImmediate(time.Second, time.Minute, func() (bool, error) { - if err := hdm.execResetChip(device.LogicID, &isResetExec); err != nil { - hwlog.RunLog.Errorf("get device boot status failed, err: %v", err) - return false, err + if !isResetExec { + if err := hdm.execResetChip(devices[0].LogicID, &isResetExec); err != nil { + hwlog.RunLog.Errorf("get device boot status failed, err: %v", err) + return false, err + } } - bootState, err := hdm.manager.GetDmgr().GetDeviceBootStatus(device.LogicID) - if err != nil { - hwlog.RunLog.Errorf("get device boot status failed, err: %v", err) - return false, err + for _, dev := range devices { + if resetDoneSet.Has(dev.LogicID) { + continue + } + bootState, err := hdm.manager.GetDmgr().GetDeviceBootStatus(dev.LogicID) + if err != nil { + hwlog.RunLog.Errorf("get device boot status failed, err: %v", err) + return false, err + } + if bootState != common.BootStartFinish { + hwlog.RunLog.Warnf("device bootState(%d), starting...", bootState) + return false, nil + } + resetDoneSet.Insert(dev.LogicID) } - if bootState != common.BootStartFinish { - hwlog.RunLog.Warnf("device bootState(%d), starting...", bootState) - return false, nil + for _, dev := range devices { + common.SetDeviceInit(dev.LogicID) } - common.SetDeviceInit(device.LogicID) return true, nil }); err != nil { hwlog.RunLog.Warnf("hot reset failed, timeout or err: %v", err) - hdm.manager.SetCardsInResetting(device.LogicID, false) - hdm.manager.SetResetFailedTimes(device.LogicID, hdm.manager.GetResetFailedTimes(device.LogicID)+1) + for _, dev := range devices { + hdm.manager.SetCardsInResetting(dev.LogicID, false) + hdm.manager.SetResetFailedTimes(dev.LogicID, hdm.manager.GetResetFailedTimes(dev.LogicID)+1) + } return } - hdm.manager.SetResetFailedTimes(device.LogicID, 0) - hdm.manager.SetCardsInResetting(device.LogicID, false) + for _, dev := range devices { + hdm.manager.SetResetFailedTimes(dev.LogicID, 0) + hdm.manager.SetCardsInResetting(dev.LogicID, false) + } hwlog.RunLog.Info("hot reset success") } diff --git a/component/ascend-device-plugin/pkg/server/manager_test.go b/component/ascend-device-plugin/pkg/server/manager_test.go index 41dc1bee6..410f9d7db 100644 --- a/component/ascend-device-plugin/pkg/server/manager_test.go +++ b/component/ascend-device-plugin/pkg/server/manager_test.go @@ -878,12 +878,12 @@ func TestHotReset(t *testing.T) { convey.Convey("When PollImmediate error log warn and return", func() { mockPollImmediate := gomonkey.ApplyFuncReturn(wait.PollImmediate, errors.New("error")) defer mockPollImmediate.Reset() - hdm.hotReset(npuDevice) + hdm.hotReset([]*common.NpuDevice{npuDevice}) }) mockPollImmediate := gomonkey.ApplyFuncReturn(wait.PollImmediate, nil) defer mockPollImmediate.Reset() convey.Convey("When PollImmediate return nil hot rest success", func() { - hdm.hotReset(npuDevice) + hdm.hotReset([]*common.NpuDevice{npuDevice}) }) }) } -- Gitee From 090d0be20062d0b4d7d09e1fa51d4362217772d0 Mon Sep 17 00:00:00 2001 From: zhoupan39 Date: Fri, 22 Aug 2025 16:05:53 +0800 Subject: [PATCH 10/10] =?UTF-8?q?=E3=80=90DevicePlugin=E3=80=91=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=E3=80=91A3=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E8=AE=BE=E5=A4=87=E9=80=82=E9=85=8Dhotreset=3D0?= =?UTF-8?q?=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/ascend-device-plugin/pkg/device/ascend910_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/component/ascend-device-plugin/pkg/device/ascend910_test.go b/component/ascend-device-plugin/pkg/device/ascend910_test.go index 4d35f569e..747b2e64a 100644 --- a/component/ascend-device-plugin/pkg/device/ascend910_test.go +++ b/component/ascend-device-plugin/pkg/device/ascend910_test.go @@ -1956,30 +1956,35 @@ func TestRunProcessTask(t *testing.T) { ApplyPrivateMethod(&HwAscend910Manager{}, "resetProcess", func(*HwAscend910Manager, string, *common.TaskResetInfo, map[string][]*common.NpuDevice) { return }) defer mockFunc.Reset() + const sleepTime = 50 * time.Millisecond convey.Convey("test runProcessTask", t, func() { resetInfo := &common.TaskResetInfo{RankList: make([]*common.TaskDevInfo, 0)} convey.Convey("01-policy level is 2, call restartRequestProcess, should return nil", func() { manager := createFake910Manager() manager.hotResetManager = &HotResetTools{} err := manager.runProcessTask(taskName, common.RestartRequestErrorLevel, resetInfo, nil) + time.Sleep(sleepTime) convey.So(err, convey.ShouldBeNil) }) convey.Convey("02-policy level is 3, call restartProcess, should return nil", func() { manager := createFake910Manager() manager.hotResetManager = &HotResetTools{} err := manager.runProcessTask(taskName, common.RestartErrorLevel, resetInfo, nil) + time.Sleep(sleepTime) convey.So(err, convey.ShouldBeNil) }) convey.Convey("03-policy level is 5, call resetProcess, should return nil", func() { manager := createFake910Manager() manager.hotResetManager = &HotResetTools{} err := manager.runProcessTask(taskName, common.ResetErrorLevel, resetInfo, nil) + time.Sleep(sleepTime) convey.So(err, convey.ShouldBeNil) }) convey.Convey("04-policy level is 6, call resetProcess, should return nil", func() { manager := createFake910Manager() manager.hotResetManager = &HotResetTools{} err := manager.runProcessTask(taskName, common.IsolateErrorLevel, resetInfo, nil) + time.Sleep(sleepTime) convey.So(err, convey.ShouldNotBeNil) }) }) -- Gitee