diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp
index 1f8cd124284b1881d30495779054a9a0a586d59c..45f821fe96c57a77913c9e3a700d42f853ba07d1 100644
--- a/torch_npu/csrc/core/npu/NPUQueue.cpp
+++ b/torch_npu/csrc/core/npu/NPUQueue.cpp
@@ -446,6 +446,17 @@ bool Repository::ReadQueue()
         }
         ClearQueue();
         c10_npu::NPUEventManager::GetInstance().ClearUnrecordedCount();
+        if (GetStatus() == RepoStatus::STOP_EXIT) {
+            // The "stop_device" function will first set the "FORCE STOP" state, and then call the "devicetaskabort" interface.
+            // In a theoretical scenario, it is possible that before setting the FORCE STOP state,
+            // the dequeue thread had already got a task and was preparing to dispatch it.
+            // After calling the "devicetaskabort" interface, the task was finally ready to be dispatched.
+            // At this point, if the execution of this task fails, there will be an error state in the device,
+            // and it needs to be handled through the synchronization interface.
+            auto acl_ret = c10_npu::acl::AclrtSynchronizeDeviceWithTimeout();
+            ASCEND_LOGI("ReadQueue: SynchronizeDevice with FORCE STOP, device = %d, write_idx = %u, read_idx = %u, status = %d, ret = %d, acl_ret = %d",
+                        device_idx, write_idx.idx, read_idx.idx, GetStatus(), ret, acl_ret);
+        }
         return false;
     }
 
@@ -459,6 +470,17 @@ bool Repository::ReadQueue()
 
     read_idx.idx = (read_idx.idx + 1) & (kQueueCapacity - 1);
 
+    if (GetStatus() == RepoStatus::STOP_EXIT) {
+        // The "stop_device" function will first set the "FORCE STOP" state, and then call the "devicetaskabort" interface.
+        // In a theoretical scenario, it is possible that before setting the FORCE STOP state,
+        // the dequeue thread had already got a task and was preparing to dispatch it.
+        // After calling the "devicetaskabort" interface, the task was finally ready to be dispatched.
+        // At this point, if the execution of this task fails, there will be an error state in the device,
+        // and it needs to be handled through the synchronization interface.
+        auto acl_ret = c10_npu::acl::AclrtSynchronizeDeviceWithTimeout();
+        ASCEND_LOGI("ReadQueue: SynchronizeDevice with FORCE STOP, device = %d, write_idx = %u, read_idx = %u, status = %d, ret = %d, acl_ret = %d",
+                    device_idx, write_idx.idx, read_idx.idx, GetStatus(), ret, acl_ret);
+    }
     return true;
 }
 
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index dd0f006a17486efe71f83631d07a61643c843aea..d218a480457bee961ea6f9fcfa1fac2d2b6b8db9 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -636,9 +636,10 @@ PyObject* THNPModule_stopDevice_wrap(PyObject* self, PyObject* arg)
 {
     HANDLE_TH_ERRORS
     int device = THPUtils_unpackLong(arg);
+    ASCEND_LOGI("NPU stop device start, device is %d.", device);
     setDefaultStreamsStatus(device, c10_npu::RepoStatus::STOP_EXIT);
     int ret = c10_npu::acl::AclrtDeviceTaskAbort(device);
-    ASCEND_LOGI("NPU stop device success, device is %d, ret is %d.", device, ret);
+    ASCEND_LOGI("NPU stop device end, device is %d, ret is %d.", device, ret);
     if (ret == 0) {
         return PyLong_FromLong(0);
     } else {