From 001790e7e93568aa78e4263051146630bbd9576f Mon Sep 17 00:00:00 2001 From: weili10 Date: Sat, 16 Aug 2025 17:00:28 +0800 Subject: [PATCH 1/2] resolve deadlock and device inconsistent problem. --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 6 +++++- torch_npu/csrc/core/npu/NPUQueue.cpp | 12 ++++++++++-- torch_npu/csrc/framework/OpCommand.cpp | 3 +++ torch_npu/csrc/framework/OpCommand.h | 2 ++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 99e326290a..9477538c65 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -2843,6 +2843,8 @@ private: void insert_events(Block *block) { + int pre_device = -1; + NPU_CHECK_ERROR(c10_npu::GetDevice(&pre_device)); aclrtContext compiler_ctx = aclrtContext(); aclError ret_ctx = aclrtGetCurrentContext(&compiler_ctx); NPU_CHECK_ERROR(aclrtSetCurrentContext(c10_npu::GetDeviceContext(block->device))); @@ -2860,7 +2862,9 @@ private: npu_events[stream].emplace_back(std::move(event), block); } if (ret_ctx == ACL_ERROR_NONE) { - NPU_CHECK_ERROR(aclrtSetCurrentContext(compiler_ctx)); + NPU_CHECK_ERROR(aclrtSetCurrentContext(compiler_ctx)); + // Setting context will exchange device implicitly, so we need to reset the cached device here to ensure consistency. 
+ NPU_CHECK_ERROR(c10_npu::SetDevice(pre_device)); } } diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 579514ab37..525537e349 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -5,6 +5,7 @@ #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/framework/OpParamMaker.h" +#include "torch_npu/csrc/framework/OpCommand.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/core/npu/NPUEventManager.h" @@ -249,7 +250,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) // occur. #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; - if (PyGILState_Check() != 0) { + if (PyGILState_Check() != 0 && g_used_aclop) { gilState = PyEval_SaveThread(); } #endif @@ -531,7 +532,14 @@ void Repository::Enqueue(void *cur_paras) if (IsFullQueue()) { #ifndef BUILD_LIBTORCH // double check the current thread hold a Gil lock - if (PyGILState_Check() != 0) { + // and release the GIL to the TE op compiler in case the acl thread deadlocks. + // However, this operation could produce another form of deadlock. + // When thread A destructs a tensor, it will hold the mutex of deviceCachingAllocator and insert an event into the taskqueue. + // If the taskqueue is full, thread A will run into here and release the GIL. + // Once another thread B gets the GIL and triggers GC, it may destruct another tensor + // and try to get deviceCachingAllocator's mutex, which would cause another form of deadlock. + // Since the aclop will be deprecated soon, we just add a using-aclop check here to avoid the second case of deadlock. 
+ if (PyGILState_Check() != 0 && g_used_aclop) { Py_BEGIN_ALLOW_THREADS s = eventfd_read(efd_write, &u); Py_END_ALLOW_THREADS } else { diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 6b98651c51..59022f09e6 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -33,6 +33,8 @@ static std::unordered_map> integral_limits_map {at::ScalarType::Short, {std::numeric_limits::max(), std::numeric_limits::min()}}}; } // namespace +std::atomic g_used_aclop{false}; + namespace at_npu { namespace native { @@ -124,6 +126,7 @@ void OpCommand::Run() { // Check for npu graph if (aclCmd->CheckCustomHandlerNull()) { + g_used_aclop = true; c10_npu::assertNotCapturingAclop(aclCmd->GetName()); } diff --git a/torch_npu/csrc/framework/OpCommand.h b/torch_npu/csrc/framework/OpCommand.h index e60617077c..f30d9fb498 100644 --- a/torch_npu/csrc/framework/OpCommand.h +++ b/torch_npu/csrc/framework/OpCommand.h @@ -10,6 +10,8 @@ #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/framework/utils/NPUDefinition.h" +extern std::atomic g_used_aclop; + namespace at_npu { namespace native { -- Gitee From 73b6d915cb7ba063fe45631f1ffda92004970255 Mon Sep 17 00:00:00 2001 From: weili10 Date: Wed, 20 Aug 2025 11:49:40 +0800 Subject: [PATCH 2/2] update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 925c57ac7e..b23c332eb7 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ from wheel.bdist_wheel import bdist_wheel BASE_DIR = os.path.dirname(os.path.realpath(__file__)) THIRD_PARTY_PATH = os.path.join(BASE_DIR, "third_party") -VERSION = '2.1.0.post14' +VERSION = '2.1.0.post16' UNKNOWN = "Unknown" BUILD_PERMISSION = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP -- Gitee