From 001790e7e93568aa78e4263051146630bbd9576f Mon Sep 17 00:00:00 2001 From: weili10 Date: Sat, 16 Aug 2025 17:00:28 +0800 Subject: [PATCH 1/2] resolve deadlock and device inconsistent problem. --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 6 +++++- torch_npu/csrc/core/npu/NPUQueue.cpp | 12 ++++++++++-- torch_npu/csrc/framework/OpCommand.cpp | 3 +++ torch_npu/csrc/framework/OpCommand.h | 2 ++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 99e326290a..9477538c65 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -2843,6 +2843,8 @@ private: void insert_events(Block *block) { + int pre_device = -1; + NPU_CHECK_ERROR(c10_npu::GetDevice(&pre_device)); aclrtContext compiler_ctx = aclrtContext(); aclError ret_ctx = aclrtGetCurrentContext(&compiler_ctx); NPU_CHECK_ERROR(aclrtSetCurrentContext(c10_npu::GetDeviceContext(block->device))); @@ -2860,7 +2862,9 @@ private: npu_events[stream].emplace_back(std::move(event), block); } if (ret_ctx == ACL_ERROR_NONE) { - NPU_CHECK_ERROR(aclrtSetCurrentContext(compiler_ctx)); + NPU_CHECK_ERROR(aclrtSetCurrentContext(compiler_ctx)); + // Setting context will exchange device implicitly, so we need to reset the cached device here to ensure consistency. 
+ NPU_CHECK_ERROR(c10_npu::SetDevice(pre_device)); } } diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 579514ab37..525537e349 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -5,6 +5,7 @@ #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/framework/OpParamMaker.h" +#include "torch_npu/csrc/framework/OpCommand.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/core/npu/NPUEventManager.h" @@ -249,7 +250,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) // occur. #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; - if (PyGILState_Check() != 0) { + if (PyGILState_Check() != 0 && g_used_aclop) { gilState = PyEval_SaveThread(); } #endif @@ -531,7 +532,14 @@ void Repository::Enqueue(void *cur_paras) if (IsFullQueue()) { #ifndef BUILD_LIBTORCH // double check the current thread hold a Gil lock - if (PyGILState_Check() != 0) { + // and release the GIL to the TE op compiler in case the acl thread deadlocks. + // However, this operation could produce another form of deadlock. + // When thread A destructs a tensor, it will hold the mutex of deviceCachingAllocator and insert an event into the taskqueue. + // If the taskqueue is full, thread A will run into here and release the GIL. + // Once another thread B gets the GIL and triggers GC, it may destruct another tensor + // and try to get deviceCachingAllocator's mutex, which would cause another form of deadlock. + // Since the aclop will be deprecated soon, we just add a using-aclop check here to avoid the second case of deadlock. 
+ if (PyGILState_Check() != 0 && g_used_aclop) { Py_BEGIN_ALLOW_THREADS s = eventfd_read(efd_write, &u); Py_END_ALLOW_THREADS } else { diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 6b98651c51..59022f09e6 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -33,6 +33,8 @@ static std::unordered_map> integral_limits_map {at::ScalarType::Short, {std::numeric_limits::max(), std::numeric_limits::min()}}}; } // namespace +std::atomic g_used_aclop{false}; + namespace at_npu { namespace native { @@ -124,6 +126,7 @@ void OpCommand::Run() { // Check for npu graph if (aclCmd->CheckCustomHandlerNull()) { + g_used_aclop = true; c10_npu::assertNotCapturingAclop(aclCmd->GetName()); } diff --git a/torch_npu/csrc/framework/OpCommand.h b/torch_npu/csrc/framework/OpCommand.h index e60617077c..f30d9fb498 100644 --- a/torch_npu/csrc/framework/OpCommand.h +++ b/torch_npu/csrc/framework/OpCommand.h @@ -10,6 +10,8 @@ #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/framework/utils/NPUDefinition.h" +extern std::atomic g_used_aclop; + namespace at_npu { namespace native { -- Gitee From 73b6d915cb7ba063fe45631f1ffda92004970255 Mon Sep 17 00:00:00 2001 From: weili10 Date: Wed, 20 Aug 2025 11:49:40 +0800 Subject: [PATCH 2/2] update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 925c57ac7e..b23c332eb7 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ from wheel.bdist_wheel import bdist_wheel BASE_DIR = os.path.dirname(os.path.realpath(__file__)) THIRD_PARTY_PATH = os.path.join(BASE_DIR, "third_party") -VERSION = '2.1.0.post14' +VERSION = '2.1.0.post16' UNKNOWN = "Unknown" BUILD_PERMISSION = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP -- Gitee