From c5273c3986085b64081c6d15173b638b40c1673b Mon Sep 17 00:00:00 2001
From: zhanjun <zhanjun6@hisilicon.com>
Date: Sun, 4 Feb 2024 09:38:43 +0800
Subject: [PATCH] add query device shut down status interface for v2.1.0

---
 torch_npu/csrc/InitNpuBindings.cpp            |  5 ++++
 torch_npu/csrc/core/npu/NPUBlockHandle.h      |  7 +++++
 .../csrc/core/npu/NPUCachingAllocator.cpp     | 30 +++++++++++++++++++
 torch_npu/csrc/core/npu/NPUCachingAllocator.h |  5 ++++
 4 files changed, 47 insertions(+)
diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp
index 863eb90cd2f..476e51067ee 100644
--- a/torch_npu/csrc/InitNpuBindings.cpp
+++ b/torch_npu/csrc/InitNpuBindings.cpp
@@ -65,6 +65,11 @@ PyObject* THPModule_npu_shutdown(PyObject* /* unused */)
         ASCEND_LOGE("NPUCachingAllocator::emptyCache failed err=:%s", e.what());
     }
 
+    // Set the "shutdown_stats" state variable before the device is released, so that tensor
+    // destruction occurring afterwards does not enter the insert_events method.
+    ASCEND_LOGI("NPU shutdown NPUCachingAllocator setShutdownStats.");
+    c10_npu::NPUCachingAllocator::setShutdownStats();
+
     ASCEND_LOGI("NPU shutdown NpuSysCtrl Finalize.");
     c10_npu::NpuSysCtrl::SysStatus status = c10_npu::NpuSysCtrl::GetInstance().Finalize();
     if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) {
diff --git a/torch_npu/csrc/core/npu/NPUBlockHandle.h b/torch_npu/csrc/core/npu/NPUBlockHandle.h
index 1641b3414c9..f2d98ead9db 100644
--- a/torch_npu/csrc/core/npu/NPUBlockHandle.h
+++ b/torch_npu/csrc/core/npu/NPUBlockHandle.h
@@ -32,5 +32,12 @@ C10_NPU_API void* GetBlockPtr(const void *handle);
 /// @param [in] handle: the block handle to query size
 /// @return size: the device memory size managed by block
 C10_NPU_API size_t GetBlockSize(const void *handle);
+
+/// @ingroup torch_npu
+/// @brief Get the shutdown status of the device that owns the block identified by handle
+/// @param [in] handle: the block handle whose device shutdown status is queried
+/// @return bool: true  : the device has been shut down
+///               false : the device is still active at query time
+C10_NPU_API bool GetDeviceShutDownStatusByHandle(const void *handle);
 } // namespace NPUCachingAllocator
 } // namespace c10_npu
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
index 87e3f96f474..2c601cf69f3 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
@@ -775,6 +775,9 @@ class DeviceCachingAllocator {
 
   bool set_fraction = false;
 
+  // shutdown flag: set to true by setShutdownStats(), read through getShutdownStats()
+  bool shutdown_stats = false;
+
  public:
 
   DeviceCachingAllocator() :
@@ -1079,6 +1082,18 @@ class DeviceCachingAllocator {
     release_cached_blocks(check_error);
   }
 
+  /** marks this device allocator as shut down: sets shutdown_stats to true while holding mutex **/
+  void setShutdownStats() {
+    std::lock_guard<std::recursive_mutex> lock(mutex);
+    shutdown_stats = true;
+  }
+
+  /** gets the device shut down status: returns shutdown_stats, which setShutdownStats() sets to true **/
+  bool getShutdownStats() {
+    std::lock_guard<std::recursive_mutex> lock(mutex);
+    return shutdown_stats;
+  }
+
   /** Retrieves info (total size + largest block) of the memory cache **/
   void cacheInfo(size_t* total, size_t* largest) {
     std::lock_guard<std::recursive_mutex> lock(mutex);
@@ -2155,6 +2170,13 @@ class NpuCachingAllocator : public NPUAllocator {
   {
     return "native";
   }
+
+  /** Marks every per-device allocator as shut down (DeviceCachingAllocator::setShutdownStats). **/
+  void setShutdownStats() override {
+    for (auto &allocator : device_allocator) {
+      allocator->setShutdownStats();
+    }
+  }
 };
 
 NpuCachingAllocator caching_allocator;
@@ -2201,6 +2223,14 @@ size_t GetBlockSize(const void *handle) {
   return block->size;
 }
 
+bool GetDeviceShutDownStatusByHandle(const void *handle) {  // documented in NPUBlockHandle.h
+  const Block *block = reinterpret_cast<const Block *>(handle);  // the opaque handle is a Block pointer
+  AT_ASSERT(block);
+  caching_allocator.assertValidDevice(block->device);  // checked before block->device is used as an index
+  AT_ASSERT(caching_allocator.device_allocator[block->device]);
+  return caching_allocator.device_allocator[block->device]->getShutdownStats();  // flag of the block's device
+}
+
 struct BackendStaticInitializer {
     BackendStaticInitializer()
     {
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
index fe9d63061eb..35c57d351f7 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
@@ -136,6 +136,7 @@ public:
     virtual std::vector<SegmentInfo> snapshot() = 0;
     virtual void FreeDeviceCachedMemory(int device) = 0;
     virtual std::string name() = 0;
+    virtual void setShutdownStats() = 0;
 };
 
 // Allocator object, statically initialized
@@ -234,5 +235,9 @@ inline std::string name()
     return get()->name();
 }
 
+inline void setShutdownStats() {  // inline: defined in a header, so it must not emit one definition per TU
+  get()->setShutdownStats();  // forwards to the active allocator, which marks its device allocators as shut down
+}
+
 } // namespace NPUCachingAllocator
 } // namespace c10_npu
-- 
Gitee