From c5273c3986085b64081c6d15173b638b40c1673b Mon Sep 17 00:00:00 2001
From: zhanjun
Date: Sun, 4 Feb 2024 09:38:43 +0800
Subject: [PATCH] add query device shut down status interface for v2.1.0

---
 torch_npu/csrc/InitNpuBindings.cpp            |  5 ++++
 torch_npu/csrc/core/npu/NPUBlockHandle.h      |  7 +++++
 .../csrc/core/npu/NPUCachingAllocator.cpp     | 30 +++++++++++++++++++
 torch_npu/csrc/core/npu/NPUCachingAllocator.h |  5 ++++
 4 files changed, 47 insertions(+)

diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp
index 863eb90cd2f..476e51067ee 100644
--- a/torch_npu/csrc/InitNpuBindings.cpp
+++ b/torch_npu/csrc/InitNpuBindings.cpp
@@ -65,6 +65,11 @@ PyObject* THPModule_npu_shutdown(PyObject* /* unused */)
         ASCEND_LOGE("NPUCachingAllocator::emptyCache failed err=:%s", e.what());
     }
 
+    // To prevent entering the insert_events method of tensor destruction after the device has been released,
+    // a state variable called "shutdown_stats" is set during the shutdown process.
+    ASCEND_LOGI("NPU shutdown NPUCachingAllocator setShutdownStats.");
+    c10_npu::NPUCachingAllocator::setShutdownStats();
+
     ASCEND_LOGI("NPU shutdown NpuSysCtrl Finalize.");
     c10_npu::NpuSysCtrl::SysStatus status = c10_npu::NpuSysCtrl::GetInstance().Finalize();
     if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) {
diff --git a/torch_npu/csrc/core/npu/NPUBlockHandle.h b/torch_npu/csrc/core/npu/NPUBlockHandle.h
index 1641b3414c9..f2d98ead9db 100644
--- a/torch_npu/csrc/core/npu/NPUBlockHandle.h
+++ b/torch_npu/csrc/core/npu/NPUBlockHandle.h
@@ -32,5 +32,12 @@ C10_NPU_API void* GetBlockPtr(const void *handle);
 /// @param [in] handle: the block handle to query size
 /// @return size: the device memory size managed by block
 C10_NPU_API size_t GetBlockSize(const void *handle);
+
+/// @ingroup torch_npu
+/// @brief Get the shut down status of the device that owns the block, according to handle
+/// @param [in] handle: the block handle to query device shut down status
+/// @return status: true  : the device has been marked as shut down
+///                 false : the device was still active when queried
+C10_NPU_API bool GetDeviceShutDownStatusByHandle(const void *handle);
 } // namespace NPUCachingAllocator
 } // namespace c10_npu
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
index 87e3f96f474..2c601cf69f3 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
@@ -775,6 +775,9 @@ class DeviceCachingAllocator {
 
   bool set_fraction = false;
 
+  // whether the device has been marked as shut down (set once by setShutdownStats)
+  bool shutdown_stats = false;
+
  public:
 
   DeviceCachingAllocator() :
@@ -1079,6 +1082,18 @@ class DeviceCachingAllocator {
     release_cached_blocks(check_error);
   }
 
+  /** mark the device as shut down (sets shutdown_stats to true under the allocator mutex) **/
+  void setShutdownStats() {
+    std::lock_guard lock(mutex);
+    shutdown_stats = true;
+  }
+
+  /** get the device shut down status (reads shutdown_stats under the allocator mutex) **/
+  bool getShutdownStats() {
+    std::lock_guard lock(mutex);
+    return shutdown_stats;
+  }
+
   /** Retrieves info (total size + largest block) of the memory cache **/
   void cacheInfo(size_t* total, size_t* largest) {
     std::lock_guard lock(mutex);
@@ -2155,6 +2170,13 @@ class NpuCachingAllocator : public NPUAllocator {
     {
         return "native";
     }
+
+    void setShutdownStats() override {
+        auto count = device_allocator.size();
+        for (size_t i = 0U; i < count; ++i) {
+            device_allocator[i]->setShutdownStats();
+        }
+    }
 };
 
 NpuCachingAllocator caching_allocator;
@@ -2201,6 +2223,14 @@ size_t GetBlockSize(const void *handle) {
     return block->size;
 }
 
+bool GetDeviceShutDownStatusByHandle(const void *handle) {
+    const Block *block = reinterpret_cast<const Block *>(handle);
+    AT_ASSERT(block);
+    caching_allocator.assertValidDevice(block->device);
+    AT_ASSERT(caching_allocator.device_allocator[block->device]);
+    return caching_allocator.device_allocator[block->device]->getShutdownStats();
+}
+
 struct BackendStaticInitializer {
     BackendStaticInitializer()
     {
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
index fe9d63061eb..35c57d351f7 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
@@ -136,6 +136,7 @@ public:
     virtual std::vector snapshot() = 0;
     virtual void FreeDeviceCachedMemory(int device) = 0;
     virtual std::string name() = 0;
+    virtual void setShutdownStats() = 0;
 };
 
 // Allocator object, statically initialized
@@ -234,5 +235,9 @@ inline std::string name()
     return get()->name();
 }
 
+inline void setShutdownStats() {
+    get()->setShutdownStats();
+}
+
 } // namespace NPUCachingAllocator
 } // namespace c10_npu
-- 
Gitee