From 59a55f0ef3684fb88cd3e4a87122ccde768a89df Mon Sep 17 00:00:00 2001
From: huyuanquan1
Date: Sat, 16 Aug 2025 16:46:59 +0800
Subject: [PATCH] [Feat] add light-weight version of empty_cache()

---
 test/torch_npu_schema.json                    |  6 ++
 .../csrc/core/npu/NPUCachingAllocator.cpp     | 74 +++++++++++++------
 torch_npu/csrc/core/npu/NPUCachingAllocator.h |  6 +-
 torch_npu/csrc/npu/Module.cpp                 | 11 ++-
 torch_npu/csrc/npu/NPUPluggableAllocator.cpp  |  6 +-
 torch_npu/csrc/npu/NPUPluggableAllocator.h    |  6 +-
 torch_npu/npu/__init__.py                     |  1 +
 torch_npu/npu/memory.py                       |  9 +++
 8 files changed, 85 insertions(+), 34 deletions(-)

diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json
index ce55facef82..56be993b93d 100644
--- a/test/torch_npu_schema.json
+++ b/test/torch_npu_schema.json
@@ -1076,6 +1076,9 @@
     "torch_npu.npu.empty_cache": {
         "signature": "()"
     },
+    "torch_npu.npu.empty_virt_addr_cache": {
+        "signature": "()"
+    },
     "torch_npu.npu.enable_deterministic_with_backward": {
         "signature": "(tensor: torch.Tensor)"
     },
@@ -1418,6 +1421,9 @@
     "torch_npu.npu.memory.empty_cache": {
         "signature": "()"
     },
+    "torch_npu.npu.memory.empty_virt_addr_cache": {
+        "signature": "()"
+    },
     "torch_npu.npu.memory.get_allocator_backend": {
         "signature": "() -> str"
     },
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
index e42e354a8c0..a8a84ca520e 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
@@ -188,6 +188,8 @@ struct BlockPool {
     std::set<Block*, Comparison> unmapped;
     const bool is_small;
     PrivatePool *owner_PrivatePool;
+    // stores physical handles released by unmap() for later reuse
+    std::vector<aclrtDrvMemHandle> free_physical_handles_;
 
     BlockPool(bool small, PrivatePool *private_pool = nullptr)
         : blocks(BlockComparatorSize),
@@ -404,7 +406,7 @@ struct ExpandableSegment {
     // returns the actual range mapped, which may be
     // greater than requested if size is not aligned to segment_size_.
    // return size of 0 indicates OOM
-    SegmentRange map(SegmentRange range)
+    SegmentRange map(SegmentRange range, BlockPool *pool)
     {
         auto begin = segmentLeft(range.ptr);
         auto end = segmentRight(range.ptr + range.size);
@@ -418,6 +420,13 @@ struct ExpandableSegment {
         for (auto i : c10::irange(begin, end)) {
             TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::VALUE));
             aclrtDrvMemHandle handle = nullptr;
+            if (!pool->free_physical_handles_.empty()) {
+                ASCEND_LOGD("Remap cached physical handles for block %zu", i);
+                handle = pool->free_physical_handles_.back();
+                pool->free_physical_handles_.pop_back();
+                handles_.at(i) = Handle{handle, std::nullopt};
+                continue;
+            }
             aclrtPhysicalMemProp prop = {};
             prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
             prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
@@ -425,6 +434,7 @@ struct ExpandableSegment {
             prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
             prop.location.id = static_cast<uint32_t>(device_);
             prop.reserve = 0;
+            ASCEND_LOGD("Alloc memory from physical device for block %zu", i);
             auto status = c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0);
             if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) {
                 for (auto j : c10::irange(begin, i)) {
@@ -449,14 +459,14 @@ struct ExpandableSegment {
     // unmaps all the completely empty segment_size_ segments between
     // [begin, begin + size), returns the offset where the range begin,
     // and the actual size unmapped (multiple of segment_size_)
-    SegmentRange unmap(SegmentRange range)
+    SegmentRange unmap(SegmentRange range, BlockPool *pool)
     {
         auto begin = segmentRight(range.ptr);
         auto end = segmentLeft(range.ptr + range.size);
         if (begin >= end) {
             return SegmentRange{ range.ptr, 0 };
         }
-        unmapHandles(begin, end);
+        unmapHandles(begin, end, pool);
         return rangeFromHandles(begin, end);
     }

@@ -563,7 +573,7 @@ private:
         ASCEND_LOGD("NPUCachingAllocator mapAndSetAccess: segment_size=%zu", segment_size_);
     }
 
-    void unmapHandles(size_t begin, size_t end)
+    void unmapHandles(size_t begin, size_t end, BlockPool *pool = nullptr)
     {
         // note: unlike aclrtFree, MemUnmap and MemRelease do
         // not appear to synchronize in all cases, so we have to wait for the
@@ -595,7 +605,11 @@ private:
                     continue;
                 }
            }
-            NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h.handle));
+            if (!pool) {
+                NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h.handle));
+            } else {
+                pool->free_physical_handles_.push_back(h.handle);
+            }
        }
        ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_);
        trimHandles();
@@ -1389,7 +1403,7 @@ public:
                 c10_npu::npuSynchronizeDevice(true);
             }
             c10_npu::NPUWorkspaceAllocator::emptyCache(device, true);
-            block_found = (release_cached_blocks(true, context) && alloc_block(params, true, context, lock));
+            block_found = (release_cached_blocks(true, context, true) && alloc_block(params, true, context, lock));
         }
 
         if (!block_found) {
@@ -1738,14 +1752,14 @@ public:
     }
 
     /** returns cached blocks to the system allocator **/
-    void emptyCache(int device, bool check_error)
+    void emptyCache(int device, bool check_error, bool free_physical)
     {
         std::shared_ptr<GatheredContext> context = maybeGatherContext(RecordContext::ALL);
         // Make sure event deque from taskqueue, then synchronize Event
         c10_npu::npuSynchronizeDevice(check_error);
         std::lock_guard<std::recursive_mutex> lock(mutex);
         c10_npu::NPUWorkspaceAllocator::emptyCache(device, check_error);
-        release_cached_blocks(check_error, context);
+        release_cached_blocks(check_error, context, free_physical);
     }
 
     void buildServerMemMapForHccl(std::shared_ptr hcclComm)
@@ -2319,12 +2333,12 @@ private:
         return candidate;
     }
 
-    bool map_block(Block *to_map, size_t size, const std::shared_ptr<GatheredContext> &ctx)
+    bool map_block(Block *to_map, size_t size, const std::shared_ptr<GatheredContext> &ctx, BlockPool *map_pool)
     {
         TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size, PTA_ERROR(ErrCode::VALUE));
         TORCH_INTERNAL_ASSERT(!to_map->context_when_allocated); // unmapped blocks should not keep
                                                                 // history
-        auto mapped_range = to_map->expandable_segment_->map(SegmentRange{ to_map->ptr, size });
+        auto mapped_range = to_map->expandable_segment_->map(SegmentRange{ to_map->ptr, size }, map_pool);
         // failed to map the memory
         if (mapped_range.size == 0) {
             return false;
         }
@@ -2375,7 +2389,7 @@ private:
 
         // unmapped -> free -> *
         // free -> unmapped -> *
-        if (!candidate->mapped && !map_block(candidate, std::min(candidate->size, size), ctx)) {
+        if (!candidate->mapped && !map_block(candidate, std::min(candidate->size, size), ctx, pool)) {
             return nullptr;
         }
         TORCH_INTERNAL_ASSERT(candidate->mapped, PTA_ERROR(ErrCode::INTERNAL));
@@ -2388,7 +2402,7 @@ private:
             if (C10_UNLIKELY(new_candidate == nullptr)) {
                 return nullptr;
             }
-            if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) {
+            if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx, pool)) {
                 return nullptr;
             }
             candidate = new_candidate;
@@ -2801,21 +2815,21 @@ private:
     }
 
     // npuSynchronizeDevice must be executed before this function can be called
-    bool release_cached_blocks(bool check_error, const std::shared_ptr<GatheredContext> &context)
+    bool release_cached_blocks(bool check_error, const std::shared_ptr<GatheredContext> &context, bool free_physical)
     {
         // First ensure that all blocks that can't currently be allocated due to
         // outstanding events are returned to the pool.
         synchronize_and_free_events(check_error, context);
 
         // Free all non-split cached blocks
-        release_blocks(large_blocks, context);
-        release_blocks(small_blocks, context);
+        release_blocks(large_blocks, context, free_physical);
+        release_blocks(small_blocks, context, free_physical);
 
         for (auto it = graph_pools_freeable.begin(); it != graph_pools_freeable.end();) {
             // See notifyCaptureDestroy for the strategy here.
             TORCH_INTERNAL_ASSERT(it->second->use_count == 0);
-            release_blocks(it->second->small_blocks, context);
-            release_blocks(it->second->large_blocks, context);
+            release_blocks(it->second->small_blocks, context, free_physical);
+            release_blocks(it->second->large_blocks, context, free_physical);
             if (it->second->npuMalloc_count == 0) {
                 auto erase_count = graph_pools.erase(it->first);
                 TORCH_INTERNAL_ASSERT(erase_count == 1);
@@ -2883,9 +2897,10 @@ private:
         block = nullptr;
     }
 
-    void unmap_block(Block *block, const std::shared_ptr<GatheredContext> &context)
+    void unmap_block(Block *block, const std::shared_ptr<GatheredContext> &context, bool free_physical)
     {
-        auto unmapped = block->expandable_segment_->unmap(SegmentRange{ block->ptr, block->size });
+        auto pool = free_physical ? nullptr : block->pool;
+        auto unmapped = block->expandable_segment_->unmap(SegmentRange{ block->ptr, block->size }, pool);
         if (unmapped.size == 0) {
             return;
         }
@@ -2934,7 +2949,7 @@ private:
             context ? context : block->context_when_segment_allocated);
     }
 
-    void release_blocks(BlockPool &pool, const std::shared_ptr<GatheredContext> &context)
+    void release_blocks(BlockPool &pool, const std::shared_ptr<GatheredContext> &context, bool free_physical)
     {
         std::vector<Block*> to_unmap;
         // Frees all non-split blocks
@@ -2952,11 +2967,19 @@ private:
             }
         }
         for (Block *block : to_unmap) {
-            unmap_block(block, context);
+            unmap_block(block, context, free_physical);
             if (!block->prev && !block->next) {
                 release_expandable_segment(block);
             }
         }
+        // free cached physical handles
+        if (free_physical) {
+            while (!pool.free_physical_handles_.empty()) {
+                aclrtDrvMemHandle handle = pool.free_physical_handles_.back();
+                NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(handle));
+                pool.free_physical_handles_.pop_back();
+            }
+        }
     }
 
     EventPool::Event create_event_internal(int idx)
@@ -3315,8 +3338,11 @@ public:
         }
     }
 
-    void emptyCache(bool check_error) override
+    void emptyCache(bool check_error, bool free_physical) override
     {
+        if (!free_physical && !CachingAllocatorConfig::expandable_segments()) {
+            AT_ERROR("Unsupported config for empty_virt_addr_cache, please enable expandable_segments.");
+        }
         ASCEND_LOGD("Begin empty cache with check_error = %d", check_error);
         int32_t current_device = 0;
         if (check_error) {
@@ -3331,7 +3357,7 @@ public:
             } else {
                 NPU_CHECK_WARN(c10_npu::SetDevice(device_idx));
             }
-            device_allocator[device_idx]->emptyCache(device_idx, check_error);
+            device_allocator[device_idx]->emptyCache(device_idx, check_error, free_physical);
         }
         if (check_error) {
             NPU_CHECK_ERROR(c10_npu::MaybeSetDevice(current_device));
@@ -3731,7 +3757,7 @@ public:
 
     void FreeDeviceCachedMemory(int device) override
     {
-        device_allocator[device]->emptyCache(device, true);
+        device_allocator[device]->emptyCache(device, true, true);
     }
 
     std::string name() override
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
index 13c68aa0e3f..f1deb6a1a2b 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
@@ -202,7 +202,7 @@ public:
     virtual void init(int device_count) = 0;
     virtual bool initialized() = 0;
    virtual void setMemoryFraction(double fraction, int device) = 0;
-    virtual void emptyCache(bool check_error) = 0;
+    virtual void emptyCache(bool check_error, bool free_physical) = 0;
     virtual void clearIpcHandles() = 0;
     virtual void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) = 0;
     virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
@@ -306,9 +306,9 @@ inline void setMemoryFraction(double fraction, int device)
     return get()->setMemoryFraction(fraction, device);
 }
 
-C10_NPU_API inline void emptyCache(bool check_error = true)
+C10_NPU_API inline void emptyCache(bool check_error = true, bool free_physical = true)
 {
-    return get()->emptyCache(check_error);
+    return get()->emptyCache(check_error, free_physical);
 }
 
 inline void clearIpcHandles()
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index c66f2a8d523..e6361be59b1 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -276,7 +276,7 @@ void RegisterNpuPluggableAllocator(PyObject* module)
         "set_reset_fn",
         [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self,
            uint64_t func_ptr) {
-            using FuncType = void(bool);
+            using FuncType = void(bool, bool);
             std::function<FuncType> func =
                 reinterpret_cast<FuncType*>(func_ptr);
             self.set_reset_fn(func);
@@ -1017,6 +1017,14 @@ PyObject* THNPModule_emptyCache(PyObject *_unused, PyObject *noargs)
     Py_RETURN_NONE;
 }
 
+PyObject* THNPModule_emptyVirtAddrCache(PyObject *_unused, PyObject *noargs)
+{
+    HANDLE_TH_ERRORS
+    c10_npu::NPUCachingAllocator::emptyCache(true, false);
+    END_HANDLE_TH_ERRORS
+    Py_RETURN_NONE;
+}
+
 PyObject* THNPModule_memoryStats(PyObject *_unused, PyObject *arg)
 {
     HANDLE_TH_ERRORS
@@ -1860,6 +1868,7 @@ static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_is_jit_compile_false", (PyCFunction)THNPModule_is_jit_compile_false_wrap, METH_NOARGS, nullptr},
     {"_npu_setMemoryFraction", (PyCFunction) THNPModule_setMemoryFraction, METH_VARARGS, nullptr},
     {"_npu_emptyCache", (PyCFunction) THNPModule_emptyCache, METH_NOARGS, nullptr},
+    {"_npu_emptyVirtAddrCache", (PyCFunction) THNPModule_emptyVirtAddrCache, METH_NOARGS, nullptr},
     {"_npu_memoryStats", (PyCFunction) THNPModule_memoryStats, METH_O, nullptr},
     {"_npu_resetAccumulatedMemoryStats", (PyCFunction) THNPModule_resetAccumulatedMemoryStats, METH_O, nullptr},
     {"_npu_resetPeakMemoryStats", (PyCFunction) THNPModule_resetPeakMemoryStats, METH_O, nullptr},
diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
index 7610374a3ba..cf24bdbc25f 100644
--- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
+++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
@@ -45,7 +45,7 @@ void NPUPluggableAllocator::set_init_fn(std::function<void(int)> init_fn)
     init_fn_ = std::move(init_fn);
 }
 
-void NPUPluggableAllocator::set_reset_fn(std::function<void(bool)> reset_fn)
+void NPUPluggableAllocator::set_reset_fn(std::function<void(bool, bool)> reset_fn)
 {
     reset_fn_ = std::move(reset_fn);
 }
@@ -182,10 +182,10 @@ void NPUPluggableAllocator::setMemoryFraction(double fraction, int device)
     }
 }
 
-void NPUPluggableAllocator::emptyCache(bool check_error)
+void NPUPluggableAllocator::emptyCache(bool check_error, bool free_physical)
 {
     if (reset_fn_) {
-        return reset_fn_(check_error);
+        return reset_fn_(check_error, free_physical);
     }
 }
diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h
index 266db02a604..d40087eb750 100644
--- a/torch_npu/csrc/npu/NPUPluggableAllocator.h
+++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h
@@ -37,7 +37,7 @@ struct NPUPluggableAllocator
     NPUPluggableAllocator(NPUPluggableAllocator& other);
 
     void set_init_fn(std::function<void(int)> init_fn);
-    void set_reset_fn(std::function<void(bool)> reset_fn);
+    void set_reset_fn(std::function<void(bool, bool)> reset_fn);
     void set_memory_fraction_fn(
         std::function<void(double, int)> memory_fraction_fn);
     void set_base_alloc_fn(std::function<void*(void*, size_t*)> base_alloc_fn);
@@ -59,7 +59,7 @@ struct NPUPluggableAllocator
     void init(int device_count) override;
     bool initialized() override;
     void setMemoryFraction(double fraction, int device) override;
-    void emptyCache(bool check_error) override;
+    void emptyCache(bool check_error, bool free_physical) override;
     void clearIpcHandles() override;
     void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) override;
     void* getBaseAllocation(void* ptr, size_t* size) override;
@@ -108,7 +108,7 @@ protected:
     std::function<void*(size_t, int, aclrtStream)> alloc_fn_;
     std::function<void(void*, size_t, int, aclrtStream)> free_fn_;
     std::function<void(int)> init_fn_;
-    std::function<void(bool)> reset_fn_;
+    std::function<void(bool, bool)> reset_fn_;
     std::function<void(double, int)> memory_fraction_fn_;
     std::function<void*(void*, size_t*)> base_alloc_fn_;
     std::function<void(void*, aclrtStream)> record_stream_fn_;
diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py
index 75bf03d13a2..33db3e1fa7e 100644
--- a/torch_npu/npu/__init__.py
+++ b/torch_npu/npu/__init__.py
@@ -33,6 +33,7 @@ __all__ = [
     "caching_allocator_delete",
     "set_per_process_memory_fraction",
     "empty_cache",
+    "empty_virt_addr_cache",
     "memory_stats",
     "memory_stats_as_nested_dict",
"reset_accumulated_memory_stats", diff --git a/torch_npu/npu/memory.py b/torch_npu/npu/memory.py index 74475787829..ad5c8160438 100644 --- a/torch_npu/npu/memory.py +++ b/torch_npu/npu/memory.py @@ -20,6 +20,7 @@ __all__ = [ "caching_allocator_delete", "set_per_process_memory_fraction", "empty_cache", + "empty_virt_addr_cache", "memory_stats", "memory_stats_as_nested_dict", "reset_accumulated_memory_stats", @@ -158,6 +159,14 @@ def empty_cache(): torch_npu._C._npu_emptyCache() +def empty_virt_addr_cache(): + r"""Light-weight version of empty_cache(). It only unmaps virtual address, + and store the free physical handles for later malloc. + """ + if is_initialized(): + torch_npu._C._npu_emptyVirtAddrCache() + + def memory_stats(device=None): """Returns a dictionary of NPU memory allocator statistics for a given device. -- Gitee