diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 42800ac6c0e736695445435f88f7aba6d8234a5a..e42e354a8c003dae43fca375a97ec5a11e8a7db4 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1388,7 +1388,7 @@ public: // Make sure taskqueue is empty, then execute release_cached_blocks c10_npu::npuSynchronizeDevice(true); } - c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, true); + c10_npu::NPUWorkspaceAllocator::emptyCache(device, true); block_found = (release_cached_blocks(true, context) && alloc_block(params, true, context, lock)); } @@ -1744,7 +1744,7 @@ public: // Make sure event deque from taskqueue, then synchronize Event c10_npu::npuSynchronizeDevice(check_error); std::lock_guard lock(mutex); - c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, check_error); + c10_npu::NPUWorkspaceAllocator::emptyCache(device, check_error); release_cached_blocks(check_error, context); } diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index bf01df2a8d2009a8d08e9b609a89fb05bc285176..d599744c7ef4764d47da48d50425974588b04f0a 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -221,21 +221,9 @@ public: } // return to the system allocator - void empty_cache(bool need_empty_queue, bool check_error) + void empty_cache(bool check_error) { - if (need_empty_queue) { - ASCEND_LOGI("NPUWorkspaceAllocator empty_cache in main_thread."); - c10_npu::emptyAllNPUStream(check_error); - } else { - ASCEND_LOGI("NPUWorkspaceAllocator empty_cache in acl_thread."); - } - - auto acl_ret = c10_npu::acl::AclrtSynchronizeDeviceWithTimeout(); - if (check_error) { - NPU_CHECK_ERROR(acl_ret, "AclrtSynchronizeDeviceWithTimeout"); - } else { - NPU_CHECK_WARN(acl_ret); - } + ASCEND_LOGD("NPUWorkspaceAllocator begin empty cache with check_error = %d", check_error); for (const auto& block_pair : blocks) { if (block_pair.second->data_ptr != nullptr) { @@ -274,6 +262,7 @@ public: } blocks.clear(); + ASCEND_LOGD("NPUWorkspaceAllocator end empty cache with check_error = %d", check_error); } void record_history(bool enabled, CreateContextFn context_recorder, RecordContext when) @@ -431,13 +420,6 @@ public: auto src_ptr = static_cast(device_allocator[device]->getStreamPtr(stream)); *new_ptr = static_cast(device_allocator[device]->malloc(size, stream)); - // Free all cached blocks and try again. - if ((*new_ptr) == nullptr) { - device_allocator[device]->empty_cache(false, true); - c10_npu::NPUCachingAllocator::emptyCache(true); - *new_ptr = static_cast(device_allocator[device]->malloc(size, stream)); - } - if ((*new_ptr) == nullptr) { size_t device_free; size_t device_total; @@ -450,7 +432,8 @@ public: format_size(device_total), " total capacity; ", format_size(device_free), - " free)", + " free). If you want to reduce memory usage, ", + "take a try to set the environment variable TASK_QUEUE_ENABLE=1.\n" + PTA_ERROR(ErrCode::MEMORY)); } @@ -459,9 +442,9 @@ public: } } - void empty_cache(int device, bool need_empty_queue, bool check_error) + void empty_cache(int device, bool check_error) { - device_allocator[device]->empty_cache(need_empty_queue, check_error); + device_allocator[device]->empty_cache(check_error); allocated_ptrs.clear(); } @@ -604,9 +587,9 @@ c10::DataPtr malloc_with_stream(size_t size, aclrtStream stream) return workspace_allocator.allocate_with_stream(size, stream); } -void emptyCache(int device, bool need_empty_queue, bool check_error) +void emptyCache(int device, bool check_error) { - workspace_allocator.empty_cache(device, need_empty_queue, check_error); + workspace_allocator.empty_cache(device, check_error); } void recordHistory(bool enabled, CreateContextFn context_recorder, RecordContext when) diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h index 75c30236c3c7ea9ef21464345685d0524a424f83..75ad259d9c9989f5badd40142e3e2ad3ae20ce76 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h @@ -25,7 +25,7 @@ struct DeviceStats { c10::Allocator* get(); void init(); c10::DataPtr malloc_with_stream(size_t size, aclrtStream stream); -void emptyCache(int device, bool need_empty_queue, bool check_error = true); +void emptyCache(int device, bool check_error = true); void recordHistory(bool enabled, CreateContextFn context_recorder, RecordContext when); SnapshotInfo snapshot(); DeviceStats getDeviceStats(int device);