IPC debug tracing for the NPU caching allocator and the IPC sent-data limbo.

What this patch does:
  * Adds "[IPC debug]" trace points on the sender side (shareIpcHandle, the
    NpuIPCSentDataLimbo_ add/collect paths), on the receiver side (import by
    key, cache hit, close of the imported handle) and around aclrtFree.
  * Calls c10_npu::npuSynchronizeDevice(true) before AclrtIpcMemClose when the
    receiver releases an imported IPC pointer.

Review notes:
  * Traces are emitted with ASCEND_LOGD (the debug macro this file already
    uses) rather than ASCEND_LOGE, so they do not appear as errors.
  * The IPC handle is an ACL_IPC_HANDLE_SIZE raw byte buffer, so it is printed
    with a bounded "%.*s" instead of an unbounded "%s".
  * Size and offset values are cast explicitly and printed with "%llu"/"%lld"
    instead of "%d".
  * The trace next to aclrtFree(block->ptr) no longer reuses the
    "uncached_delete" label, which belongs to the free function of that name.
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of the patch; if a context line does not match, try
    `git apply --ignore-whitespace`. The post-image blob ids on the "index"
    lines predate these review changes.

diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
index 26603e28b73e5a362d267053ac188c3d3b238f46..6b26e9a207b75b4efd502c0f2843a2952fb89507 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
@@ -1678,8 +1678,14 @@ public:
                 size_t pid_num = torch_npu::ipc::getPids(&pids);
                 NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemSetImportPid(handle.data, pids, pid_num));
                 ipc_handle_map[base_ptr] = handle;
+                ASCEND_LOGD("[IPC debug] aclrtIpcMemGetExportKey from base_ptr=%p, size=%llu to get handle=%.*s.",
+                    base_ptr, static_cast<unsigned long long>(base_size),
+                    static_cast<int>(ACL_IPC_HANDLE_SIZE), handle.data);
             } else {
                 handle = it->second;
+                ASCEND_LOGD("[IPC debug] get from ipc_handle_map, base_ptr=%p, size=%llu and handle=%.*s.",
+                    base_ptr, static_cast<unsigned long long>(base_size),
+                    static_cast<int>(ACL_IPC_HANDLE_SIZE), handle.data);
             }
             ss.write((char*)&handle, ACL_IPC_HANDLE_SIZE);
         } else {
@@ -1688,6 +1694,8 @@ public:
                 SegmentRange(block->ptr, block->size), ss);
             offset = (char*)block->ptr - (char*)full_range.ptr;
         }
+        ASCEND_LOGD("[IPC debug] return from shareIpcHandle, handle=%s, offset=%lld.",
+            ss.str().c_str(), static_cast<long long>(offset));
         return ShareableHandle{offset, ss.str()};
     }
 
@@ -2841,6 +2849,7 @@ private:
         }
 
         aclrtFree((void *)block->ptr);
+        ASCEND_LOGD("[IPC debug] caching allocator aclrtFree block ptr=%p.", (void *)block->ptr);
         total_allocated_memory -= block->size;
 
         auto *pool = block->pool;
@@ -3127,6 +3136,7 @@ static void uncached_delete(void *ptr)
         c10_npu::npuSynchronizeDevice(false);
     }
     ASCEND_LOGD("Without NPUCachingAllocator, free by aclrtFree.");
+    ASCEND_LOGD("[IPC debug] uncached_delete free ptr=%p.", ptr);
     NPU_CHECK_ERROR(aclrtFree(ptr));
 }
 
@@ -3640,6 +3650,8 @@ public:
             ss.read(handle_r.data, ACL_IPC_HANDLE_SIZE);
             NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemImportByKey(&npu_ipc_ptr_, handle_r.data, 0));
             handle_s.assign(handle_r.data, ACL_IPC_HANDLE_SIZE);
+            ASCEND_LOGD("[IPC debug] getIpcDevPtr from aclrtIpcMemImportByKey, ptr=%p, handle=%s.",
+                npu_ipc_ptr_, handle_s.c_str());
         } else if (type == SHAREABLE_NPU_EXPANDABLE_SEGMENT) {
             expandable_segment_ =
                 ExpandableSegment::fromShared(device, ss)
@@ -3659,7 +3671,11 @@ public:
     {
         if (npu_ipc_ptr_) {
             c10_npu::NPUGuard device_guard(device_);
+            c10_npu::npuSynchronizeDevice(true);
+            ASCEND_LOGD("[IPC debug] Succeed to npuSynchronizeDevice.");
+
             NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemClose(handle_s.c_str()));
+            ASCEND_LOGD("[IPC debug] receiver clear ptr=%p, handle=%s.", npu_ipc_ptr_, handle_s.c_str());
             npu_ipc_ptr_ = nullptr;
         }
         if (expandable_segment_) {
@@ -3691,6 +3707,7 @@ public:
         if (iter != ipcMemHandle_to_devptr.end()) {
             auto devptr = iter->second.wp_.lock();
             TORCH_INTERNAL_ASSERT(devptr, "entry in cache has missing shared_ptr");
+            ASCEND_LOGD("[IPC debug] getIpcDevPtr from ipcMemHandle_to_devptr, ptr=%p.", devptr.get());
             return devptr;
         }
         int curr_device = 0;
diff --git a/torch_npu/csrc/ipc/NPUIPCTypes.cpp b/torch_npu/csrc/ipc/NPUIPCTypes.cpp
index 00b28a9fc68f86047d1dd2d7c9304c339f31bc9c..549d172dc16dca6a84aa048186d63d022dab6dac 100644
--- a/torch_npu/csrc/ipc/NPUIPCTypes.cpp
+++ b/torch_npu/csrc/ipc/NPUIPCTypes.cpp
@@ -98,6 +98,7 @@ bool NpuIPCSentDataLimbo::collect()
     // Need to reset blocks out of the critical section here, otherwise it
     // deadlocks.
     for (auto& sd : reset_blocks) {
+        ASCEND_LOGD("[IPC debug] sender remove ptr=%p from NpuIPCSentDataLimbo_.", sd->original_ptr_.get());
         sd.reset();
     }
     return freed_memory;
@@ -134,6 +135,7 @@ void NpuIPCSentDataDelete(void* ptr)
     }
     if (sent_data->counter_value() > 0) {
         npu_ipc_global_entities.NpuIPCSentDataLimbo_.add(std::move(sent_data));
+        ASCEND_LOGD("[IPC debug] sender add ptr=%p into NpuIPCSentDataLimbo_.", ptr);
     }
     npu_ipc_global_entities.NpuIPCSentDataLimbo_.collect();
 }