From 32f6a638642746f53d444a54071a240f6d73556c Mon Sep 17 00:00:00 2001
From: GuoGuanghao
Date: Tue, 15 Jul 2025 09:52:34 +0800
Subject: [PATCH] bugfix for resumehcclcomm

---
 .../csrc/distributed/ProcessGroupHCCL.cpp | 32 ++++---------------
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 3a03ea4111..a2ec17c518 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -2753,32 +2753,14 @@ int64_t ProcessGroupHCCL::getHcclComm(int rankid)
 
 void ProcessGroupHCCL::resumeHcclComm(int device_id)
 {
-    at::Device device = at::Device(c10::DeviceType::PrivateUse1, device_id);
-    std::vector devices = {device};
-    auto key = getKeyFromDevices(devices);
-
-    {
-        std::lock_guard lock(mutex_);
-        if (devHCCLCommMap_.find(key) != devHCCLCommMap_.end()) {
-            // Reuse the cached communicator if there is one.
-            auto& hcclComms = devHCCLCommMap_[key];
-            for (const auto& hcclComm : hcclComms) {
-                auto comm = hcclComm->getHcclComm();
-                HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm));
-            }
-        }
-        if (hcclCommInitRootInfoConfigExist() && c10_npu::option::OptionsManager::GetP2PBufferSize() != 0) {
-            key = getKeySendRecv(rank_, getP2pPeer());
-            if (devHCCLCommMap_.find(key) != devHCCLCommMap_.end()) {
-                // Reuse the cached communicator if there is one.
-                auto& hcclComms = devHCCLCommMap_[key];
-                for (const auto& hcclComm : hcclComms) {
-                    auto comm = hcclComm->getHcclComm();
-                    HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm));
-                }
-            }
+    std::lock_guard lock(mutex_);
+    for (const auto& devHCCLCommMap : devHCCLCommMap_) {
+        auto& hcclComms = devHCCLCommMap.second;
+        for (const auto& hcclComm : hcclComms) {
+            auto comm = hcclComm->getHcclComm();
+            HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm));
         }
-    }
+    }
     ASCEND_LOGI("resumeHcclComm success, group id is %s.", options_->group_id.c_str());
 }
 
-- 
Gitee