Review notes (free text before the first "diff --git" line is skipped by
`git apply` and `patch`; it is not part of the change itself).

Purpose: simplify ProcessGroupHCCL::resumeHcclComm.
  * Old body: built a key with getKeyFromDevices(...) from `device_id`, and,
    when hcclCommInitRootInfoConfigExist() is true and GetP2PBufferSize() != 0,
    a second key with getKeySendRecv(rank_, getP2pPeer()); for each key found
    in devHCCLCommMap_ it called HcclCommResumeFace on every communicator
    stored under that key.
  * New body: takes mutex_ and calls HcclCommResumeFace on every communicator
    of every entry in devHCCLCommMap_, with no key lookup.

NOTE(review): `device_id` is no longer referenced by the new body; the
  parameter stays in the signature.
NOTE(review): leading whitespace was lost where this patch was captured. The
  4-space indentation below is reconstructed; in particular the final
  "-    }" / "+    }" pair presumably differed only in whitespace. Verify
  against the target file before applying.
NOTE(review): the hunk starts and ends with a blank context line (a single
  space); these are required by the 32/14 line counts in the hunk header.

diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 3a03ea411140e856d32d65d7918ee38bcca7a8b9..a2ec17c51876fb6490d1b02716e93f3eb1d7240b 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -2753,32 +2753,14 @@ int64_t ProcessGroupHCCL::getHcclComm(int rankid)
 
 void ProcessGroupHCCL::resumeHcclComm(int device_id)
 {
-    at::Device device = at::Device(c10::DeviceType::PrivateUse1, device_id);
-    std::vector devices = {device};
-    auto key = getKeyFromDevices(devices);
-
-    {
-        std::lock_guard lock(mutex_);
-        if (devHCCLCommMap_.find(key) != devHCCLCommMap_.end()) {
-            // Reuse the cached communicator if there is one.
-            auto& hcclComms = devHCCLCommMap_[key];
-            for (const auto& hcclComm : hcclComms) {
-                auto comm = hcclComm->getHcclComm();
-                HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm));
-            }
-        }
-        if (hcclCommInitRootInfoConfigExist() && c10_npu::option::OptionsManager::GetP2PBufferSize() != 0) {
-            key = getKeySendRecv(rank_, getP2pPeer());
-            if (devHCCLCommMap_.find(key) != devHCCLCommMap_.end()) {
-                // Reuse the cached communicator if there is one.
-                auto& hcclComms = devHCCLCommMap_[key];
-                for (const auto& hcclComm : hcclComms) {
-                    auto comm = hcclComm->getHcclComm();
-                    HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm));
-                }
-            }
+    std::lock_guard lock(mutex_);
+    for (const auto& devHCCLCommMap : devHCCLCommMap_) {
+        auto& hcclComms = devHCCLCommMap.second;
+        for (const auto& hcclComm : hcclComms) {
+            auto comm = hcclComm->getHcclComm();
+            HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm));
         }
-    }
+    }
     ASCEND_LOGI("resumeHcclComm success, group id is %s.", options_->group_id.c_str());
 }
 