diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 46ec61a5fa5f3e804a46af38a0b23acad1fa8496..d21e94fd3a6b5e3b738367f3f45ae8e5f7047167 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -990,8 +990,9 @@ std::vector<std::shared_ptr<HCCLComm>>& ProcessGroupHCCL::getHCCLComm(
     if (rank_ == 0) {
         HCCL_CHECK_ERROR(HcclGetRootInfo(&hcclID));
     }
+    ASCEND_LOGE(" start broadcast master id");
     broadcastMasterID(&hcclID);
-
+    ASCEND_LOGE(" end broadcast master id");
     c10_npu::OptionalNPUGuard npuGuard;
     std::vector<c10_npu::NPUStream> streamVal;
     streamVal.reserve(devices.size());
@@ -1001,12 +1002,13 @@ std::vector<std::shared_ptr<HCCLComm>>& ProcessGroupHCCL::getHCCLComm(
         int rank = getRank() * devices.size() + i;
         npuGuard.set_index(devices[i].index());
+        ASCEND_LOGE(" start create hccl communication, rank %d", rank);
         hcclComms[i] = HCCLComm::create(numRanks, rank, hcclID);
-
+        ASCEND_LOGE(" end create hccl communication, rank %d", rank);
         // Creates the HCCL streams
         streamVal.push_back(c10_npu::getNPUStreamFromPool(devices[i].index()));
     }
-
+    ASCEND_LOGE(" end hccl communication");
     hcclStreams_.emplace(devicesKey, std::move(streamVal));
 
     // Note: these events are created with the (default) cudaEventDisableTiming
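The patch brackets the two steps of communicator setup that can stall, the root-info broadcast and the per-device HCCLComm::create call, with paired start/end log markers, so a hang or slowdown can be localized from the log alone. Below is a minimal standalone sketch of the same pattern using a hypothetical RAII helper; ScopedTraceLog is not part of torch_npu, and std::fprintf stands in for the ASCEND_LOGE macro so the example compiles on its own.

#include <cstdio>
#include <string>

// Hypothetical RAII trace helper: logs " start <tag>" on construction and
// " end <tag>" on destruction, mirroring the hand-placed log pairs above.
class ScopedTraceLog {
public:
    explicit ScopedTraceLog(std::string tag) : tag_(std::move(tag)) {
        std::fprintf(stderr, " start %s\n", tag_.c_str());
    }
    ~ScopedTraceLog() {
        // Runs on every scope exit (early return, exception), so the end
        // marker can never be forgotten or skipped.
        std::fprintf(stderr, " end %s\n", tag_.c_str());
    }
    ScopedTraceLog(const ScopedTraceLog&) = delete;
    ScopedTraceLog& operator=(const ScopedTraceLog&) = delete;

private:
    std::string tag_;
};

int main() {
    {
        ScopedTraceLog trace("broadcast master id");
        // broadcastMasterID(&hcclID) would run here.
    }
    for (int rank = 0; rank < 2; ++rank) {
        ScopedTraceLog trace("create hccl communication, rank " + std::to_string(rank));
        // HCCLComm::create(numRanks, rank, hcclID) would run here.
    }
    return 0;
}

Since the diff itself shows ASCEND_LOGE accepting printf-style format strings, replacing std::fprintf with ASCEND_LOGE inside the helper would reproduce the exact markers added in the patch while keeping getHCCLComm free of manually matched log pairs.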