diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 46ec61a5fa5f3e804a46af38a0b23acad1fa8496..d21e94fd3a6b5e3b738367f3f45ae8e5f7047167 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -990,8 +990,9 @@ std::vector<std::shared_ptr<HCCLComm>>& ProcessGroupHCCL::getHCCLComm(
     if (rank_ == 0) {
         HCCL_CHECK_ERROR(HcclGetRootInfo(&hcclID));
     }
+    ASCEND_LOGE(" start broadcast master id");
     broadcastMasterID(&hcclID);
-
+    ASCEND_LOGE(" end broadcast master id");
     c10_npu::OptionalNPUGuard npuGuard;
     std::vector<c10_npu::NPUStream> streamVal;
     streamVal.reserve(devices.size());
@@ -1001,12 +1002,13 @@ std::vector<std::shared_ptr<HCCLComm>>& ProcessGroupHCCL::getHCCLComm(
         int rank = getRank() * devices.size() + i;
         npuGuard.set_index(devices[i].index());
+        ASCEND_LOGE(" start create hccl communication, rank %d", rank);
         hcclComms[i] = HCCLComm::create(numRanks, rank, hcclID);
-
+        ASCEND_LOGE(" end create hccl communication, rank %d", rank);
         // Creates the HCCL streams
         streamVal.push_back(c10_npu::getNPUStreamFromPool(devices[i].index()));
     }
-
+    ASCEND_LOGE(" end hccl communication");
     hcclStreams_.emplace(devicesKey, std::move(streamVal));
 
     // Note: these events are created with the (default) cudaEventDisableTiming
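The patch brackets the two steps of communicator setup that can stall, the root-info broadcast and the per-device HCCLComm::create call, with paired start/end log markers, so a hang or slowdown can be localized from the log alone. Below is a minimal standalone sketch of the same pattern using a hypothetical RAII helper; ScopedTraceLog is not part of torch_npu, and std::fprintf stands in for the ASCEND_LOGE macro so the example compiles on its own.

#include <cstdio>
#include <string>

// Hypothetical RAII trace helper: logs " start <tag>" on construction and
// " end <tag>" on destruction, mirroring the hand-placed log pairs above.
class ScopedTraceLog {
public:
    explicit ScopedTraceLog(std::string tag) : tag_(std::move(tag)) {
        std::fprintf(stderr, " start %s\n", tag_.c_str());
    }
    ~ScopedTraceLog() {
        // Runs on every scope exit (early return, exception), so the end
        // marker can never be forgotten or skipped.
        std::fprintf(stderr, " end %s\n", tag_.c_str());
    }
    ScopedTraceLog(const ScopedTraceLog&) = delete;
    ScopedTraceLog& operator=(const ScopedTraceLog&) = delete;

private:
    std::string tag_;
};

int main() {
    {
        ScopedTraceLog trace("broadcast master id");
        // broadcastMasterID(&hcclID) would run here.
    }
    for (int rank = 0; rank < 2; ++rank) {
        ScopedTraceLog trace("create hccl communication, rank " + std::to_string(rank));
        // HCCLComm::create(numRanks, rank, hcclID) would run here.
    }
    return 0;
}

Since the diff itself shows ASCEND_LOGE accepting printf-style format strings, replacing std::fprintf with ASCEND_LOGE inside the helper would reproduce the exact markers added in the patch while keeping getHCCLComm free of manually matched log pairs.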