From 389e1f1e3ac44a52cdc701ba0870cf119a683116 Mon Sep 17 00:00:00 2001 From: wangchao Date: Sat, 23 Aug 2025 11:34:37 +0800 Subject: [PATCH] Unified log printing with failure scenarios of ranktable --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index e87d4138982..e7a9367cd85 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2303,7 +2303,7 @@ bool ProcessGroupHCCL::createHCCLCommEx( } auto comm = HCCLComm::createGlobalHcclComm(rankTableFile.c_str(), rank, commConfig); if (comm == nullptr) { - ASCEND_LOGI("Create global hccl comm with ranktable failed."); + ASCEND_LOGI("Create global hccl comm with ranktable failed, switch to original interface."); return false; } hcclComms[i] = comm; @@ -2330,11 +2330,11 @@ bool ProcessGroupHCCL::createHCCLCommEx( try { globalHcclComm = global_->getHcclCommByDevices(devices); } catch (const std::exception& e) { - ASCEND_LOGI("create the global HCCL Communicator failed, the exception info is %s.", e.what()); + ASCEND_LOGI("create the global HCCL Communicator failed, the exception info is %s, switch to original interface.", e.what()); return false; } if (!globalHcclComm) { - ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed, globalHcclComm is nullptr."); + ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed, globalHcclComm is nullptr, switch to original interface."); return false; } @@ -2374,7 +2374,7 @@ bool ProcessGroupHCCL::createHCCLCommEx( subComm = HCCLComm::createSubHcclComm(globalHcclComm, numRanks, options_->global_ranks_in_group.data(), hcclid, rank, commConfig); } if (subComm == nullptr) { - ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed, group id is %s, subCommId is %llu, devicesKey is %s.", + ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed, group id is %s, subCommId is %llu, devicesKey is %s, switch to original interface.", options_->group_id.c_str(), hcclid, devicesKey.c_str()); return false; } -- Gitee