diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index cac3d571f72dfda78fac78e2e8261f1625c61b1e..de7a8ffeb2fb06d0299f915411a9d706610616c0 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1613,6 +1613,7 @@ void ProcessGroupHCCL::heartbeatMonitor() void ProcessGroupHCCL::hcclCommWatchdog() { c10_npu::SetThreadType(c10_npu::ThreadType::WATCHDOG_THREAD); + try { VLOG(2) << "[Rank " << rank_ << "] HCCL watchdog thread started!"; if (monitorThreadEnabled_.load()) {