From 98929fdfbab8ec97dd59ac42ad6c04a6ec896457 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Wed, 26 Mar 2025 17:48:10 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=B5=81=E6=B0=B4=E7=BA=BF=E5=87=BA?= =?UTF-8?q?=E5=8C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- component/taskd/taskd/python/framework/worker/worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/component/taskd/taskd/python/framework/worker/worker.py b/component/taskd/taskd/python/framework/worker/worker.py index 28684f869..66aca048f 100644 --- a/component/taskd/taskd/python/framework/worker/worker.py +++ b/component/taskd/taskd/python/framework/worker/worker.py @@ -53,6 +53,7 @@ class Worker: result = start_monitor_client_func() if result == 0: run_log.info(f"Successfully start monitor client for rank:{self.rank}") + run_log.info(f"this is new package for 03 26:{self.rank}") return True run_log.warning(f"failed to start up monitor client with ret code:f{result}") return False -- Gitee From 2a8e84693fe95543c405c8ede0efb94431ed9561 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Mon, 31 Mar 2025 14:19:22 +0800 Subject: [PATCH 2/2] git --- .../taskd/python/framework/agent/ms_mgr/msrun_plugin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/component/taskd/taskd/python/framework/agent/ms_mgr/msrun_plugin.py b/component/taskd/taskd/python/framework/agent/ms_mgr/msrun_plugin.py index f7809d191..1b9cb8ac5 100644 --- a/component/taskd/taskd/python/framework/agent/ms_mgr/msrun_plugin.py +++ b/component/taskd/taskd/python/framework/agent/ms_mgr/msrun_plugin.py @@ -148,7 +148,7 @@ class MSRunPlugin: continue rank_id = fault_rank.get("RankId") status = fault_rank.get("Status") - run_log.debug( + run_log.info( f"status:{status},rankId:{rank_id},local:{local_worker_ranks}, {rank_id in local_worker_ranks}") if status == "fault" and rank_id in local_worker_ranks: fault_local_ranks.append(rank_id) @@ -184,7 +184,7 @@ class MSRunPlugin: time.sleep(self.monitor_interval) # After entering the loop, first obtain the process status once. ms_proc_status = monitor_func([-1]) - run_log.debug(f"nodeRank:{self.ms_node_rank} has got mindspore process status:{ms_proc_status}") + run_log.info(f"nodeRank:{self.ms_node_rank} has got mindspore process status:{ms_proc_status}") if not check_monitor_res_valid(ms_proc_status): run_log.warning(f"monitor not return a valid result, but {ms_proc_status}") continue @@ -195,7 +195,7 @@ class MSRunPlugin: # 进入循环后更新reset cm相关内容 self.update_reset_info() fault_status = self.get_fault_status() - run_log.debug(f"nodeRank:{self.ms_node_rank} fault status: is_fault:{fault_status.is_fault}," + run_log.info(f"nodeRank:{self.ms_node_rank} fault status: is_fault:{fault_status.is_fault}," f"is_unrecovered:{fault_status.is_unrecovered},is_retried:{fault_status.is_retried}," f"local_ranks:{fault_status.local_ranks}") -- Gitee