From 1e1ba91b8366fc85119a3b177c4ee1c3d421cbca Mon Sep 17 00:00:00 2001
From: wugengjun <451676383@qq.com>
Date: Mon, 9 Jun 2025 20:57:14 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=90=8C=E6=AD=A5=E6=9C=BA?=
 =?UTF-8?q?=E5=88=B6=EF=BC=8C=E4=BF=9D=E8=AF=81=E8=90=BD=E7=9B=98=E5=90=8E?=
 =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the directory-mtime polling (is_download_finished) with a sentinel
dump: process() issues one extra TensorDump of a tiny tensor and starts
post-processing once that file exists, then removes it.

Sleep briefly between existence checks so the wait loop does not busy-spin
now that the helper containing the only sleep has been removed.
---
 .../mindspore/dump/cell_dump_process.py       | 41 +++++++-------------
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
index 44f8dbdfb49..2aa30a1085c 100644
--- a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
+++ b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
@@ -25,6 +25,7 @@ import numpy as np
 import pandas as pd
 import mindspore as ms
 from mindspore import nn, ops
+from mindspore.ops.primitive import _run_op
 
 from msprobe.core.common.const import Const as CoreConst
 from msprobe.core.common.const import FileCheckConst
@@ -559,27 +560,6 @@ def generate_stack_info(path):
     logger.info(f"Stack data saved to {json_path}")
 
 
-def is_download_finished(directory, interval=3):
-    """
-    判断指定目录在一段时间后是否有数据被下载完成
-    :param directory: 指定目录的路径
-    :param interval: 检查的时间间隔(秒),默认为 3 秒
-    :return: 如有数据被下载完成返回 True,否则返回 False
-    """
-    # 检查目录是否存在
-    if not os.path.exists(directory):
-        logger.warning(f"The specified directory {directory} does not exist.")
-        return False
-    initial_modification_time = os.path.getmtime(directory)
-    time.sleep(interval)
-    current_modification_time = os.path.getmtime(directory)
-    # 比较初始和当前修改时间
-    if current_modification_time > initial_modification_time:
-        return False
-    else:
-        return True
-
-
 def process(dump_path):
     rank_id = os.environ.get('RANK_ID')
     rank_dir = DEFAULT_RANK_DIR
@@ -591,12 +571,21 @@ def process(dump_path):
     step_path = os.path.join(dump_path, step_dir)
     rank_path = os.path.join(step_path, rank_dir)
     npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
+
+    # Issue one final small-tensor dump and treat the appearance of its file
+    # as the signal that the earlier dumps have been written to disk.
+    # NOTE(review): relies on TensorDump writing files in issue order - confirm.
+    temp_tensor = ms.Tensor([1], dtype=ms.float32)
+    dump_finished_flag = ".npy"
+    flag_file_name = os.path.join(npy_path, dump_finished_flag)
+    _run_op(ops.TensorDump(), "TensorDump", (flag_file_name, temp_tensor))
+
     while True:
-        is_finished = is_download_finished(npy_path)
-        if not is_finished:
-            logger.info("There is data being downloaded in the specified directory, continue checking...")
-        else:
-            logger.info("There is no data being downloaded in the specified directory, Stop checking.")
+        if os.path.exists(flag_file_name):
+            remove_path(flag_file_name)
             break
+        # Pause between checks: without it this loop polls the filesystem
+        # back-to-back and keeps a CPU core busy until the flag file appears.
+        time.sleep(0.1)
     logger.info("==========Start processing data that has already been stored on the disk!==========")
     rename_filename(path=npy_path)
-- 
Gitee