From f40ed9cfe095a3291704c7327437e6d54891329b Mon Sep 17 00:00:00 2001
From: wugengjun <451676383@qq.com>
Date: Wed, 22 Jan 2025 21:04:56 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E9=9D=99=E6=80=81=E5=9B=BEcell=E7=BA=A7dum?=
=?UTF-8?q?p?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../msprobe/core/common/const.py | 5 +
.../msprobe/docs/02.config_introduction.md | 20 +-
.../msprobe/docs/06.data_dump_MindSpore.md | 62 ++-
.../msprobe/mindspore/common/const.py | 1 +
.../mindspore/debugger/precision_debugger.py | 6 +-
.../mindspore/dump/cell_dump_process.py | 450 ++++++++++++++++++
.../mindspore/dump/dump_tool_factory.py | 17 +-
.../mindspore/dump/graph_mode_cell_dump.py | 69 +++
.../msprobe/mindspore/task_handler_factory.py | 7 +-
.../debugger/test_graph_cell_dump.py | 309 ++++++++++++
10 files changed, 922 insertions(+), 24 deletions(-)
create mode 100644 debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
create mode 100644 debug/accuracy_tools/msprobe/mindspore/dump/graph_mode_cell_dump.py
create mode 100644 debug/accuracy_tools/msprobe/test/mindspore_ut/debugger/test_graph_cell_dump.py
diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py
index b49b4fffd5..6824fc8b42 100644
--- a/debug/accuracy_tools/msprobe/core/common/const.py
+++ b/debug/accuracy_tools/msprobe/core/common/const.py
@@ -206,12 +206,14 @@ class Const:
TORCH_FLOAT32 = "torch.float32"
TORCH_BFLOAT16 = "torch.bfloat16"
+ TYPE = 'type'
DTYPE = 'dtype'
SHAPE = 'shape'
MAX = 'Max'
MIN = 'Min'
MEAN = 'Mean'
NORM = 'Norm'
+ DATA_NAME = 'data_name'
CODE_STACK = 'Code Stack'
OP_NAME = 'Op Name'
@@ -224,6 +226,9 @@ class Const:
SCOPE_SEPARATOR = "/"
REPLACEMENT_CHARACTER = "_"
+ FORWARD_PATTERN = SEP + FORWARD + SEP
+ BACKWARD_PATTERN = SEP + BACKWARD + SEP
+
OPTIMIZER = "optimizer"
CLIP_GRAD = "clip_grad"
END_PREFIX = "end_"
diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index f134bd4536..5b2e6d5027 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -10,19 +10,19 @@
### 1.1 通用配置
-| 参数 | 解释 | 是否必选 |
-| ----------------- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
-| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对;
"grad_probe":梯度监控;
"structure":仅采集模型结构以及调用栈信息,不采集具体数据。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe)。
**配置示例**:"task": "tensor"。 | 否 |
-| dump_path | 设置 dump 数据目录路径,str 类型。
**配置示例**:"dump_path": "./dump_path"。 | 是 |
-| rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型,默认未配置(表示采集所有卡的数据),应配置元素为 ≥0 的整数或类似"4-6"的字符串,且须配置实际可用的 Rank ID。
PyTorch 场景: Rank ID 从 0 开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的 Rank ID,则 dump 数据为空,比如当前环境 Rank ID 为 0 到 7,实际训练运行 0 到 3 卡,此时若配置 Rank ID 为 4 或不存在的 10 等其他值,dump 数据为空。
MindSpore 场景:所有节点的 Rank ID 均从 0 开始计数,最大取值为每个节点可用卡总数-1,config.json 配置一次 rank 参数对所有节点同时生效。
注意,单卡训练时,rank必须为[],即空列表,不能指定rank。
**配置示例**:"rank": [1, "4-6"]。 | 否 |
-| step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。
**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 |
-| level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:
"L0":dump 模块级精度数据,仅 PyTorch 与 MindSpore 动态图场景支持,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明);
"L1":dump API 级精度数据,默认值,仅 PyTorch 与 MindSpore 动态图场景支持;
"L2":dump kernel 级精度数据,PyTorch场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);MindSpore场景详细介绍见 [MindSpore 场景的 kernel dump 说明](./28.kernel_dump_MindSpore.md);
"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch 与 MindSpore 动态图场景支持。
"debug":单点保存功能,细节详见[单点保存工具 README](./28.debugger_save_instruction.md)
**配置示例**:"level": "L1"。 | 否 |
-| enable_dataloader | 自动控制开关,bool 类型,仅 PyTorch 场景支持。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后自动识别 step 参数指定的迭代,并在该迭代执行完成后退出训练,此时 start、stop 和 step 函数可不配置,开启该开关要求训练脚本是通过 torch.utils.data.dataloader 方式加载数据。仅支持 PyTorch 单卡训练使用,分布式训练场景下存在数据 dump 不全问题。 **这个特性下个版本将被废弃** | 否 |
+| 参数 | 解释 | 是否必选 |
+| ----------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
+| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对;
"grad_probe":梯度监控。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe)。
**配置示例**:"task": "tensor"。 | 否 |
+| dump_path | 设置 dump 数据目录路径,str 类型。
**配置示例**:"dump_path": "./dump_path"。 | 是 |
+| rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型,默认未配置(表示采集所有卡的数据),应配置元素为 ≥0 的整数或类似"4-6"的字符串,且须配置实际可用的 Rank ID。
PyTorch 场景: Rank ID 从 0 开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的 Rank ID,则 dump 数据为空,比如当前环境 Rank ID 为 0 到 7,实际训练运行 0 到 3 卡,此时若配置 Rank ID 为 4 或不存在的 10 等其他值,dump 数据为空。
MindSpore 场景:所有节点的 Rank ID 均从 0 开始计数,最大取值为每个节点可用卡总数-1,config.json 配置一次 rank 参数对所有节点同时生效。静态图 L0 级别 dump 暂不支持指定rank。
注意,单卡训练时,rank必须为[],即空列表,不能指定rank。
**配置示例**:"rank": [1, "4-6"]。 | 否 |
+| step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。
**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 |
+| level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:
"L0":dump 模块级精度数据,PyTorch 与 MindSpore 均支持,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明);
"L1":dump API 级精度数据,默认值,仅 PyTorch 与 MindSpore 动态图场景支持;
"L2":dump kernel 级精度数据,PyTorch场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);
"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch 与 MindSpore 动态图场景支持。
**配置示例**:"level": "L1"。 | 否 |
+| enable_dataloader | 自动控制开关,bool 类型,仅 PyTorch 场景支持。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后自动识别 step 参数指定的迭代,并在该迭代执行完成后退出训练,此时 start、stop 和 step 函数可不配置,开启该开关要求训练脚本是通过 torch.utils.data.dataloader 方式加载数据。仅支持 PyTorch 单卡训练使用,分布式训练场景下存在数据 dump 不全问题。 **这个特性下个版本将被废弃** | 否 |
| async_dump | 异步 dump 开关,bool 类型。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式暂不支持复数类型 tensor
的统计量计算。 | 否 |
#### 1.1.1 模块级精度数据 dump 说明
-仅 PyTorch 与 MindSpore 动态图场景支持。
+PyTorch 与 MindSpore 均支持。
大模型场景下,通常不是简单的利用自动迁移能力实现从 GPU 到 NPU 的训练脚本迁移,而是会对 NPU 网络进行一系列针对性的适配,因此,常常会造成迁移后的 NPU 模型存在部分子结构不能与 GPU 原始模型完全对应。模型结构不一致导致 API 调用类型及数量不一致,若直接按照 API 粒度进行精度数据 dump 和比对,则无法完全比对所有的 API。
@@ -46,7 +46,7 @@
MindSpore 静态图场景配置 kernel_name,可以是算子的名称列表,也可以指定算子类型("level": "L2"时不支持),还可以配置算子名称的正则表达式(当字符串符合“name-regex(xxx)”格式时,后台则会将其作为正则表达式。 配置示例:list: ["name-regex(Default/.+)"] 可匹配算子名称以“Default/”开头的所有算子。 |
data_mode | dump 数据过滤,str 类型。 | 否 |
PyTorch 与 MindSpore 动态图场景:支持"all"、"forward"、"backward"、"input"和"output",除"all"外,其余参数可以自由组合。默认为["all"],即保存所有 dump 的数据。 配置示例:"data_mode": ["backward"] (仅保存反向数据)或 "data_mode": ["forward", "input"](仅保存前向的输入数据)。 |
- MindSpore 静态图场景:仅支持"all"、"input"和"output"参数,且各参数只能单独配置,不支持自由组合。 配置示例:"data_mode": ["all"]。 |
+ MindSpore 静态图场景:L0 级别 dump 仅支持"all"、"forward"和"backward"参数;L2 级别 dump 仅支持"all"、"input"和"output"参数。且各参数只能单独配置,不支持自由组合。 配置示例:"data_mode": ["all"]。 |
summary_mode | 控制 dump 文件输出的模式,str 类型,仅 PyTorch 与 MindSpore 动态图场景支持,可选参数: md5:dump 输出包含 CRC-32 值以及 API 统计信息的 dump.json 文件,用于验证数据的完整性; statistics:dump 仅输出包含 API 统计信息的 dump.json 文件,默认值。 配置示例:"summary_mode": "md5"。 | 否 |
MindSpore静态图jit_level=O2场景L2级dump,支持上述配置的同时额外支持配置统计项列表,可选统计项为max、min、mean、l2norm,可从中任意选取组合搭配。其中mean、l2norm的结果为float数据格式。 配置示例:"summary_mode": ["max", "min"]。 |
diff --git a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md
index f7507facd2..0ee33b44a8 100644
--- a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md
+++ b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md
@@ -30,8 +30,10 @@ dump 的"tensor"模式采集数据量大小,可以参考[数据量基线](data
## 5. 场景介绍
-### 5.1 静态图场景
-在静态图场景下,msprobe 仅支持 **L2 Level** 的数据采集。
+### 5.1 静态图场景
+在静态图场景下,msprobe 支持 **L0 Level** 和 **L2 Level** 的数据采集。
+- **L0 Level(Cell 级)** :采集 `Cell` 对象的数据,适用于需要分析特定网络模块的情况。
+
- **L2 Level(Kernel 级)** :采集底层算子的输入输出数据,适用于深入分析算子级别的精度问题。
采集方式请参见[示例代码 > 静态图场景](#71-静态图场景)。详细介绍请参见[《config.json 配置文件介绍》](./02.config_introduction.md#11-通用配置)中的“level 参数”和[《config.json 配置示例》](./03.config_examples.md#2-mindspore-静态图场景) 中的“MindSpore 静态图场景”。
@@ -110,7 +112,7 @@ stop()
**功能说明**:结束一个 step 的数据采集,完成所有数据落盘并更新 dump 参数。在一个 step 结束的位置添加,且必须在 **stop** 函数之后的位置调用。
该函数需要配合 **start** 和 **stop** 函数使用,尽量添加在反向计算代码之后,否则可能会导致反向数据丢失。
-**仅未使用 Model 高阶 API 的动态图场景支持。**
+**仅未使用 Model 高阶 API 的动态图和静态图场景支持。**
**原型**:
@@ -152,7 +154,7 @@ save(variable, name, save_backward=True)
### 6.2 msprobe.mindspore.common.utils.MsprobeStep
-**功能说明**:MindSpore Callback类,自动在每个step开始时调用start()接口,在每个step结束时调用stop()、step()接口。实现使用 Model 高阶 API 的动态图场景下 L0、L1、mix 级别的精度数据采集控制,控制粒度为单个 **Step** ,而 PrecisionDebugger.start, PrecisionDebugger.stop 接口的控制粒度任意训练代码段。
+**功能说明**:MindSpore Callback类,自动在每个step开始时调用start()接口,在每个step结束时调用stop()、step()接口。实现使用 Model 高阶 API 的动态图场景下 L0、L1、mix 级别,和静态图场景下 L0 级别的精度数据采集控制,控制粒度为单个 **Step** ,而 PrecisionDebugger.start, PrecisionDebugger.stop 接口的控制粒度为任意训练代码段。
**原型**:
@@ -188,6 +190,54 @@ seed_all(seed=1234, mode=False, rm_dropout=True)
### 7.1 静态图场景
+#### 7.1.1 L0 级别
+
+##### 7.1.1.1 未使用 Model 高阶 API
+
+
+```python
+import mindspore as ms
+ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
+
+from msprobe.mindspore import PrecisionDebugger
+debugger = PrecisionDebugger(config_path="./config.json")
+
+# 模型、损失函数的定义以及初始化等操作
+# ...
+model = Network()
+# 数据集迭代的地方往往是模型开始训练的地方
+for data, label in data_loader:
+ debugger.start(model) # 进行 L0 级别下Cell 对象的数据采集时调用
+ # 如下是模型每个 step 执行的逻辑
+ grad_net = ms.grad(model)(data)
+ # ...
+ debugger.step() # 更新迭代数
+```
+
+##### 7.1.1.2 使用 Model 高阶 API
+
+
+```python
+import mindspore as ms
+from mindspore.train import Model
+ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
+
+from msprobe.mindspore import PrecisionDebugger
+from msprobe.mindspore.common.utils import MsprobeStep
+debugger = PrecisionDebugger(config_path="./config.json")
+
+# 模型、损失函数的定义以及初始化等操作
+# ...
+
+model = Network()
+# 进行 L0 级别下 Cell 对象的数据采集时调用
+debugger.start(model)
+trainer = Model(model, loss_fn=loss_fn, optimizer=optimizer, metrics={'accuracy'})
+trainer.train(1, train_dataset, callbacks=[MsprobeStep(debugger)])
+```
+
+#### 7.1.2 L2 级别
+
```python
import mindspore as ms
ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
@@ -301,7 +351,9 @@ trainer.train(1, train_dataset)
### 8.1 静态图场景
-训练结束后,数据将保存在 `dump_path` 指定的目录下。
+训练结束后,数据将保存在 `dump_path` 指定的目录下。
+L0 级别 dump 的目录结构与动态图场景下目录结构一致。
+L2 级别 dump 的目录结构如下所示:
若jit_level=O2,且使用mindstudio-probe发布包或源码编包时添加了`--include-mod=adump`选项,目录结构示例如下:
```
diff --git a/debug/accuracy_tools/msprobe/mindspore/common/const.py b/debug/accuracy_tools/msprobe/mindspore/common/const.py
index 067e783842..b41dc5ce01 100644
--- a/debug/accuracy_tools/msprobe/mindspore/common/const.py
+++ b/debug/accuracy_tools/msprobe/mindspore/common/const.py
@@ -61,6 +61,7 @@ class Const:
DROPOUT_API_NAME_PREFIX = "dropout"
GRAPH_DATA_MODE_LIST = [CoreConst.ALL, CoreConst.INPUT, CoreConst.OUTPUT]
+ GRAPH_CELL_DUMP_DATA_MODE_LIST = [CoreConst.ALL, CoreConst.FORWARD, CoreConst.BACKWARD]
HOOK_MS_PREFIX_DICT = {
OPS_DATA_PREFIX: OPS_PREFIX,
diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py
index 7694d71dd9..a7082d3e56 100644
--- a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py
+++ b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py
@@ -34,6 +34,7 @@ from msprobe.mindspore.ms_config import parse_json_config
from msprobe.mindspore.runtime import Runtime
from msprobe.mindspore.service import Service
from msprobe.mindspore.task_handler_factory import TaskHandlerFactory
+from msprobe.mindspore.dump.graph_mode_cell_dump import GraphModeCellDump
try:
from msprobe.lib import _msprobe_c
@@ -164,7 +165,7 @@ class PrecisionDebugger:
else:
if not instance.first_start:
api_register.api_set_ori_func()
- handler = TaskHandlerFactory.create(instance.config)
+ handler = TaskHandlerFactory.create(instance.config, model)
handler.handle()
instance.first_start = True
@@ -199,6 +200,9 @@ class PrecisionDebugger:
_msprobe_c._PrecisionDebugger().step()
if instance.task in PrecisionDebugger.task_not_need_service:
return
+ if instance.config.execution_mode != MsConst.PYNATIVE_MODE and instance.config.level == MsConst.CELL:
+ GraphModeCellDump.step()
+ return
if instance.service:
instance.service.step()
HOOKCell.cell_count = defaultdict(int)
diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
new file mode 100644
index 0000000000..a21c4590b8
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
@@ -0,0 +1,450 @@
+# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+import re
+import json
+import atexit
+from multiprocessing import Pool
+
+import numpy as np
+import mindspore as ms
+from mindspore import nn, ops
+
+from msprobe.mindspore.common.log import logger
+from msprobe.core.common.const import Const as CoreConst
+from msprobe.core.common.file_utils import load_npy, save_json, remove_path
+from msprobe.core.common.const import FileCheckConst
+
+
CONSTRUCT_FILE_NAME = "construct.json"
DEFAULT_RANK_DIR = "rank0"
KEY_LAYERS = "layers"
# Module-level accumulators shared by the post-processing helpers below:
# `construct` maps a cell occurrence to its parent occurrence (or None),
# `cell_list` records the dump order of cell occurrences.
construct = {}
cell_list = []
KEY_SIDE_EFFECT = "side_effect_io"
# Two TensorDump primitives: `td` writes output-side files, `td_in` writes
# input-side files ("in" mode).
td = ops.TensorDump()
td_in = ops.TensorDump("in")
# NOTE(review): side_effect_io=False presumably lets the graph compiler
# schedule these dump ops freely; ordering is enforced at the call sites via
# ops.depend — confirm against TensorDump documentation.
td.add_prim_attr(KEY_SIDE_EFFECT, False)
td_in.add_prim_attr(KEY_SIDE_EFFECT, False)
# numpy dtype name (as embedded in dumped file names) -> mindspore dtype.
np_ms_dtype_dict = {
    "bool": ms.bool_,
    "int8": ms.int8,
    "byte": ms.byte,
    "int16": ms.int16,
    "short": ms.short,
    "int32": ms.int32,
    "intc": ms.intc,
    "int64": ms.int64,
    "intp": ms.intp,
    "uint8": ms.uint8,
    "ubyte": ms.ubyte,
    "uint16": ms.uint16,
    "ushort": ms.ushort,
    "uint32": ms.uint32,
    "uintc": ms.uintc,
    "uint64": ms.uint64,
    "uintp": ms.uintp,
    "float16": ms.float16,
    "half": ms.half,
    "float32": ms.float32,
    "single": ms.single,
    "float64": ms.float64,
    "double": ms.double,
    "bfloat16": ms.bfloat16,
    "complex64": ms.complex64,
    "complex128": ms.complex128
}
+
+
def generate_file_path(dump_path, cell_prefix, suffix, io_type, index):
    """Build the target path for one dumped tensor file.

    "{step}" and "{rank}" are kept as literal placeholders in the path —
    presumably substituted later by the TensorDump backend (confirm).
    The leaf name is "<cell_prefix>.<suffix>.<io_type>.<index>".
    """
    data_dir = os.path.join(dump_path, "{step}", "{rank}", CoreConst.DUMP_TENSOR_DATA)
    leaf_name = CoreConst.SEP.join([cell_prefix, suffix, io_type, str(index)])
    return os.path.join(data_dir, leaf_name)
+
+
def partial_func(func, dump_path, cell_prefix, index, io_type):
    """Bind the dump metadata as leading positional arguments of *func*.

    Equivalent to functools.partial, written as a closure so the result is a
    plain function object (usable inside ops.InsertGradientOf).
    """
    def bound(*call_args, **call_kwargs):
        return func(dump_path, cell_prefix, index, io_type, *call_args, **call_kwargs)
    return bound
+
+
def clip_gradient(dump_path, cell_prefix, index, io_type, dx):
    """Dump the gradient *dx* at a cell boundary and pass it through unchanged.

    Registered through ops.InsertGradientOf (see start()), so it executes in
    the backward pass. td/td_in write *dx* under *dump_path* with the
    BACKWARD tag; ops.depend ties *dx* to the dump so the graph keeps the
    dump op alive. The returned gradient value is *dx* itself.
    """
    if io_type == CoreConst.OUTPUT:
        temp = td(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx)
        dx = ops.depend(dx, temp)
    if io_type == CoreConst.INPUT:
        temp = td_in(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx)
        dx = ops.depend(dx, temp)
    return dx
+
+
def cell_construct_wrapper(func, self):
    """Wrap a Cell's construct so tensor inputs/outputs (and, via the clip
    hooks, their gradients) are dumped with TensorDump.

    *func* is the cell's original bound construct; *self* is the cell, which
    already carries dump_path / cell_prefix / data_mode / input_clips /
    output_clips attributes (attached in start()). Returns a bound method to
    be assigned back onto the cell.
    """
    def new_construct(self, *args, **kwargs):
        new_args = []
        out_list = []

        index = 0
        item = None
        # The inputs of the cell.
        for index, item in enumerate(args):
            if self.data_mode == "backward" or self.data_mode == "all":
                # NOTE(review): inputs use output_clips and outputs use
                # input_clips — presumably named from the backward pass's
                # point of view; confirm the intent.
                if ops.is_tensor(item):
                    item = self.output_clips[index](item)
            if self.data_mode == "forward" or self.data_mode == "all":
                if ops.is_tensor(item):
                    temp = td_in(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.INPUT, index), item)
                    item = ops.depend(item, temp)
            new_args.append(item)

        out = func(*new_args, **kwargs)

        # The outputs of the cell.
        if isinstance(out, tuple):
            for index, item in enumerate(out):
                if self.data_mode == "backward" or self.data_mode == "all":
                    if ops.is_tensor(item):
                        item = self.input_clips[index](item)
                if self.data_mode == "forward" or self.data_mode == "all":
                    if ops.is_tensor(item):
                        temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), item)
                        item = ops.depend(item, temp)
                    out_list.append(item)
                else:
                    out_list.append(item)
            out_list = tuple(out_list)
            return out_list
        else:
            if self.data_mode == "backward" or self.data_mode == "all":
                # NOTE(review): unlike the tuple branch there is no
                # is_tensor() guard here — confirm a non-tensor single
                # output cannot reach this path.
                out = self.input_clips[0](out)
            if self.data_mode == "forward" or self.data_mode == "all":
                if ops.is_tensor(out):
                    temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), out)
                    out = ops.depend(out, temp)
            return out

    return new_construct.__get__(self, type(self))
+
+
# List the files under *path*, ordered by the auto-increment id that
# TensorDump appends before the .npy suffix.
def sort_filenames(path):
    """Return file names in *path* sorted by their trailing dump id.

    File names end with "_<id>.npy". The original key
    `int(id_pattern.findall(x)[0])` raised IndexError on any stray file
    without that tail; such files now sort first with a sentinel id of -1.
    """
    filenames = os.listdir(path)
    id_pattern = re.compile(rf'{CoreConst.REPLACEMENT_CHARACTER}(\d+){CoreConst.NUMPY_SUFFIX}$')

    def dump_id(filename):
        # Defensive: tolerate files that do not carry a "_<id>.npy" tail.
        match = id_pattern.search(filename)
        return int(match.group(1)) if match else -1

    filenames.sort(key=dump_id)
    return filenames
+
+
# Remove duplicated dump files: same custom file-name prefix AND equal data.
def del_same_file(path, filenames):
    """Filter *filenames*, deleting redundant re-dumps of the same tensor.

    Files sharing the prefix before the trailing "_<id>" are candidates:
    the first occurrence is kept; a later one is deleted from disk when its
    array equals the kept one, otherwise it is kept as a distinct dump.
    Returns the surviving names in their original order.
    """
    result_list = []
    seen_prefixes = {}
    for current_filename in filenames:
        parts = current_filename.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1)
        prefix = parts[0]
        if prefix not in seen_prefixes:
            result_list.append(current_filename)
            seen_prefixes[prefix] = current_filename
        else:
            current_file_path = os.path.join(path, current_filename)
            current_file = load_npy(current_file_path)
            prev_filename = seen_prefixes[prefix]
            prev_file_path = os.path.join(path, prev_filename)
            prev_file = load_npy(prev_file_path)
            if np.array_equal(current_file, prev_file):
                remove_path(current_file_path)
                logger.warning(f"{current_file_path} is deleted!")
            else:
                result_list.append(current_filename)
    return result_list
+
+
def rename_filename(path):
    """Insert each cell's repeat-call index into the dump file names under
    *path*.

    After de-duplication, files for the Nth call of a cell are renamed from
    "...{forward|backward}.{io}..." to "...{forward|backward}.{N}.{io}...".

    Fix: in the original code `newFileName` was only assigned inside the two
    pattern checks, so a file matching neither pattern triggered a NameError
    (first file) or silently reused the previous iteration's name; such
    files are now skipped.
    """
    filenames = sort_filenames(path)
    filenames = del_same_file(path, filenames)

    call_counts = {}
    for filename in filenames:
        name_field = filename.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1)[0]

        # 0-based index of this repeated call of the same cell.
        if name_field in call_counts:
            call_counts[name_field] += 1
        else:
            call_counts[name_field] = 0
        cell_index = call_counts[name_field]

        # Format: Cell.{cell_name}.{class_name}.{forward/backward}.{number}.{input/output}.{index}_{dtype}_{id}.npy
        if CoreConst.FORWARD_PATTERN in filename:
            new_file_name = filename.replace(
                CoreConst.FORWARD_PATTERN,
                CoreConst.FORWARD_PATTERN + str(cell_index) + CoreConst.SEP)
        elif CoreConst.BACKWARD_PATTERN in filename:
            new_file_name = filename.replace(
                CoreConst.BACKWARD_PATTERN,
                CoreConst.BACKWARD_PATTERN + str(cell_index) + CoreConst.SEP)
        else:
            logger.warning(f"{filename} matches neither forward nor backward pattern, skipped!")
            continue
        os.rename(os.path.join(path, filename), os.path.join(path, new_file_name))
    logger.info("==========The rename_filename phase is Finished!==========")
+
+
# Extract the field between the first "." and the third-from-last ".",
# i.e. {cell_name}.
def get_cell_name(name_field):
    """Return the cell name embedded in a dump name like
    "Cell.{cell_name}.{class_name}.{data_mode}.{call index}".

    Returns None when there are too few '.'-separated parts. The parameter
    was renamed from `str`, which shadowed the builtin.
    """
    parts = name_field.split(CoreConst.SEP)
    if len(parts) < 4:
        return None
    start_index = 1
    end_index = len(parts) - 3
    return CoreConst.SEP.join(parts[start_index:end_index])
+
+
# Extract the field between the last "." and the second-from-last ".",
# i.e. {data_mode} (fixes the "{data_made}" typo in the original note).
def get_data_mode(name_field):
    """Return the data-mode token (forward/backward) of a dump name.

    The parameter was renamed from `str`, which shadowed the builtin.
    """
    last_dot_index = name_field.rfind(CoreConst.SEP)
    second_last_dot_index = name_field.rfind(CoreConst.SEP, 0, last_dot_index)
    return name_field[second_last_dot_index + 1:last_dot_index]
+
+
# Decide whether the two cells form a parent/child pair.
def check_relation(cell_name, parent_cell_name):
    """Return True when *parent_cell_name* is the direct parent of *cell_name*.

    Two layouts count:
    1. plain nesting - everything before cell_name's last separator equals
       parent_cell_name;
    2. layer lists   - cell_name ends with "<sep>layers<sep><id>" and
       stripping that suffix yields parent_cell_name.

    Fixes two defects of the original implementation: the layers check was an
    unreachable `elif` (a ".layers.<id>" suffix always contains a separator,
    so the first branch was always taken), and the separator was interpolated
    into the regex unescaped, letting '.' match any character.
    """
    sep = re.escape(CoreConst.SEP)
    layers_pattern = rf"{sep}{KEY_LAYERS}{sep}\d+$"
    last_dot_index = cell_name.rfind(CoreConst.SEP)
    if last_dot_index != -1 and cell_name[:last_dot_index] == parent_cell_name:
        return True
    if re.search(layers_pattern, cell_name):
        if re.sub(layers_pattern, '', cell_name) == parent_cell_name:
            return True
    return False
+
+
def get_construct(cell_list_input):
    """Fill the module-level `construct` map with cell -> parent links.

    For each cell occurrence, scan the whole list for an occurrence whose
    name is its direct parent (see check_relation) in the same data mode
    (forward/backward). First match wins; otherwise the parent is None.
    """
    for cell in cell_list_input:
        cell_name = get_cell_name(cell)
        cell_data_mode = get_data_mode(cell)
        found_flag = False
        for parent_cell in cell_list_input:
            parent_cell_name = get_cell_name(parent_cell)
            parent_data_mode = get_data_mode(parent_cell)
            has_relation = check_relation(cell_name, parent_cell_name)
            if has_relation and parent_data_mode == cell_data_mode:
                construct.update({cell: parent_cell})
                found_flag = True
                break
        if not found_flag:
            construct.update({cell: None})
+
+
def generate_construct(path):
    """Rebuild the cell hierarchy from the dump files under *path* and save
    it as construct.json in the parent (rank) directory."""
    global construct
    filenames = sort_filenames(path)

    # Collect the "Cell.{cell_name}.{class_name}.{data_mode}.{call index}"
    # field of every file into the module-level cell_list.
    for filename in filenames:
        point_position = 3
        mid_field = filename.rsplit(CoreConst.SEP, point_position)[0]
        if CoreConst.INPUT in filename:
            # Keep only the position of the last input occurrence.
            if mid_field in cell_list:
                cell_list.remove(mid_field)
            cell_list.append(mid_field)
        else:
            if mid_field not in cell_list:
                index = filenames.index(filename)
                output_field = mid_field + CoreConst.OUTPUT
                find_flag = False
                # NOTE(review): `index` is computed against `filenames` but
                # used to slice `cell_list` — the slice was probably meant to
                # be filenames[index + 1:]; confirm and fix upstream. Also,
                # filenames.index() returns the first match only.
                for filename_other in cell_list[index + 1:]:
                    if output_field in filename_other:
                        find_flag = True
                if find_flag is False:
                    cell_list.append(mid_field)

    get_construct(cell_list)

    # Persist the hierarchy as JSON.
    rank_dir = os.path.dirname(path)
    json_path = os.path.join(rank_dir, CONSTRUCT_FILE_NAME)
    save_json(json_path, construct, indent=1)

    # Reset `construct` before processing the next directory.
    construct = {}
    logger.info(f"Construct data saved to {json_path}")
+
+
def process_file(file_path):
    """Load one dumped .npy file and build its dump.json tensor entry.

    Returns (op_name, section_key, tensor_json) where section_key selects
    the op's input_args or output list; returns (None, None, None) when the
    file cannot be parsed. Runs inside a worker process (see
    generate_dump_info).
    """
    try:
        # Load the dumped tensor data.
        npy_content = load_npy(file_path)
        logger.info(f"Loaded {file_path}: shape is {npy_content.shape}, dtype is {npy_content.dtype}")

        # File name example:
        # Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0_float32_165.npy
        parts = os.path.basename(file_path).split(CoreConst.SEP)
        data_dtype = ""
        # Extract "float32" from "0_float32_165" (or "0_in_float32_165").
        data_dtype_list = parts[-2].split('_')
        if len(data_dtype_list) > 1:
            data_dtype = data_dtype_list[-2]
        # op_name example: Cell.network._backbone.loss.CrossEntropyLoss.forward.0
        op_name = CoreConst.SEP.join(parts[:-3])
        ms_dtype = np_ms_dtype_dict.get(data_dtype)
        if ms_dtype is None:
            logger.warning(f"Get dtype None from file {file_path}")
        tensor_json = {
            CoreConst.TYPE: 'mindspore.Tensor',
            CoreConst.DTYPE: str(ms_dtype),
            CoreConst.SHAPE: list(npy_content.shape),
            CoreConst.MAX: npy_content.max().item(),
            CoreConst.MIN: npy_content.min().item(),
            CoreConst.MEAN: npy_content.mean().item(),
            CoreConst.NORM: np.linalg.norm(npy_content).item(),
            CoreConst.DATA_NAME: os.path.basename(file_path)
        }

        # The third-from-last part ("input"/"output") selects the section.
        if parts[-3] == CoreConst.INPUT:
            return op_name, CoreConst.INPUT_ARGS, tensor_json
        elif parts[-3] == CoreConst.OUTPUT:
            return op_name, CoreConst.OUTPUT, tensor_json
        else:
            return None, None, None

    except Exception as e:
        # Broad catch is deliberate: one unreadable file must not abort the
        # whole post-processing run; the caller skips (None, None, None).
        logger.error(f"Error reading {file_path}: {e}")
        return None, None, None
+
+
def custom_sort(item, key_to_index):
    """Sort key for (op_name, entry) pairs: the op's recorded dump order
    from *key_to_index*; ops not present sort last (infinity)."""
    op_name = item[0]
    return key_to_index.get(op_name, float("inf"))
+
+
def generate_dump_info(path):
    """Aggregate every .npy dump under *path* into dump.json (written next to
    the dump_tensor_data directory), ordered by recorded execution order."""
    if not os.path.exists(path):
        logger.error("The provided path does not exist.")
        return

    dump_data = {"task": "tensor", "level": "L0", "dump_data_dir": path, "data": {}}

    # Parse files in parallel; each entry is a 1-tuple because starmap
    # unpacks the arguments of process_file.
    with Pool(processes=10) as pool:
        file_paths = []
        for root, _, files in os.walk(path):
            for file in files:
                if file.endswith(FileCheckConst.NUMPY_SUFFIX):
                    file_paths.append((os.path.join(root, file),))
        file_paths.sort()
        results = pool.starmap(process_file, file_paths)

    # Merge worker results into dump_data["data"].
    for op_name, key, tensor_json in results:
        if op_name:
            if op_name not in dump_data.get(CoreConst.DATA, {}):
                dump_data.get(CoreConst.DATA, {})[op_name] = {CoreConst.INPUT_ARGS: [],
                                                              CoreConst.INPUT_KWARGS: {},
                                                              CoreConst.OUTPUT: []}
            if key not in dump_data.get(CoreConst.DATA, {}).get(op_name, {}):
                dump_data.get(CoreConst.DATA, {}).get(op_name, {})[key] = []
            dump_data.get(CoreConst.DATA, {}).get(op_name, {}).get(key, []).append(tensor_json)

    # Order ops by their position in cell_list (dump order).
    data_dict = dump_data.get(CoreConst.DATA, {})
    key_to_index = {key: index for index, key in enumerate(cell_list)}
    sorted_data_dict = dict(sorted(data_dict.items(), key=lambda item: custom_sort(item, key_to_index)))
    dump_data[CoreConst.DATA] = sorted_data_dict

    # Write the aggregated dump.json.
    json_path = os.path.join(os.path.dirname(path), 'dump.json')
    save_json(json_path, dump_data, indent=1)

    logger.info(f"Dump data saved to {json_path}")
+
+
def generate_stack_info(path):
    """Write a stack.json next to *path* mapping every dumped op name to an
    empty call stack (cell-level dump records no Python stacks)."""
    if not os.path.exists(path):
        logger.error("The provided path does not exist.")
        return

    # *path* is the tool-generated ./dump_tensor_data directory of .npy files.
    npy_files = []
    for root, _, files in os.walk(path):
        npy_files.extend(
            os.path.join(root, name)
            for name in files
            if name.endswith(FileCheckConst.NUMPY_SUFFIX)
        )
    npy_files.sort()

    stack_data = {}
    for npy_file in npy_files:
        # e.g. Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0_float32_165.npy
        parts = os.path.basename(npy_file).split(CoreConst.SEP)
        # Op name: Cell.network._backbone.loss.CrossEntropyLoss.forward.0
        stack_data[CoreConst.SEP.join(parts[:-3])] = []

    # Persist stack.json alongside the dump directory.
    json_path = os.path.join(os.path.dirname(path), 'stack.json')
    save_json(json_path, stack_data, indent=1)

    logger.info(f"Stack data saved to {json_path}")
+
+
def process(dump_path):
    """Post-process everything dumped under *dump_path*.

    Registered with atexit in start(): for every step directory it
    de-duplicates and renames the raw files, then emits construct.json,
    dump.json and stack.json.
    """
    logger.info(f"==========Start processing data that has already been stored on the disk!==========")
    # The rank directory comes from the RANK_ID env var, defaulting to rank0.
    rank_id = os.environ.get('RANK_ID')
    rank_dir = DEFAULT_RANK_DIR
    if rank_id is not None:
        rank_dir = CoreConst.RANK + str(rank_id)

    step_dir_list = os.listdir(dump_path)
    for step_dir in step_dir_list:
        step_path = os.path.join(dump_path, step_dir)
        rank_path = os.path.join(step_path, rank_dir)
        npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
        rename_filename(npy_path)
        generate_construct(npy_path)
        generate_dump_info(npy_path)
        generate_stack_info(npy_path)
+
+
def start(net=None, dump_path="./", data_mode=CoreConst.ALL):
    """Instrument every sub-cell of *net* for graph-mode cell-level dump.

    Wraps each cell's construct (see cell_construct_wrapper), attaches the
    dump metadata and gradient hooks to the cell, and registers the offline
    post-processing (process) to run at interpreter exit. No-op when *net*
    is None.
    """
    if net is None:
        return

    # NOTE(review): "" presumably matches the root cell yielded by
    # cells_and_names(); "grad_reducer" is framework plumbing — confirm.
    black_list = ["grad_reducer", ""]
    for name, cell in net.cells_and_names():
        class_name = cell.__class__.__name__
        # Skip blacklisted cells.
        if name in black_list:
            logger.info(f"Cell {name}.{class_name} is skipped!")
            continue
        # Skip framework-internal cells (class names starting with "_").
        if class_name.startswith(CoreConst.REPLACEMENT_CHARACTER):
            logger.info(f"Cell {name}.{class_name} is skipped!")
            continue
        else:
            # Format: Cell.{cell_name}.{class_name}
            cell.cell_prefix = CoreConst.SEP.join([CoreConst.CELL, name, cell.__class__.__name__])

        cell.construct = cell_construct_wrapper(cell.construct, cell)
        logger.info(f"Cell {name}: construct function is wrapped!")
        cell.dump_path = dump_path
        cell.data_mode = data_mode
        cell.input_clips = []
        cell.output_clips = []
        # It is assumed that each cell has a maximum of 50 outputs and 50 inputs.
        for i in range(50):
            cell.input_clips.append(ops.InsertGradientOf(partial_func(clip_gradient, cell.dump_path, cell.cell_prefix, i, CoreConst.INPUT)))
            cell.output_clips.append(ops.InsertGradientOf(partial_func(clip_gradient, cell.dump_path, cell.cell_prefix, i, CoreConst.OUTPUT)))

    logger.info(f"==========The cell_dump_process_start phase is Finished!==========")
    atexit.register(process, dump_path=dump_path)
diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py
index 0ca63b4a84..c0933d20aa 100644
--- a/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py
+++ b/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py
@@ -17,13 +17,14 @@ from msprobe.mindspore.common.const import Const
from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
from msprobe.mindspore.dump.kernel_graph_dump import KernelGraphDump
from msprobe.mindspore.dump.kernel_kbyk_dump import KernelKbykDump
+from msprobe.mindspore.dump.graph_mode_cell_dump import GraphModeCellDump
class DumpToolFactory:
tools = {
Const.CELL: {
- Const.GRAPH_KBYK_MODE: None,
- Const.GRAPH_GE_MODE: None,
+ Const.GRAPH_KBYK_MODE: GraphModeCellDump,
+ Const.GRAPH_GE_MODE: GraphModeCellDump,
Const.PYNATIVE_MODE: None
},
Const.API: {
@@ -39,9 +40,13 @@ class DumpToolFactory:
}
@staticmethod
- def create(config: DebuggerConfig):
- if len(config.data_mode) != 1 or config.data_mode[0] not in Const.GRAPH_DATA_MODE_LIST:
- raise Exception("data_mode must be one of all, input, output.")
+ def create(config: DebuggerConfig, model):
+ if config.level == Const.CELL:
+ if len(config.data_mode) != 1 or config.data_mode[0] not in Const.GRAPH_CELL_DUMP_DATA_MODE_LIST:
+ raise Exception("data_mode must be one of all, forward, backward.")
+ else:
+ if len(config.data_mode) != 1 or config.data_mode[0] not in Const.GRAPH_DATA_MODE_LIST:
+ raise Exception("data_mode must be one of all, input, output.")
tool = DumpToolFactory.tools.get(config.level)
if not tool:
raise Exception("Valid level is needed.")
@@ -49,4 +54,4 @@ class DumpToolFactory:
if not tool:
raise Exception(f"Data dump is not supported in {config.execution_mode} mode "
f"when dump level is {config.level}.")
- return tool(config)
+ return tool(config, model) if tool == GraphModeCellDump else tool(config)
diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/graph_mode_cell_dump.py b/debug/accuracy_tools/msprobe/mindspore/dump/graph_mode_cell_dump.py
new file mode 100644
index 0000000000..e32866868f
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/mindspore/dump/graph_mode_cell_dump.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from msprobe.mindspore.common.log import logger
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+import mindspore as ms
+from mindspore._c_expression import _tensordump_set_step
+from mindspore.ops.primitive import _run_op
+from mindspore import hal, ops
+import msprobe.mindspore.dump.cell_dump_process as cellDumper
+from msprobe.mindspore.common.const import Const
+
+
+class GraphModeCellDump:
+ def __init__(self, config: DebuggerConfig, model):
+ self.net = model
+ self.white_list = []
+ self.black_list = []
+ self.dump_path = config.dump_path if config.dump_path else "./"
+ self.rank = config.rank
+ self.step = config.step
+ self.scope = config.scope
+ self.list = config.list
+ self.data_mode = config.data_mode
+ self.file_format = config.file_format
+ self.check_config()
+ self.set_step()
+
+ @staticmethod
+ def step():
+ hal.synchronize()
+ temp_tensor = ms.Tensor([1], dtype=ms.float32)
+ step_flag = ""
+ _run_op(ops.TensorDump(), "TensorDump", (step_flag, temp_tensor))
+ ops.tensordump(step_flag, temp_tensor)
+
+ def check_config(self):
+ if self.rank != []:
+ raise Exception("In graph mode, cell dump does not currently support specifying rank.")
+ if self.scope != []:
+ raise Exception("In graph mode, cell dump does not currently support specifying scope.")
+ if self.list != []:
+ raise Exception("In graph mode, cell dump does not currently support specifying list.")
+ if len(self.data_mode) != 1 or self.data_mode[0] not in Const.GRAPH_CELL_DUMP_DATA_MODE_LIST:
+ raise Exception("In graph mode and cell dump, data_mode must be one of all, forword, backword.")
+ if self.file_format != []:
+ logger.warning("In graph mode, cell dump does not currently support specifying file_format. The file will be stored in npy format.")
+ if not self.net:
+ raise Exception("The model is empty and cell dump is not enabled.")
+ return True
+
+ def set_step(self):
+ _tensordump_set_step(self.step)
+
+ def handle(self):
+ os.environ['MS_JIT_MODULES'] = 'msprobe'
+ cellDumper.start(net=self.net, dump_path=self.dump_path, data_mode=self.data_mode[0])
diff --git a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
index a9cb5e6dd4..5cfbbaeb4a 100644
--- a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
+++ b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
@@ -29,11 +29,14 @@ class TaskHandlerFactory:
}
@staticmethod
- def create(config: DebuggerConfig):
+ def create(config: DebuggerConfig, model):
task = TaskHandlerFactory.tasks.get(config.task)
if not task:
raise Exception("Valid task is needed.")
- handler = task.create(config)
+ if task == DumpToolFactory:
+ handler = task.create(config, model)
+ else:
+ handler = task.create(config)
if not handler:
raise Exception("Can not find task handler")
return handler
diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/debugger/test_graph_cell_dump.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/debugger/test_graph_cell_dump.py
new file mode 100644
index 0000000000..b111e64437
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/debugger/test_graph_cell_dump.py
@@ -0,0 +1,309 @@
+# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import re
+import unittest
+from unittest.mock import MagicMock, patch
+
+import mindspore as ms
+from mindspore import ops
+
+from msprobe.core.common.const import Const as CoreConst
+from msprobe.mindspore.dump.cell_dump_process import generate_file_path
+from msprobe.mindspore.dump.cell_dump_process import partial_func, clip_gradient
+from msprobe.mindspore.dump.cell_dump_process import cell_construct_wrapper
+from msprobe.mindspore.dump.cell_dump_process import rename_filename, sort_filenames, del_same_file
+from msprobe.mindspore.dump.cell_dump_process import check_relation
+
+
+class TestGenerateFilePath(unittest.TestCase):
+    """generate_file_path should join dump_path, literal {step}/{rank} placeholders,
+    the tensor-data dir constant, and the dot-joined name parts."""
+
+    def setUp(self):
+        self.dump_path = "/path"
+        self.cell_prefix = "Cell.network._backbone.LlamaForCausalLM"
+        self.suffix = "forward"
+        self.io_type = "input"
+        self.index = 0
+
+    def test_generate_file_path(self):
+        # {step} and {rank} stay as literal placeholders, filled in later by the dump runtime.
+        expected_path = os.path.join(
+            self.dump_path,
+            "{step}",
+            "{rank}",
+            CoreConst.DUMP_TENSOR_DATA,
+            CoreConst.SEP.join([self.cell_prefix, self.suffix, self.io_type, str(self.index)])
+        )
+        result = generate_file_path(self.dump_path, self.cell_prefix, self.suffix, self.io_type, self.index)
+        self.assertEqual(result, expected_path)
+
+
+class TestPartialFunc(unittest.TestCase):
+    """Tests for clip_gradient (the backward-pass TensorDump hook) and partial_func."""
+
+    @patch('msprobe.mindspore.dump.cell_dump_process.CoreConst')
+    @patch('msprobe.mindspore.dump.cell_dump_process.td')
+    @patch('msprobe.mindspore.dump.cell_dump_process.td_in')
+    @patch('msprobe.mindspore.dump.cell_dump_process.generate_file_path')
+    @patch('msprobe.mindspore.dump.cell_dump_process.ops.depend')
+    def test_clip_gradient_output(self, mock_depend, mock_generate_file_path, mock_td_in, mock_td, mock_CoreConst):
+        # For "output" gradients the plain TensorDump op (td) must be used.
+        mock_CoreConst.OUTPUT = "output"
+        mock_CoreConst.BACKWARD = "backward"
+        mock_generate_file_path.return_value = "mock_path"
+        mock_td.return_value = "temp_tensor"
+        mock_depend.return_value = "dependent_tensor"
+
+        result = clip_gradient("dump_path", "cell_prefix", 0, "output", "dx")
+
+        mock_generate_file_path.assert_called_with("dump_path", "cell_prefix", "backward", "output", 0)
+        mock_td.assert_called_with("mock_path", "dx")
+        # ops.depend chains the dump onto the gradient so graph pruning keeps it.
+        mock_depend.assert_called_with("dx", "temp_tensor")
+        self.assertEqual(result, "dependent_tensor")
+
+    @patch('msprobe.mindspore.dump.cell_dump_process.CoreConst')
+    @patch('msprobe.mindspore.dump.cell_dump_process.td')
+    @patch('msprobe.mindspore.dump.cell_dump_process.td_in')
+    @patch('msprobe.mindspore.dump.cell_dump_process.generate_file_path')
+    @patch('msprobe.mindspore.dump.cell_dump_process.ops.depend')
+    def test_clip_gradient_input(self, mock_depend, mock_generate_file_path, mock_td_in, mock_td, mock_CoreConst):
+        # For "input" gradients the input-side TensorDump op (td_in) must be used instead.
+        mock_CoreConst.INPUT = "input"
+        mock_CoreConst.BACKWARD = "backward"
+        mock_generate_file_path.return_value = "mock_path"
+        mock_td_in.return_value = "temp_tensor"
+        mock_depend.return_value = "dependent_tensor"
+
+        result = clip_gradient("dump_path", "cell_prefix", 0, "input", "dx")
+
+        mock_generate_file_path.assert_called_with("dump_path", "cell_prefix", "backward", "input", 0)
+        mock_td_in.assert_called_with("mock_path", "dx")
+        mock_depend.assert_called_with("dx", "temp_tensor")
+        self.assertEqual(result, "dependent_tensor")
+
+    def test_partial_func(self):
+        # partial_func pre-binds the first four arguments and forwards any extras.
+        def mock_func(dump_path, cell_prefix, index, io_type, *args, **kwargs):
+            return dump_path, cell_prefix, index, io_type, args, kwargs
+
+        new_func = partial_func(mock_func, "dump_path", "cell_prefix", 0, "io_type")
+        result = new_func("arg1", "arg2", kwarg1="value1")
+
+        self.assertEqual(result, ("dump_path", "cell_prefix", 0, "io_type", ("arg1", "arg2"), {'kwarg1': 'value1'}))
+
+
+class TestCellWrapperProcess(unittest.TestCase):
+    """cell_construct_wrapper must dump inputs/outputs around the wrapped construct
+    while returning the wrapped function's result unchanged."""
+
+    @patch('msprobe.mindspore.dump.cell_dump_process.generate_file_path')
+    @patch('msprobe.mindspore.dump.cell_dump_process.td')
+    @patch('msprobe.mindspore.dump.cell_dump_process.td_in')
+    def test_cell_construct_wrapper(self, mock_td_in, mock_td, mock_generate_file_path):
+        # Mock the generate_file_path function
+        mock_generate_file_path.return_value = "mock_path"
+
+        # Mock the TensorDump operations
+        mock_td.return_value = MagicMock()
+        mock_td_in.return_value = MagicMock()
+
+        # Create a mock cell with necessary attributes
+        mock_cell = MagicMock()
+        mock_cell.data_mode = "all"
+        mock_cell.dump_path = "mock_dump_path"
+        mock_cell.cell_prefix = "mock_cell_prefix"
+        # presumably 50 covers the max number of inputs/outputs a cell can have — TODO confirm
+        mock_cell.input_clips = [MagicMock() for _ in range(50)]
+        mock_cell.output_clips = [MagicMock() for _ in range(50)]
+
+        # Define a mock function to wrap
+        def mock_func(*args, **kwargs):
+            return args
+
+        # Wrap the mock function using cell_construct_wrapper
+        wrapped_func = cell_construct_wrapper(mock_func, mock_cell)
+
+        # Create mock inputs
+        mock_input = ms.Tensor([1, 2, 3])
+        mock_args = (mock_input,)
+
+        # Call the wrapped function
+        result = wrapped_func(mock_cell, *mock_args)
+
+        # The wrapper must be transparent: same result as the unwrapped function.
+        self.assertEqual(result, mock_args)
+
+        # Verify that the TensorDump operations were called
+        mock_td_in.assert_called()
+        mock_td.assert_called()
+
+    @patch('msprobe.mindspore.dump.cell_dump_process.generate_file_path')
+    @patch('msprobe.mindspore.dump.cell_dump_process.td')
+    @patch('msprobe.mindspore.dump.cell_dump_process.td_in')
+    def test_cell_construct_wrapper_with_tuple_output(self, mock_td_in, mock_td, mock_generate_file_path):
+        # Mock the generate_file_path function
+        mock_generate_file_path.return_value = "mock_path"
+
+        # Mock the TensorDump operations
+        mock_td.return_value = MagicMock()
+        mock_td_in.return_value = MagicMock()
+
+        # Create a mock cell with necessary attributes
+        mock_cell = MagicMock()
+        mock_cell.data_mode = "all"
+        mock_cell.dump_path = "mock_dump_path"
+        mock_cell.cell_prefix = "mock_cell_prefix"
+        mock_cell.input_clips = [MagicMock() for _ in range(50)]
+        mock_cell.output_clips = [MagicMock() for _ in range(50)]
+
+        # Define a mock function to wrap; returns a tuple to exercise the multi-output path.
+        def mock_func(*args, **kwargs):
+            return (args[0], args[0])
+
+        # Wrap the mock function using cell_construct_wrapper
+        wrapped_func = cell_construct_wrapper(mock_func, mock_cell)
+
+        # Create mock inputs
+        mock_input = ms.Tensor([1, 2, 3])
+        mock_args = (mock_input,)
+
+        # Call the wrapped function
+        result = wrapped_func(mock_cell, *mock_args)
+
+        # Check if the result is as expected
+        self.assertEqual(result, (mock_input, mock_input))
+
+        # Verify that the TensorDump operations were called
+        mock_td_in.assert_called()
+        mock_td.assert_called()
+
+
+class TestSortFilenames(unittest.TestCase):
+    """sort_filenames must order dump files by the trailing TensorDump auto-increment id."""
+
+    @patch('os.listdir')
+    def test_sort_filenames(self, mock_listdir):
+        # Mock the list of filenames returned by os.listdir
+        mock_listdir.return_value = [
+            'Cell.network._backbone.model.LlamaModel.backward.0.input.0_float16_177.npy',
+            'Cell.network._backbone.model.LlamaModel.forward.0.input.0_in_int32_1.npy',
+            'Cell.network._backbone.model.LlamaModel.forward.0.output.10_float16_165.npy',
+            'Cell.network._backbone.model.norm_out.LlamaRMSNorm.backward.0.input.0_float16_178.npy'
+        ]
+
+        # Mock the CoreConst values
+        # NOTE(review): this assigns to the shared CoreConst class and is never restored,
+        # so it leaks into every later test — prefer patch.object; confirm no other test
+        # depends on the original values.
+        CoreConst.REPLACEMENT_CHARACTER = '_'
+        CoreConst.NUMPY_SUFFIX = '.npy'
+
+        # Expected sorted filenames (ascending by the numeric id before ".npy")
+        expected_sorted_filenames = [
+            'Cell.network._backbone.model.LlamaModel.forward.0.input.0_in_int32_1.npy',
+            'Cell.network._backbone.model.LlamaModel.forward.0.output.10_float16_165.npy',
+            'Cell.network._backbone.model.LlamaModel.backward.0.input.0_float16_177.npy',
+            'Cell.network._backbone.model.norm_out.LlamaRMSNorm.backward.0.input.0_float16_178.npy'
+        ]
+
+        # Call the function
+        sorted_filenames = sort_filenames('/mock/path')
+
+        # Assert the filenames are sorted correctly
+        self.assertEqual(sorted_filenames, expected_sorted_filenames)
+
+
+class TestRenameFilename(unittest.TestCase):
+    """rename_filename must convert raw TensorDump names into the
+    Cell_..._Name.{forward|backward}.{call_index}.{io}_{idx}_{dtype}_{id}.npy layout,
+    incrementing the call index when the same cell appears again."""
+
+    @patch('msprobe.mindspore.dump.cell_dump_process.sort_filenames')
+    @patch('msprobe.mindspore.dump.cell_dump_process.del_same_file')
+    @patch('msprobe.mindspore.dump.cell_dump_process.os.rename')
+    def test_rename_filename(self, mock_rename, mock_del_same_file, mock_sort_filenames):
+        # Mock the constants
+        # NOTE(review): mutates shared CoreConst without restoring — see TestSortFilenames.
+        CoreConst.REPLACEMENT_CHARACTER = '_'
+        CoreConst.FORWARD_PATTERN = '.forward.'
+        CoreConst.BACKWARD_PATTERN = '.backward.'
+        CoreConst.SEP = '.'
+
+        # Mock the filenames
+        mock_sort_filenames.return_value = [
+            "Cell.learning_rate.CosineWithWarmUpLR.forward.input.0_int32_101.npy",
+            "Cell.learning_rate.CosineWithWarmUpLR.forward.output.0_float32_102.npy",
+            "Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.input.0_float32_103.npy",
+            "Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.input.1_bool_104.npy",
+            "Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.output.1_bool_105.npy",
+            "Cell.learning_rate.CosineWithWarmUpLR.forward.input.0_int32_111.npy",
+            "Cell.learning_rate.CosineWithWarmUpLR.forward.output.0_float32_112.npy",
+        ]
+        # NOTE(review): this wraps the filename list inside another list; the empty case
+        # below uses a flat []. If del_same_file returns a flat list of names this should
+        # probably be mock_sort_filenames.return_value without the brackets — confirm.
+        mock_del_same_file.return_value = [mock_sort_filenames.return_value]
+
+        # Call the function
+        rename_filename('/mock/path')
+
+        # Check if os.rename was called with the correct arguments
+        mock_rename.assert_any_call(
+            '/mock/path/Cell.learning_rate.CosineWithWarmUpLR.forward.input.0_int32_101.npy',
+            '/mock/path/Cell_learning_rate_CosineWithWarmUpLR.forward.0.input_0_int32_101.npy'
+        )
+        mock_rename.assert_any_call(
+            '/mock/path/Cell.learning_rate.CosineWithWarmUpLR.forward.output.0_float32_102.npy',
+            '/mock/path/Cell_learning_rate_CosineWithWarmUpLR.forward.0.output_0_float32_102.npy'
+        )
+        mock_rename.assert_any_call(
+            '/mock/path/Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.input.0_float32_103.npy',
+            '/mock/path/Cell_loss_scaling_manager_DynamicLossScaleUpdateCell.backward.0.input_0_float32_103.npy'
+        )
+        mock_rename.assert_any_call(
+            '/mock/path/Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.input.1_bool_104.npy',
+            '/mock/path/Cell_loss_scaling_manager_DynamicLossScaleUpdateCell.backward.0.input_1_bool_104.npy'
+        )
+        mock_rename.assert_any_call(
+            '/mock/path/Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.output.1_bool_105.npy',
+            '/mock/path/Cell_loss_scaling_manager_DynamicLossScaleUpdateCell.backward.0.output_1_bool_105.npy'
+        )
+        # Second occurrence of the same cell: call index advances from 0 to 1.
+        mock_rename.assert_any_call(
+            '/mock/path/Cell.learning_rate.CosineWithWarmUpLR.forward.input.0_int32_111.npy',
+            '/mock/path/Cell_learning_rate_CosineWithWarmUpLR.forward.1.input_0_int32_111.npy'
+        )
+        mock_rename.assert_any_call(
+            '/mock/path/Cell.learning_rate.CosineWithWarmUpLR.forward.output.0_float32_112.npy',
+            '/mock/path/Cell_learning_rate_CosineWithWarmUpLR.forward.1.output_0_float32_112.npy'
+        )
+
+        # Mock the filenames (empty directory case)
+        mock_sort_filenames.return_value = []
+        mock_del_same_file.return_value = []
+
+        # Call the function
+        rename_filename('/mock/path')
+
+        # Check if os.rename was not called
+        mock_rename.assert_not_called()
+
+
+class TestCheckRelation(unittest.TestCase):
+
+ def setUp(self):
+ CoreConst.SEP = '.'
+ global KEY_LAYERS
+ KEY_LAYERS = "layers"
+
+ def test_direct_parent_child_relation(self):
+ self.assertTrue(check_relation("network._backbone", "network"))
+ self.assertTrue(check_relation("network._backbone.model", "network._backbone"))
+
+ def test_no_relation(self):
+ self.assertFalse(check_relation("network._backbone", "network.loss"))
+ self.assertFalse(check_relation("network._backbone.model", "network.loss"))
+
+ def test_layer_pattern_relation(self):
+ self.assertTrue(check_relation("network.model.layers.0", "network.model"))
+ self.assertTrue(check_relation("network._backbone.model.layers.1", "network._backbone.model"))
+
+ def test_no_layer_pattern_relation(self):
+ self.assertFalse(check_relation("network.model.layers.0", "network.loss"))
+ self.assertFalse(check_relation("network._backbone.model.layers.1", "network._backbone.model.layers"))
+
+ def test_edge_cases(self):
+ self.assertFalse(check_relation("", "network"))
+ self.assertFalse(check_relation("network.layer1", ""))
+ self.assertFalse(check_relation("", ""))
--
Gitee
From 3ac2ba51aefc8800092cb5f4b064f5a31389ba73 Mon Sep 17 00:00:00 2001
From: fuchao <1501312275@qq.com>
Date: Mon, 17 Feb 2025 18:18:33 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E4=BC=98=E5=8C=96=E9=9D=99=E6=80=81?=
=?UTF-8?q?=E5=9B=BEcell=E7=BA=A7dump=E5=A4=84=E7=90=86=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../mindspore/dump/cell_dump_process.py | 50 +++++++++++++++++--
1 file changed, 47 insertions(+), 3 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
index a21c4590b8..a9121e1435 100644
--- a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
+++ b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
@@ -133,7 +133,7 @@ def cell_construct_wrapper(func, self):
out = self.input_clips[0](out)
if self.data_mode == "forward" or self.data_mode == "all":
if ops.is_tensor(out):
- temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), out)
+ temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, 0), out)
out = ops.depend(out, temp)
return out
@@ -302,6 +302,21 @@ def process_file(file_path):
ms_dtype = np_ms_dtype_dict.get(data_dtype)
if ms_dtype is None:
logger.warning(f"Get dtype None from file {file_path}")
+
+    # Rename the dumped file: strip the dtype and auto-increment id fields that TensorDump appends
+ data_file_name = os.path.basename(file_path)
+ data_file_dir = os.path.dirname(file_path)
+ parts = data_file_name.split(CoreConst.SEP)
+ if len(parts) >= 2:
+ param_index = parts[-2].split(CoreConst.REPLACEMENT_CHARACTER)[0]
+ pre_parts = CoreConst.SEP.join(parts[:-2])
+ new_file_name = pre_parts + CoreConst.SEP + param_index + CoreConst.NUMPY_SUFFIX
+ os.rename(os.path.join(data_file_dir, data_file_name), os.path.join(data_file_dir, new_file_name))
+ logger.info(f"{data_file_name} is renamed to {new_file_name}")
+ else:
+ logger.warning(f"Failed to rename {data_file_name}.")
+ new_file_name = data_file_name
+
tensor_json = {
CoreConst.TYPE: 'mindspore.Tensor',
CoreConst.DTYPE: str(ms_dtype),
@@ -310,7 +325,7 @@ def process_file(file_path):
CoreConst.MIN: npy_content.min().item(),
CoreConst.MEAN: npy_content.mean().item(),
CoreConst.NORM: np.linalg.norm(npy_content).item(),
- CoreConst.DATA_NAME: os.path.basename(file_path)
+ CoreConst.DATA_NAME: new_file_name
}
# 根据文件名的最后一个部分(输入或输出)确定是添加到input_args还是output
@@ -398,8 +413,28 @@ def generate_stack_info(path):
logger.info(f"Stack data saved to {json_path}")
+def is_download_finished(directory, interval=3):
+    """
+    Check whether data in *directory* has finished being written/downloaded.
+
+    :param directory: path of the directory to watch
+    :param interval: polling interval in seconds (default 3); note the call blocks
+                     for this long on every invocation
+    :return: True if the directory's mtime did not change during the interval
+             (no write in progress), False otherwise or if the directory is missing
+    """
+    # The directory must exist before modification times can be compared.
+    if not os.path.exists(directory):
+        logger.warning(f"The specified directory {directory} does not exist.")
+        return False
+    initial_modification_time = os.path.getmtime(directory)
+    time.sleep(interval)
+    current_modification_time = os.path.getmtime(directory)
+    # Compare the initial and current modification times: a newer mtime means
+    # entries were added/changed during the interval, i.e. writing is ongoing.
+    # NOTE(review): mtime only reflects direct-entry changes of this directory,
+    # not of nested subdirectories — confirm dump files land directly here.
+    if current_modification_time > initial_modification_time:
+        return False
+    else:
+        return True
+
+
def process(dump_path):
- logger.info(f"==========Start processing data that has already been stored on the disk!==========")
rank_id = os.environ.get('RANK_ID')
rank_dir = DEFAULT_RANK_DIR
if rank_id is not None:
@@ -410,10 +445,19 @@ def process(dump_path):
step_path = os.path.join(dump_path, step_dir)
rank_path = os.path.join(step_path, rank_dir)
npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA)
+ while True:
+ is_finished = is_download_finished(npy_path)
+ if not is_finished:
+ logger.info(f"There is data being downloaded in the specified directory, continue checking...")
+ else:
+ logger.info(f"There is no data being downloaded in the specified directory, Stop checking.")
+ break
+ logger.info(f"==========Start processing data that has already been stored on the disk!==========")
rename_filename(npy_path)
generate_construct(npy_path)
generate_dump_info(npy_path)
generate_stack_info(npy_path)
+ logger.info(f"==========JSON file generation completed!==========")
def start(net=None, dump_path="./", data_mode=CoreConst.ALL):
--
Gitee