From f40ed9cfe095a3291704c7327437e6d54891329b Mon Sep 17 00:00:00 2001 From: wugengjun <451676383@qq.com> Date: Wed, 22 Jan 2025 21:04:56 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E9=9D=99=E6=80=81=E5=9B=BEcell=E7=BA=A7dum?= =?UTF-8?q?p?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/core/common/const.py | 5 + .../msprobe/docs/02.config_introduction.md | 20 +- .../msprobe/docs/06.data_dump_MindSpore.md | 62 ++- .../msprobe/mindspore/common/const.py | 1 + .../mindspore/debugger/precision_debugger.py | 6 +- .../mindspore/dump/cell_dump_process.py | 450 ++++++++++++++++++ .../mindspore/dump/dump_tool_factory.py | 17 +- .../mindspore/dump/graph_mode_cell_dump.py | 69 +++ .../msprobe/mindspore/task_handler_factory.py | 7 +- .../debugger/test_graph_cell_dump.py | 309 ++++++++++++ 10 files changed, 922 insertions(+), 24 deletions(-) create mode 100644 debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py create mode 100644 debug/accuracy_tools/msprobe/mindspore/dump/graph_mode_cell_dump.py create mode 100644 debug/accuracy_tools/msprobe/test/mindspore_ut/debugger/test_graph_cell_dump.py diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index b49b4fffd5..6824fc8b42 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -206,12 +206,14 @@ class Const: TORCH_FLOAT32 = "torch.float32" TORCH_BFLOAT16 = "torch.bfloat16" + TYPE = 'type' DTYPE = 'dtype' SHAPE = 'shape' MAX = 'Max' MIN = 'Min' MEAN = 'Mean' NORM = 'Norm' + DATA_NAME = 'data_name' CODE_STACK = 'Code Stack' OP_NAME = 'Op Name' @@ -224,6 +226,9 @@ class Const: SCOPE_SEPARATOR = "/" REPLACEMENT_CHARACTER = "_" + FORWARD_PATTERN = SEP + FORWARD + SEP + BACKWARD_PATTERN = SEP + BACKWARD + SEP + OPTIMIZER = "optimizer" CLIP_GRAD = "clip_grad" END_PREFIX = "end_" diff --git 
a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md index f134bd4536..5b2e6d5027 100644 --- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md +++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md @@ -10,19 +10,19 @@ ### 1.1 通用配置 -| 参数 | 解释 | 是否必选 | -| ----------------- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | -| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对;
"grad_probe":梯度监控;
"structure":仅采集模型结构以及调用栈信息,不采集具体数据。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe)。
**配置示例**:"task": "tensor"。 | 否 | -| dump_path | 设置 dump 数据目录路径,str 类型。
**配置示例**:"dump_path": "./dump_path"。 | 是 | -| rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型,默认未配置(表示采集所有卡的数据),应配置元素为 ≥0 的整数或类似"4-6"的字符串,且须配置实际可用的 Rank ID。
PyTorch 场景: Rank ID 从 0 开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的 Rank ID,则 dump 数据为空,比如当前环境 Rank ID 为 0 到 7,实际训练运行 0 到 3 卡,此时若配置 Rank ID 为 4 或不存在的 10 等其他值,dump 数据为空。
MindSpore 场景:所有节点的 Rank ID 均从 0 开始计数,最大取值为每个节点可用卡总数-1,config.json 配置一次 rank 参数对所有节点同时生效。
注意,单卡训练时,rank必须为[],即空列表,不能指定rank。
**配置示例**:"rank": [1, "4-6"]。 | 否 | -| step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。
**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 | -| level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:
"L0":dump 模块级精度数据,仅 PyTorch 与 MindSpore 动态图场景支持,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明);
"L1":dump API 级精度数据,默认值,仅 PyTorch 与 MindSpore 动态图场景支持;
"L2":dump kernel 级精度数据,PyTorch场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);MindSpore场景详细介绍见 [MindSpore 场景的 kernel dump 说明](./28.kernel_dump_MindSpore.md);
"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch 与 MindSpore 动态图场景支持。
"debug":单点保存功能,细节详见[单点保存工具 README](./28.debugger_save_instruction.md)
**配置示例**:"level": "L1"。 | 否 | -| enable_dataloader | 自动控制开关,bool 类型,仅 PyTorch 场景支持。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后自动识别 step 参数指定的迭代,并在该迭代执行完成后退出训练,此时 start、stop 和 step 函数可不配置,开启该开关要求训练脚本是通过 torch.utils.data.dataloader 方式加载数据。仅支持 PyTorch 单卡训练使用,分布式训练场景下存在数据 dump 不全问题。 **这个特性下个版本将被废弃** | 否 | +| 参数 | 解释 | 是否必选 | +| ----------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | +| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对;
"grad_probe":梯度监控。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe)。
**配置示例**:"task": "tensor"。 | 否 | +| dump_path | 设置 dump 数据目录路径,str 类型。
**配置示例**:"dump_path": "./dump_path"。 | 是 | +| rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型,默认未配置(表示采集所有卡的数据),应配置元素为 ≥0 的整数或类似"4-6"的字符串,且须配置实际可用的 Rank ID。
PyTorch 场景: Rank ID 从 0 开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的 Rank ID,则 dump 数据为空,比如当前环境 Rank ID 为 0 到 7,实际训练运行 0 到 3 卡,此时若配置 Rank ID 为 4 或不存在的 10 等其他值,dump 数据为空。
MindSpore 场景:所有节点的 Rank ID 均从 0 开始计数,最大取值为每个节点可用卡总数-1,config.json 配置一次 rank 参数对所有节点同时生效。静态图 L0 级别 dump 暂不支持指定rank。
注意,单卡训练时,rank必须为[],即空列表,不能指定rank。
**配置示例**:"rank": [1, "4-6"]。 | 否 | +| step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。
**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 | +| level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:
"L0":dump 模块级精度数据,PyTorch 与 MindSpore 均支持,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明);
"L1":dump API 级精度数据,默认值,仅 PyTorch 与 MindSpore 动态图场景支持;
"L2":dump kernel 级精度数据,PyTorch场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);
"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch 与 MindSpore 动态图场景支持。
**配置示例**:"level": "L1"。 | 否 | +| enable_dataloader | 自动控制开关,bool 类型,仅 PyTorch 场景支持。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后自动识别 step 参数指定的迭代,并在该迭代执行完成后退出训练,此时 start、stop 和 step 函数可不配置,开启该开关要求训练脚本是通过 torch.utils.data.dataloader 方式加载数据。仅支持 PyTorch 单卡训练使用,分布式训练场景下存在数据 dump 不全问题。 **这个特性下个版本将被废弃** | 否 | | async_dump | 异步 dump 开关,bool 类型。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式暂不支持复数类型 tensor
的统计量计算。 | 否 | #### 1.1.1 模块级精度数据 dump 说明 -仅 PyTorch 与 MindSpore 动态图场景支持。 +PyTorch 与 MindSpore 均支持。 大模型场景下,通常不是简单的利用自动迁移能力实现从 GPU 到 NPU 的训练脚本迁移,而是会对 NPU 网络进行一系列针对性的适配,因此,常常会造成迁移后的 NPU 模型存在部分子结构不能与 GPU 原始模型完全对应。模型结构不一致导致 API 调用类型及数量不一致,若直接按照 API 粒度进行精度数据 dump 和比对,则无法完全比对所有的 API。 @@ -46,7 +46,7 @@ MindSpore 静态图场景配置 kernel_name,可以是算子的名称列表,也可以指定算子类型("level": "L2"时不支持),还可以配置算子名称的正则表达式(当字符串符合“name-regex(xxx)”格式时,后台则会将其作为正则表达式。
配置示例:list: ["name-regex(Default/.+)"]
可匹配算子名称以“Default/”开头的所有算子。 data_modedump 数据过滤,str 类型。否 PyTorch 与 MindSpore 动态图场景:支持"all"、"forward"、"backward"、"input"和"output",除"all"外,其余参数可以自由组合。默认为["all"],即保存所有 dump 的数据。
配置示例:"data_mode": ["backward"] (仅保存反向数据)或 "data_mode": ["forward", "input"](仅保存前向的输入数据)。 - MindSpore 静态图场景:仅支持"all"、"input"和"output"参数,且各参数只能单独配置,不支持自由组合。
配置示例:"data_mode": ["all"]。 + MindSpore 静态图场景:L0 级别 dump 仅支持"all"、"forward"和"backward"参数;L2 级别 dump 仅支持"all"、"input"和"output"参数。且各参数只能单独配置,不支持自由组合。
配置示例:"data_mode": ["all"]。 summary_mode控制 dump 文件输出的模式,str 类型,仅 PyTorch 与 MindSpore 动态图场景支持,可选参数:
md5:dump 输出包含 CRC-32 值以及 API 统计信息的 dump.json 文件,用于验证数据的完整性;
statistics:dump 仅输出包含 API 统计信息的 dump.json 文件,默认值。
配置示例:"summary_mode": "md5"。否MindSpore静态图jit_level=O2场景L2级dump,支持上述配置的同时额外支持配置统计项列表,可选统计项为max、min、mean、l2norm,可从中任意选取组合搭配。其中mean、l2norm的结果为float数据格式。
配置示例:"summary_mode": ["max", "min"]。 diff --git a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md index f7507facd2..0ee33b44a8 100644 --- a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md @@ -30,8 +30,10 @@ dump 的"tensor"模式采集数据量大小,可以参考[数据量基线](data ## 5. 场景介绍 -### 5.1 静态图场景 -在静态图场景下,msprobe 仅支持 **L2 Level** 的数据采集。 +### 5.1 静态图场景 +在静态图场景下,msprobe 支持 **L0 Level** 和 **L2 Level** 的数据采集。 +- **L0 Level(Cell 级)** :采集 `Cell` 对象的数据,适用于需要分析特定网络模块的情况。 + - **L2 Level(Kernel 级)** :采集底层算子的输入输出数据,适用于深入分析算子级别的精度问题。 采集方式请参见[示例代码 > 静态图场景](#71-静态图场景)。详细介绍请参见[《config.json 配置文件介绍》](./02.config_introduction.md#11-通用配置)中的“level 参数”和[《config.json 配置示例》](./03.config_examples.md#2-mindspore-静态图场景) 中的“MindSpore 静态图场景”。 @@ -110,7 +112,7 @@ stop() **功能说明**:结束一个 step 的数据采集,完成所有数据落盘并更新 dump 参数。在一个 step 结束的位置添加,且必须在 **stop** 函数之后的位置调用。 该函数需要配合 **start** 和 **stop** 函数使用,尽量添加在反向计算代码之后,否则可能会导致反向数据丢失。 -**仅未使用 Model 高阶 API 的动态图场景支持。** +**仅未使用 Model 高阶 API 的动态图和静态图场景支持。** **原型**: @@ -152,7 +154,7 @@ save(variable, name, save_backward=True) ### 6.2 msprobe.mindspore.common.utils.MsprobeStep -**功能说明**:MindSpore Callback类,自动在每个step开始时调用start()接口,在每个step结束时调用stop()、step()接口。实现使用 Model 高阶 API 的动态图场景下 L0、L1、mix 级别的精度数据采集控制,控制粒度为单个 **Step** ,而 PrecisionDebugger.start, PrecisionDebugger.stop 接口的控制粒度任意训练代码段。 +**功能说明**:MindSpore Callback类,自动在每个step开始时调用start()接口,在每个step结束时调用stop()、step()接口。实现使用 Model 高阶 API 的动态图场景下 L0、L1、mix 级别,和静态图场景下 L0级别的精度数据采集控制,控制粒度为单个 **Step** ,而 PrecisionDebugger.start, PrecisionDebugger.stop 接口的控制粒度任意训练代码段。 **原型**: @@ -188,6 +190,54 @@ seed_all(seed=1234, mode=False, rm_dropout=True) ### 7.1 静态图场景 +#### 7.1.1 L0 级别 + +##### 7.1.1.1 未使用 Model 高阶 API + + +```python +import mindspore as ms +ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") + +from msprobe.mindspore import PrecisionDebugger +debugger = 
PrecisionDebugger(config_path="./config.json") + +# 模型、损失函数的定义以及初始化等操作 +# ... +model = Network() +# 数据集迭代的地方往往是模型开始训练的地方 +for data, label in data_loader: + debugger.start(model) # 进行 L0 级别下Cell 对象的数据采集时调用 + # 如下是模型每个 step 执行的逻辑 + grad_net = ms.grad(model)(data) + # ... + debugger.step() # 更新迭代数 +``` + +##### 7.1.1.2 使用 Model 高阶 API + + +```python +import mindspore as ms +from mindspore.train import Model +ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") + +from msprobe.mindspore import PrecisionDebugger +from msprobe.mindspore.common.utils import MsprobeStep +debugger = PrecisionDebugger(config_path="./config.json") + +# 模型、损失函数的定义以及初始化等操作 +# ... + +model = Network() +# 进行 L0 级别下 Cell 对象的数据采集时调用 +debugger.start(model) +trainer = Model(model, loss_fn=loss_fn, optimizer=optimizer, metrics={'accuracy'}) +trainer.train(1, train_dataset, callbacks=[MsprobeStep(debugger)]) +``` + +#### 7.1.2 L2 级别 + ```python import mindspore as ms ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") @@ -301,7 +351,9 @@ trainer.train(1, train_dataset) ### 8.1 静态图场景 -训练结束后,数据将保存在 `dump_path` 指定的目录下。 +训练结束后,数据将保存在 `dump_path` 指定的目录下。
+L0 级别 dump 的目录结构与动态图场景下目录结构一致。
+L2 级别 dump 的目录结构如下所示: 若jit_level=O2,且使用mindstudio-probe发布包或源码编包时添加了`--include-mod=adump`选项,目录结构示例如下: ``` diff --git a/debug/accuracy_tools/msprobe/mindspore/common/const.py b/debug/accuracy_tools/msprobe/mindspore/common/const.py index 067e783842..b41dc5ce01 100644 --- a/debug/accuracy_tools/msprobe/mindspore/common/const.py +++ b/debug/accuracy_tools/msprobe/mindspore/common/const.py @@ -61,6 +61,7 @@ class Const: DROPOUT_API_NAME_PREFIX = "dropout" GRAPH_DATA_MODE_LIST = [CoreConst.ALL, CoreConst.INPUT, CoreConst.OUTPUT] + GRAPH_CELL_DUMP_DATA_MODE_LIST = [CoreConst.ALL, CoreConst.FORWARD, CoreConst.BACKWARD] HOOK_MS_PREFIX_DICT = { OPS_DATA_PREFIX: OPS_PREFIX, diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py index 7694d71dd9..a7082d3e56 100644 --- a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py +++ b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py @@ -34,6 +34,7 @@ from msprobe.mindspore.ms_config import parse_json_config from msprobe.mindspore.runtime import Runtime from msprobe.mindspore.service import Service from msprobe.mindspore.task_handler_factory import TaskHandlerFactory +from msprobe.mindspore.dump.graph_mode_cell_dump import GraphModeCellDump try: from msprobe.lib import _msprobe_c @@ -164,7 +165,7 @@ class PrecisionDebugger: else: if not instance.first_start: api_register.api_set_ori_func() - handler = TaskHandlerFactory.create(instance.config) + handler = TaskHandlerFactory.create(instance.config, model) handler.handle() instance.first_start = True @@ -199,6 +200,9 @@ class PrecisionDebugger: _msprobe_c._PrecisionDebugger().step() if instance.task in PrecisionDebugger.task_not_need_service: return + if instance.config.execution_mode != MsConst.PYNATIVE_MODE and instance.config.level == MsConst.CELL: + GraphModeCellDump.step() + return if instance.service: instance.service.step() HOOKCell.cell_count 
= defaultdict(int) diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py new file mode 100644 index 0000000000..a21c4590b8 --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py @@ -0,0 +1,450 @@ +# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import time +import re +import json +import atexit +from multiprocessing import Pool + +import numpy as np +import mindspore as ms +from mindspore import nn, ops + +from msprobe.mindspore.common.log import logger +from msprobe.core.common.const import Const as CoreConst +from msprobe.core.common.file_utils import load_npy, save_json, remove_path +from msprobe.core.common.const import FileCheckConst + + +CONSTRUCT_FILE_NAME = "construct.json" +DEFAULT_RANK_DIR = "rank0" +KEY_LAYERS = "layers" +construct = {} +cell_list = [] +KEY_SIDE_EFFECT = "side_effect_io" +td = ops.TensorDump() +td_in = ops.TensorDump("in") +td.add_prim_attr(KEY_SIDE_EFFECT, False) +td_in.add_prim_attr(KEY_SIDE_EFFECT, False) +np_ms_dtype_dict = { + "bool": ms.bool_, + "int8": ms.int8, + "byte": ms.byte, + "int16": ms.int16, + "short": ms.short, + "int32": ms.int32, + "intc": ms.intc, + "int64": ms.int64, + "intp": ms.intp, + "uint8": ms.uint8, + "ubyte": ms.ubyte, + "uint16": ms.uint16, + "ushort": ms.ushort, + "uint32": ms.uint32, + "uintc": 
ms.uintc, + "uint64": ms.uint64, + "uintp": ms.uintp, + "float16": ms.float16, + "half": ms.half, + "float32": ms.float32, + "single": ms.single, + "float64": ms.float64, + "double": ms.double, + "bfloat16": ms.bfloat16, + "complex64": ms.complex64, + "complex128": ms.complex128 +} + + +def generate_file_path(dump_path, cell_prefix, suffix, io_type, index): + step_path = os.path.join(dump_path, "{step}") + rank_path = os.path.join(step_path, "{rank}") + data_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA) + file_name = CoreConst.SEP.join([cell_prefix, suffix, io_type, str(index)]) + return os.path.join(data_path, file_name) + + +def partial_func(func, dump_path, cell_prefix, index, io_type): + def newfunc(*args, **kwargs): + return func(dump_path, cell_prefix, index, io_type, *args, **kwargs) + return newfunc + + +def clip_gradient(dump_path, cell_prefix, index, io_type, dx): + if io_type == CoreConst.OUTPUT: + temp = td(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx) + dx = ops.depend(dx, temp) + if io_type == CoreConst.INPUT: + temp = td_in(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx) + dx = ops.depend(dx, temp) + return dx + + +def cell_construct_wrapper(func, self): + def new_construct(self, *args, **kwargs): + new_args = [] + out_list = [] + + index = 0 + item = None + # The inputs of the cell. + for index, item in enumerate(args): + if self.data_mode == "backward" or self.data_mode == "all": + if ops.is_tensor(item): + item = self.output_clips[index](item) + if self.data_mode == "forward" or self.data_mode == "all": + if ops.is_tensor(item): + temp = td_in(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.INPUT, index), item) + item = ops.depend(item, temp) + new_args.append(item) + + out = func(*new_args, **kwargs) + + # The outputs of the cell. 
+ if isinstance(out, tuple): + for index, item in enumerate(out): + if self.data_mode == "backward" or self.data_mode == "all": + if ops.is_tensor(item): + item = self.input_clips[index](item) + if self.data_mode == "forward" or self.data_mode == "all": + if ops.is_tensor(item): + temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), item) + item = ops.depend(item, temp) + out_list.append(item) + else: + out_list.append(item) + out_list = tuple(out_list) + return out_list + else: + if self.data_mode == "backward" or self.data_mode == "all": + out = self.input_clips[0](out) + if self.data_mode == "forward" or self.data_mode == "all": + if ops.is_tensor(out): + temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), out) + out = ops.depend(out, temp) + return out + + return new_construct.__get__(self, type(self)) + + +# 获取目录下所有文件名并根据TensorDump落盘自增id从小到大排序 +def sort_filenames(path): + filenames = os.listdir(path) + id_pattern = re.compile(rf'{CoreConst.REPLACEMENT_CHARACTER}(\d+){CoreConst.NUMPY_SUFFIX}$') + filenames.sort(key=lambda x: int(id_pattern.findall(x)[0])) + return filenames + + +# 删除重复dump的文件:自定义文件名相同,并且数据相同 +def del_same_file(path, filenames): + result_list = [] + seen_prefixes = {} + for current_filename in filenames: + parts = current_filename.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1) + prefix = parts[0] + if prefix not in seen_prefixes: + result_list.append(current_filename) + seen_prefixes[prefix] = current_filename + else: + current_file_path = os.path.join(path, current_filename) + current_file = load_npy(current_file_path) + prev_filename = seen_prefixes[prefix] + prev_file_path = os.path.join(path, prev_filename) + prev_file = load_npy(prev_file_path) + if np.array_equal(current_file, prev_file): + remove_path(current_file_path) + logger.warning(f"{current_file_path} is deleted!") + else: + result_list.append(current_filename) + return 
result_list + + +def rename_filename(path): + filenames = sort_filenames(path) + filenames = del_same_file(path, filenames) + + filename_dict = {} + for filename in filenames: + name_field = filename.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1)[0] + + if name_field in filename_dict: + filename_dict[name_field] += 1 + else: + filename_dict[name_field] = 0 + + cell_index = filename_dict[name_field] + + # 修改文件名,增加重复调用Cell的序号 + if CoreConst.FORWARD_PATTERN in filename: + #Format: Cell.{cell_name}.{class_name}.{forward/backward}.{number}.{input/output}.{index}_{dtype}_{id}.npy + newFileName = filename.replace(CoreConst.FORWARD_PATTERN, CoreConst.FORWARD_PATTERN + str(cell_index) + CoreConst.SEP) + if CoreConst.BACKWARD_PATTERN in filename: + newFileName = filename.replace(CoreConst.BACKWARD_PATTERN, CoreConst.BACKWARD_PATTERN + str(cell_index) + CoreConst.SEP) + os.rename(os.path.join(path, filename), os.path.join(path, newFileName)) + logger.info(f"==========The rename_filename phase is Finished!==========") + + +# Extract the field between the first "." and the third to last ".", i.e. {cell_name} +def get_cell_name(str): + parts = str.split(CoreConst.SEP) + if len(parts) < 4: + return None + start_index = 1 + end_index = len(parts) - 3 + return CoreConst.SEP.join(parts[start_index:end_index]) + + +# Extract the field between the last "." and the second to last ".", i.e. 
{data_made} +def get_data_mode(str): + last_dot_index = str.rfind(CoreConst.SEP) + second_last_dot_index = str.rfind(CoreConst.SEP, 0, last_dot_index) + data_mode = str[second_last_dot_index + 1:last_dot_index] + return data_mode + + +# 判断二者之间是否存在父子关系 +def check_relation(cell_name, parent_cell_name): + layers_pattern = rf"{CoreConst.SEP}{KEY_LAYERS}{CoreConst.SEP}\d+$" + last_dot_index = cell_name.rfind(CoreConst.SEP) + if last_dot_index != -1: + # 如果cell_name最后一个'.'之前的字段等于parent_cell_name,则判定存在父子关系 + sub_cell_name = cell_name[:last_dot_index] + if sub_cell_name == parent_cell_name: + return True + elif re.search(layers_pattern, cell_name): + # 如果cell_name以".layer.{layer_id}"结尾,且去掉该字段后等于parent_cell_name,则判定存在父子关系 + sub_cell_name = re.sub(layers_pattern, '', cell_name) + if sub_cell_name == parent_cell_name: + return True + return False + + +def get_construct(cell_list_input): + for cell in cell_list_input: + cell_name = get_cell_name(cell) + cell_data_mode = get_data_mode(cell) + found_flag = False + for parent_cell in cell_list_input: + parent_cell_name = get_cell_name(parent_cell) + parent_data_mode = get_data_mode(parent_cell) + has_relation = check_relation(cell_name, parent_cell_name) + if has_relation and parent_data_mode == cell_data_mode: + construct.update({cell: parent_cell}) + found_flag = True + break + if not found_flag: + construct.update({cell: None}) + + +def generate_construct(path): + global construct + filenames = sort_filenames(path) + + # 提取文件名中Cell.{cell_name}.{class_name}.{data_mode}.{重复调用此cell的序号}字段,并存入cell_list + for filename in filenames: + point_position = 3 + mid_field = filename.rsplit(CoreConst.SEP, point_position)[0] + if CoreConst.INPUT in filename: + if mid_field in cell_list: + cell_list.remove(mid_field) + cell_list.append(mid_field) + else: + if mid_field not in cell_list: + index = filenames.index(filename) + output_field = mid_field + CoreConst.OUTPUT + find_flag = False + for filename_other in cell_list[index + 1:]: + if 
output_field in filename_other: + find_flag = True + if find_flag is False: + cell_list.append(mid_field) + + get_construct(cell_list) + + # 生成JSON文件 + rank_dir = os.path.dirname(path) + json_path = os.path.join(rank_dir, CONSTRUCT_FILE_NAME) + save_json(json_path, construct, indent=1) + + # 清空'construct'继续处理下一个路径下的数据 + construct = {} + logger.info(f"Construct data saved to {json_path}") + + +def process_file(file_path): + try: + # 读取.npy文件内容 + npy_content = load_npy(file_path) + logger.info(f"Loaded {file_path}: shape is {npy_content.shape}, dtype is {npy_content.dtype}") + + # 文件名举例:Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0_float32_165.npy + parts = os.path.basename(file_path).split(CoreConst.SEP) + data_dtype = "" + # 获取0_float32_165或者0_in_float32_165中的float32 + data_dtype_list = parts[-2].split('_') + if len(data_dtype_list) > 1: + data_dtype = data_dtype_list[-2] + # op_name是Cell.network._backbone.loss.CrossEntropyLoss.forward.0 + op_name = CoreConst.SEP.join(parts[:-3]) + ms_dtype = np_ms_dtype_dict.get(data_dtype) + if ms_dtype is None: + logger.warning(f"Get dtype None from file {file_path}") + tensor_json = { + CoreConst.TYPE: 'mindspore.Tensor', + CoreConst.DTYPE: str(ms_dtype), + CoreConst.SHAPE: list(npy_content.shape), + CoreConst.MAX: npy_content.max().item(), + CoreConst.MIN: npy_content.min().item(), + CoreConst.MEAN: npy_content.mean().item(), + CoreConst.NORM: np.linalg.norm(npy_content).item(), + CoreConst.DATA_NAME: os.path.basename(file_path) + } + + # 根据文件名的最后一个部分(输入或输出)确定是添加到input_args还是output + if parts[-3] == CoreConst.INPUT: + return op_name, CoreConst.INPUT_ARGS, tensor_json + elif parts[-3] == CoreConst.OUTPUT: + return op_name, CoreConst.OUTPUT, tensor_json + else: + return None, None, None + + except Exception as e: + logger.error(f"Error reading {file_path}: {e}") + return None, None, None + + +def custom_sort(item, key_to_index): + key = item[0] + return key_to_index.get(key, float('inf')) + + +def 
generate_dump_info(path): + if not os.path.exists(path): + logger.error("The provided path does not exist.") + return + + dump_data = {"task": "tensor", "level": "L0", "dump_data_dir": path, "data": {}} + + with Pool(processes=10) as pool: + file_paths = [] + for root, _, files in os.walk(path): + for file in files: + if file.endswith(FileCheckConst.NUMPY_SUFFIX): + file_paths.append((os.path.join(root, file),)) + file_paths.sort() + results = pool.starmap(process_file, file_paths) + + # 收集结果 + for op_name, key, tensor_json in results: + if op_name: + if op_name not in dump_data.get(CoreConst.DATA, {}): + dump_data.get(CoreConst.DATA, {})[op_name] = {CoreConst.INPUT_ARGS: [], + CoreConst.INPUT_KWARGS: {}, + CoreConst.OUTPUT: []} + if key not in dump_data.get(CoreConst.DATA, {}).get(op_name, {}): + dump_data.get(CoreConst.DATA, {}).get(op_name, {})[key] = [] + dump_data.get(CoreConst.DATA, {}).get(op_name, {}).get(key, []).append(tensor_json) + + # 根据cell_list排序 + data_dict = dump_data.get(CoreConst.DATA, {}) + key_to_index = {key: index for index, key in enumerate(cell_list)} + sorted_data_dict = dict(sorted(data_dict.items(), key=lambda item: custom_sort(item, key_to_index))) + dump_data[CoreConst.DATA] = sorted_data_dict + + # 将数据写入dump.json + json_path = os.path.join(os.path.dirname(path), 'dump.json') + save_json(json_path, dump_data, indent=1) + + logger.info(f"Dump data saved to {json_path}") + + +def generate_stack_info(path): + if not os.path.exists(path): + logger.error("The provided path does not exist.") + return + + stack_data = {} + file_paths = [] + # 传入的path为工具生成的./dump_tensor_data,内容为npy文件 + for root, _, files in os.walk(path): + for file in files: + if file.endswith(FileCheckConst.NUMPY_SUFFIX): + file_paths.append(os.path.join(root, file)) + file_paths.sort() + for file_path in file_paths: + # 文件名举例:Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0_float32_165.npy + parts = os.path.basename(file_path).split(CoreConst.SEP) + # 
op_name是Cell.network._backbone.loss.CrossEntropyLoss.forward.0 + op_name = CoreConst.SEP.join(parts[:-3]) + stack_data.update({op_name: []}) + + # 将数据写入stack.json + json_path = os.path.join(os.path.dirname(path), 'stack.json') + save_json(json_path, stack_data, indent=1) + + logger.info(f"Stack data saved to {json_path}") + + +def process(dump_path): + logger.info(f"==========Start processing data that has already been stored on the disk!==========") + rank_id = os.environ.get('RANK_ID') + rank_dir = DEFAULT_RANK_DIR + if rank_id is not None: + rank_dir = CoreConst.RANK + str(rank_id) + + step_dir_list = os.listdir(dump_path) + for step_dir in step_dir_list: + step_path = os.path.join(dump_path, step_dir) + rank_path = os.path.join(step_path, rank_dir) + npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA) + rename_filename(npy_path) + generate_construct(npy_path) + generate_dump_info(npy_path) + generate_stack_info(npy_path) + + +def start(net=None, dump_path="./", data_mode=CoreConst.ALL): + if net is None: + return + + black_list = ["grad_reducer", ""] + for name, cell in net.cells_and_names(): + class_name = cell.__class__.__name__ + # 跳过黑名单cell + if name in black_list: + logger.info(f"Cell {name}.{class_name} is skipped!") + continue + # 跳过框架内部的cell + if class_name.startswith(CoreConst.REPLACEMENT_CHARACTER): + logger.info(f"Cell {name}.{class_name} is skipped!") + continue + else: + #Format: Cell.{cell_name}.{class_name} + cell.cell_prefix = CoreConst.SEP.join([CoreConst.CELL, name, cell.__class__.__name__]) + + cell.construct = cell_construct_wrapper(cell.construct, cell) + logger.info(f"Cell {name}: construct function is wrapped!") + cell.dump_path = dump_path + cell.data_mode = data_mode + cell.input_clips = [] + cell.output_clips = [] + # It is assumed that each cell has a maximum of 50 outputs and 50 inputs. 
+ for i in range(50): + cell.input_clips.append(ops.InsertGradientOf(partial_func(clip_gradient, cell.dump_path, cell.cell_prefix, i, CoreConst.INPUT))) + cell.output_clips.append(ops.InsertGradientOf(partial_func(clip_gradient, cell.dump_path, cell.cell_prefix, i, CoreConst.OUTPUT))) + + logger.info(f"==========The cell_dump_process_start phase is Finished!==========") + atexit.register(process, dump_path=dump_path) diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py index 0ca63b4a84..c0933d20aa 100644 --- a/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py +++ b/debug/accuracy_tools/msprobe/mindspore/dump/dump_tool_factory.py @@ -17,13 +17,14 @@ from msprobe.mindspore.common.const import Const from msprobe.mindspore.debugger.debugger_config import DebuggerConfig from msprobe.mindspore.dump.kernel_graph_dump import KernelGraphDump from msprobe.mindspore.dump.kernel_kbyk_dump import KernelKbykDump +from msprobe.mindspore.dump.graph_mode_cell_dump import GraphModeCellDump class DumpToolFactory: tools = { Const.CELL: { - Const.GRAPH_KBYK_MODE: None, - Const.GRAPH_GE_MODE: None, + Const.GRAPH_KBYK_MODE: GraphModeCellDump, + Const.GRAPH_GE_MODE: GraphModeCellDump, Const.PYNATIVE_MODE: None }, Const.API: { @@ -39,9 +40,13 @@ class DumpToolFactory: } @staticmethod - def create(config: DebuggerConfig): - if len(config.data_mode) != 1 or config.data_mode[0] not in Const.GRAPH_DATA_MODE_LIST: - raise Exception("data_mode must be one of all, input, output.") + def create(config: DebuggerConfig, model): + if config.level == Const.CELL: + if len(config.data_mode) != 1 or config.data_mode[0] not in Const.GRAPH_CELL_DUMP_DATA_MODE_LIST: + raise Exception("data_mode must be one of all, forward, backward.") + else: + if len(config.data_mode) != 1 or config.data_mode[0] not in Const.GRAPH_DATA_MODE_LIST: + raise Exception("data_mode must be one of all, input, output.") tool = 
DumpToolFactory.tools.get(config.level) if not tool: raise Exception("Valid level is needed.") @@ -49,4 +54,4 @@ class DumpToolFactory: if not tool: raise Exception(f"Data dump is not supported in {config.execution_mode} mode " f"when dump level is {config.level}.") - return tool(config) + return tool(config, model) if tool == GraphModeCellDump else tool(config) diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/graph_mode_cell_dump.py b/debug/accuracy_tools/msprobe/mindspore/dump/graph_mode_cell_dump.py new file mode 100644 index 0000000000..e32866868f --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/dump/graph_mode_cell_dump.py @@ -0,0 +1,69 @@ +# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from msprobe.mindspore.common.log import logger +from msprobe.mindspore.debugger.debugger_config import DebuggerConfig +import mindspore as ms +from mindspore._c_expression import _tensordump_set_step +from mindspore.ops.primitive import _run_op +from mindspore import hal, ops +import msprobe.mindspore.dump.cell_dump_process as cellDumper +from msprobe.mindspore.common.const import Const + + +class GraphModeCellDump: + def __init__(self, config: DebuggerConfig, model): + self.net = model + self.white_list = [] + self.black_list = [] + self.dump_path = config.dump_path if config.dump_path else "./" + self.rank = config.rank + self.step = config.step + self.scope = config.scope + self.list = config.list + self.data_mode = config.data_mode + self.file_format = config.file_format + self.check_config() + self.set_step() + + @staticmethod + def step(): + hal.synchronize() + temp_tensor = ms.Tensor([1], dtype=ms.float32) + step_flag = "" + _run_op(ops.TensorDump(), "TensorDump", (step_flag, temp_tensor)) + ops.tensordump(step_flag, temp_tensor) + + def check_config(self): + if self.rank != []: + raise Exception("In graph mode, cell dump does not currently support specifying rank.") + if self.scope != []: + raise Exception("In graph mode, cell dump does not currently support specifying scope.") + if self.list != []: + raise Exception("In graph mode, cell dump does not currently support specifying list.") + if len(self.data_mode) != 1 or self.data_mode[0] not in Const.GRAPH_CELL_DUMP_DATA_MODE_LIST: + raise Exception("In graph mode and cell dump, data_mode must be one of all, forward, backward.") + if self.file_format != []: + logger.warning("In graph mode, cell dump does not currently support specifying file_format. 
The file will be stored in npy format.") + if not self.net: + raise Exception("The model is empty and cell dump is not enabled.") + return True + + def set_step(self): + _tensordump_set_step(self.step) + + def handle(self): + os.environ['MS_JIT_MODULES'] = 'msprobe' + cellDumper.start(net=self.net, dump_path=self.dump_path, data_mode=self.data_mode[0]) diff --git a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py index a9cb5e6dd4..5cfbbaeb4a 100644 --- a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py +++ b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py @@ -29,11 +29,14 @@ class TaskHandlerFactory: } @staticmethod - def create(config: DebuggerConfig): + def create(config: DebuggerConfig, model): task = TaskHandlerFactory.tasks.get(config.task) if not task: raise Exception("Valid task is needed.") - handler = task.create(config) + if task == DumpToolFactory: + handler = task.create(config, model) + else: + handler = task.create(config) if not handler: raise Exception("Can not find task handler") return handler diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/debugger/test_graph_cell_dump.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/debugger/test_graph_cell_dump.py new file mode 100644 index 0000000000..b111e64437 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/debugger/test_graph_cell_dump.py @@ -0,0 +1,309 @@ +# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import re +import unittest +from unittest.mock import MagicMock, patch + +import mindspore as ms +from mindspore import ops + +from msprobe.core.common.const import Const as CoreConst +from msprobe.mindspore.dump.cell_dump_process import generate_file_path +from msprobe.mindspore.dump.cell_dump_process import partial_func, clip_gradient +from msprobe.mindspore.dump.cell_dump_process import cell_construct_wrapper +from msprobe.mindspore.dump.cell_dump_process import rename_filename, sort_filenames, del_same_file +from msprobe.mindspore.dump.cell_dump_process import check_relation + + +class TestGenerateFilePath(unittest.TestCase): + def setUp(self): + self.dump_path = "/path" + self.cell_prefix = "Cell.network._backbone.LlamaForCausalLM" + self.suffix = "forward" + self.io_type = "input" + self.index = 0 + + def test_generate_file_path(self): + expected_path = os.path.join( + self.dump_path, + "{step}", + "{rank}", + CoreConst.DUMP_TENSOR_DATA, + CoreConst.SEP.join([self.cell_prefix, self.suffix, self.io_type, str(self.index)]) + ) + result = generate_file_path(self.dump_path, self.cell_prefix, self.suffix, self.io_type, self.index) + self.assertEqual(result, expected_path) + + +class TestPartialFunc(unittest.TestCase): + + @patch('msprobe.mindspore.dump.cell_dump_process.CoreConst') + @patch('msprobe.mindspore.dump.cell_dump_process.td') + @patch('msprobe.mindspore.dump.cell_dump_process.td_in') + @patch('msprobe.mindspore.dump.cell_dump_process.generate_file_path') + @patch('msprobe.mindspore.dump.cell_dump_process.ops.depend') + def 
test_clip_gradient_output(self, mock_depend, mock_generate_file_path, mock_td_in, mock_td, mock_CoreConst): + mock_CoreConst.OUTPUT = "output" + mock_CoreConst.BACKWARD = "backward" + mock_generate_file_path.return_value = "mock_path" + mock_td.return_value = "temp_tensor" + mock_depend.return_value = "dependent_tensor" + + result = clip_gradient("dump_path", "cell_prefix", 0, "output", "dx") + + mock_generate_file_path.assert_called_with("dump_path", "cell_prefix", "backward", "output", 0) + mock_td.assert_called_with("mock_path", "dx") + mock_depend.assert_called_with("dx", "temp_tensor") + self.assertEqual(result, "dependent_tensor") + + @patch('msprobe.mindspore.dump.cell_dump_process.CoreConst') + @patch('msprobe.mindspore.dump.cell_dump_process.td') + @patch('msprobe.mindspore.dump.cell_dump_process.td_in') + @patch('msprobe.mindspore.dump.cell_dump_process.generate_file_path') + @patch('msprobe.mindspore.dump.cell_dump_process.ops.depend') + def test_clip_gradient_input(self, mock_depend, mock_generate_file_path, mock_td_in, mock_td, mock_CoreConst): + mock_CoreConst.INPUT = "input" + mock_CoreConst.BACKWARD = "backward" + mock_generate_file_path.return_value = "mock_path" + mock_td_in.return_value = "temp_tensor" + mock_depend.return_value = "dependent_tensor" + + result = clip_gradient("dump_path", "cell_prefix", 0, "input", "dx") + + mock_generate_file_path.assert_called_with("dump_path", "cell_prefix", "backward", "input", 0) + mock_td_in.assert_called_with("mock_path", "dx") + mock_depend.assert_called_with("dx", "temp_tensor") + self.assertEqual(result, "dependent_tensor") + + def test_partial_func(self): + def mock_func(dump_path, cell_prefix, index, io_type, *args, **kwargs): + return dump_path, cell_prefix, index, io_type, args, kwargs + + new_func = partial_func(mock_func, "dump_path", "cell_prefix", 0, "io_type") + result = new_func("arg1", "arg2", kwarg1="value1") + + self.assertEqual(result, ("dump_path", "cell_prefix", 0, "io_type", ("arg1", 
"arg2"), {'kwarg1': 'value1'})) + + +class TestCellWrapperProcess(unittest.TestCase): + + @patch('msprobe.mindspore.dump.cell_dump_process.generate_file_path') + @patch('msprobe.mindspore.dump.cell_dump_process.td') + @patch('msprobe.mindspore.dump.cell_dump_process.td_in') + def test_cell_construct_wrapper(self, mock_td_in, mock_td, mock_generate_file_path): + # Mock the generate_file_path function + mock_generate_file_path.return_value = "mock_path" + + # Mock the TensorDump operations + mock_td.return_value = MagicMock() + mock_td_in.return_value = MagicMock() + + # Create a mock cell with necessary attributes + mock_cell = MagicMock() + mock_cell.data_mode = "all" + mock_cell.dump_path = "mock_dump_path" + mock_cell.cell_prefix = "mock_cell_prefix" + mock_cell.input_clips = [MagicMock() for _ in range(50)] + mock_cell.output_clips = [MagicMock() for _ in range(50)] + + # Define a mock function to wrap + def mock_func(*args, **kwargs): + return args + + # Wrap the mock function using cell_construct_wrapper + wrapped_func = cell_construct_wrapper(mock_func, mock_cell) + + # Create mock inputs + mock_input = ms.Tensor([1, 2, 3]) + mock_args = (mock_input,) + + # Call the wrapped function + result = wrapped_func(mock_cell, *mock_args) + + # Check if the result is as expected + self.assertEqual(result, mock_args) + + # Verify that the TensorDump operations were called + mock_td_in.assert_called() + mock_td.assert_called() + + @patch('msprobe.mindspore.dump.cell_dump_process.generate_file_path') + @patch('msprobe.mindspore.dump.cell_dump_process.td') + @patch('msprobe.mindspore.dump.cell_dump_process.td_in') + def test_cell_construct_wrapper_with_tuple_output(self, mock_td_in, mock_td, mock_generate_file_path): + # Mock the generate_file_path function + mock_generate_file_path.return_value = "mock_path" + + # Mock the TensorDump operations + mock_td.return_value = MagicMock() + mock_td_in.return_value = MagicMock() + + # Create a mock cell with necessary attributes + 
mock_cell = MagicMock() + mock_cell.data_mode = "all" + mock_cell.dump_path = "mock_dump_path" + mock_cell.cell_prefix = "mock_cell_prefix" + mock_cell.input_clips = [MagicMock() for _ in range(50)] + mock_cell.output_clips = [MagicMock() for _ in range(50)] + + # Define a mock function to wrap + def mock_func(*args, **kwargs): + return (args[0], args[0]) + + # Wrap the mock function using cell_construct_wrapper + wrapped_func = cell_construct_wrapper(mock_func, mock_cell) + + # Create mock inputs + mock_input = ms.Tensor([1, 2, 3]) + mock_args = (mock_input,) + + # Call the wrapped function + result = wrapped_func(mock_cell, *mock_args) + + # Check if the result is as expected + self.assertEqual(result, (mock_input, mock_input)) + + # Verify that the TensorDump operations were called + mock_td_in.assert_called() + mock_td.assert_called() + + +class TestSortFilenames(unittest.TestCase): + + @patch('os.listdir') + def test_sort_filenames(self, mock_listdir): + # Mock the list of filenames returned by os.listdir + mock_listdir.return_value = [ + 'Cell.network._backbone.model.LlamaModel.backward.0.input.0_float16_177.npy', + 'Cell.network._backbone.model.LlamaModel.forward.0.input.0_in_int32_1.npy', + 'Cell.network._backbone.model.LlamaModel.forward.0.output.10_float16_165.npy', + 'Cell.network._backbone.model.norm_out.LlamaRMSNorm.backward.0.input.0_float16_178.npy' + ] + + # Mock the CoreConst values + CoreConst.REPLACEMENT_CHARACTER = '_' + CoreConst.NUMPY_SUFFIX = '.npy' + + # Expected sorted filenames + expected_sorted_filenames = [ + 'Cell.network._backbone.model.LlamaModel.forward.0.input.0_in_int32_1.npy', + 'Cell.network._backbone.model.LlamaModel.forward.0.output.10_float16_165.npy', + 'Cell.network._backbone.model.LlamaModel.backward.0.input.0_float16_177.npy', + 'Cell.network._backbone.model.norm_out.LlamaRMSNorm.backward.0.input.0_float16_178.npy' + ] + + # Call the function + sorted_filenames = sort_filenames('/mock/path') + + # Assert the filenames are 
sorted correctly + self.assertEqual(sorted_filenames, expected_sorted_filenames) + + +class TestRenameFilename(unittest.TestCase): + + @patch('msprobe.mindspore.dump.cell_dump_process.sort_filenames') + @patch('msprobe.mindspore.dump.cell_dump_process.del_same_file') + @patch('msprobe.mindspore.dump.cell_dump_process.os.rename') + def test_rename_filename(self, mock_rename, mock_del_same_file, mock_sort_filenames): + # Mock the constants + CoreConst.REPLACEMENT_CHARACTER = '_' + CoreConst.FORWARD_PATTERN = '.forward.' + CoreConst.BACKWARD_PATTERN = '.backward.' + CoreConst.SEP = '.' + + # Mock the filenames + mock_sort_filenames.return_value = [ + "Cell.learning_rate.CosineWithWarmUpLR.forward.input.0_int32_101.npy", + "Cell.learning_rate.CosineWithWarmUpLR.forward.output.0_float32_102.npy", + "Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.input.0_float32_103.npy", + "Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.input.1_bool_104.npy", + "Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.output.1_bool_105.npy", + "Cell.learning_rate.CosineWithWarmUpLR.forward.input.0_int32_111.npy", + "Cell.learning_rate.CosineWithWarmUpLR.forward.output.0_float32_112.npy", + ] + mock_del_same_file.return_value = [mock_sort_filenames.return_value] + + # Call the function + rename_filename('/mock/path') + + # Check if os.rename was called with the correct arguments + mock_rename.assert_any_call( + '/mock/path/Cell.learning_rate.CosineWithWarmUpLR.forward.input.0_int32_101.npy', + '/mock/path/Cell_learning_rate_CosineWithWarmUpLR.forward.0.input_0_int32_101.npy' + ) + mock_rename.assert_any_call( + '/mock/path/Cell.learning_rate.CosineWithWarmUpLR.forward.output.0_float32_102.npy', + '/mock/path/Cell_learning_rate_CosineWithWarmUpLR.forward.0.output_0_float32_102.npy' + ) + mock_rename.assert_any_call( + '/mock/path/Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.input.0_float32_103.npy', + 
'/mock/path/Cell_loss_scaling_manager_DynamicLossScaleUpdateCell.backward.0.input_0_float32_103.npy' + ) + mock_rename.assert_any_call( + '/mock/path/Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.input.1_bool_104.npy', + '/mock/path/Cell_loss_scaling_manager_DynamicLossScaleUpdateCell.backward.0.input_1_bool_104.npy' + ) + mock_rename.assert_any_call( + '/mock/path/Cell.loss_scaling_manager.DynamicLossScaleUpdateCell.backward.output.1_bool_105.npy', + '/mock/path/Cell_loss_scaling_manager_DynamicLossScaleUpdateCell.backward.0.output_1_bool_105.npy' + ) + mock_rename.assert_any_call( + '/mock/path/Cell.learning_rate.CosineWithWarmUpLR.forward.input.0_int32_111.npy', + '/mock/path/Cell_learning_rate_CosineWithWarmUpLR.forward.1.input_0_int32_111.npy' + ) + mock_rename.assert_any_call( + '/mock/path/Cell.learning_rate.CosineWithWarmUpLR.forward.output.0_float32_112.npy', + '/mock/path/Cell_learning_rate_CosineWithWarmUpLR.forward.1.output_0_float32_112.npy' + ) + + # Mock the filenames + mock_sort_filenames.return_value = [] + mock_del_same_file.return_value = [] + + # Call the function + rename_filename('/mock/path') + + # Check if os.rename was not called + mock_rename.assert_not_called() + + +class TestCheckRelation(unittest.TestCase): + + def setUp(self): + CoreConst.SEP = '.' 
+ global KEY_LAYERS + KEY_LAYERS = "layers" + + def test_direct_parent_child_relation(self): + self.assertTrue(check_relation("network._backbone", "network")) + self.assertTrue(check_relation("network._backbone.model", "network._backbone")) + + def test_no_relation(self): + self.assertFalse(check_relation("network._backbone", "network.loss")) + self.assertFalse(check_relation("network._backbone.model", "network.loss")) + + def test_layer_pattern_relation(self): + self.assertTrue(check_relation("network.model.layers.0", "network.model")) + self.assertTrue(check_relation("network._backbone.model.layers.1", "network._backbone.model")) + + def test_no_layer_pattern_relation(self): + self.assertFalse(check_relation("network.model.layers.0", "network.loss")) + self.assertFalse(check_relation("network._backbone.model.layers.1", "network._backbone.model.layers")) + + def test_edge_cases(self): + self.assertFalse(check_relation("", "network")) + self.assertFalse(check_relation("network.layer1", "")) + self.assertFalse(check_relation("", "")) -- Gitee From 3ac2ba51aefc8800092cb5f4b064f5a31389ba73 Mon Sep 17 00:00:00 2001 From: fuchao <1501312275@qq.com> Date: Mon, 17 Feb 2025 18:18:33 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BC=98=E5=8C=96=E9=9D=99=E6=80=81?= =?UTF-8?q?=E5=9B=BEcell=E7=BA=A7dump=E5=A4=84=E7=90=86=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../mindspore/dump/cell_dump_process.py | 50 +++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py index a21c4590b8..a9121e1435 100644 --- a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py +++ b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py @@ -133,7 +133,7 @@ def cell_construct_wrapper(func, self): out = self.input_clips[0](out) if self.data_mode == "forward" 
or self.data_mode == "all": if ops.is_tensor(out): - temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), out) + temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, 0), out) out = ops.depend(out, temp) return out @@ -302,6 +302,21 @@ def process_file(file_path): ms_dtype = np_ms_dtype_dict.get(data_dtype) if ms_dtype is None: logger.warning(f"Get dtype None from file {file_path}") + + #修改落盘文件名字,去掉TensorDump自带的数据类型和自增id字段 + data_file_name = os.path.basename(file_path) + data_file_dir = os.path.dirname(file_path) + parts = data_file_name.split(CoreConst.SEP) + if len(parts) >= 2: + param_index = parts[-2].split(CoreConst.REPLACEMENT_CHARACTER)[0] + pre_parts = CoreConst.SEP.join(parts[:-2]) + new_file_name = pre_parts + CoreConst.SEP + param_index + CoreConst.NUMPY_SUFFIX + os.rename(os.path.join(data_file_dir, data_file_name), os.path.join(data_file_dir, new_file_name)) + logger.info(f"{data_file_name} is renamed to {new_file_name}") + else: + logger.warning(f"Failed to rename {data_file_name}.") + new_file_name = data_file_name + tensor_json = { CoreConst.TYPE: 'mindspore.Tensor', CoreConst.DTYPE: str(ms_dtype), @@ -310,7 +325,7 @@ def process_file(file_path): CoreConst.MIN: npy_content.min().item(), CoreConst.MEAN: npy_content.mean().item(), CoreConst.NORM: np.linalg.norm(npy_content).item(), - CoreConst.DATA_NAME: os.path.basename(file_path) + CoreConst.DATA_NAME: new_file_name } # 根据文件名的最后一个部分(输入或输出)确定是添加到input_args还是output @@ -398,8 +413,28 @@ def generate_stack_info(path): logger.info(f"Stack data saved to {json_path}") +def is_download_finished(directory, interval=3): + """ + 判断指定目录在一段时间后是否有数据被下载完成 + :param directory: 指定目录的路径 + :param interval: 检查的时间间隔(秒),默认为 3 秒 + :return: 如有数据被下载完成返回 True,否则返回 False + """ + # 检查目录是否存在 + if not os.path.exists(directory): + logger.warning(f"The specified directory {directory} does not exist.") + return False + 
initial_modification_time = os.path.getmtime(directory) + time.sleep(interval) + current_modification_time = os.path.getmtime(directory) + # 比较初始和当前修改时间 + if current_modification_time > initial_modification_time: + return False + else: + return True + + def process(dump_path): - logger.info(f"==========Start processing data that has already been stored on the disk!==========") rank_id = os.environ.get('RANK_ID') rank_dir = DEFAULT_RANK_DIR if rank_id is not None: @@ -410,10 +445,19 @@ def process(dump_path): step_path = os.path.join(dump_path, step_dir) rank_path = os.path.join(step_path, rank_dir) npy_path = os.path.join(rank_path, CoreConst.DUMP_TENSOR_DATA) + while True: + is_finished = is_download_finished(npy_path) + if not is_finished: + logger.info(f"There is data being downloaded in the specified directory, continue checking...") + else: + logger.info(f"There is no data being downloaded in the specified directory, Stop checking.") + break + logger.info(f"==========Start processing data that has already been stored on the disk!==========") rename_filename(npy_path) generate_construct(npy_path) generate_dump_info(npy_path) generate_stack_info(npy_path) + logger.info(f"==========JSON file generation completed!==========") def start(net=None, dump_path="./", data_mode=CoreConst.ALL): -- Gitee