From 245cc7562bbc4d1a966be69005114718da9b3128 Mon Sep 17 00:00:00 2001 From: qqqhhhbbb Date: Sat, 26 Jul 2025 10:32:36 +0800 Subject: [PATCH] add exception dump --- .../msprobe/core/common/const.py | 3 +- .../msprobe/docs/02.config_introduction.md | 11 +++- .../msprobe/docs/03.config_examples.md | 24 ++++++++ .../msprobe/docs/06.data_dump_MindSpore.md | 2 +- .../mindspore/debugger/precision_debugger.py | 2 + .../mindspore/exception_dump/__init__.py | 0 .../exception_dump_tool_factory.py | 51 +++++++++++++++++ .../kernel_graph_exception_dump.py | 57 +++++++++++++++++++ .../msprobe/mindspore/ms_config.py | 9 ++- .../overflow_check_tool_factory.py | 2 +- .../msprobe/mindspore/task_handler_factory.py | 4 +- 11 files changed, 159 insertions(+), 6 deletions(-) create mode 100644 debug/accuracy_tools/msprobe/mindspore/exception_dump/__init__.py create mode 100644 debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py create mode 100644 debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index 5a5c779ff..45901df3f 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -115,9 +115,10 @@ class Const: RUN_UT = "run_ut" GRAD_PROBE = "grad_probe" STRUCTURE = "structure" + EXCEPTION_DUMP = "exception_dump" DUMP_PRECISION_HIGH = "high" DUMP_PRECISION_LOW = "low" - TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE] + TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE, EXCEPTION_DUMP] DUMP_DATA_COLLECTION_LIST = [STATISTICS, TENSOR, STRUCTURE] DUMP_DATA_MODE_LIST = [ALL, INPUT, OUTPUT, FORWARD, BACKWARD] DUMP_PRECISION_LIST = [DUMP_PRECISION_LOW, DUMP_PRECISION_HIGH] diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md 
b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md index e4b270498..bd476d8fa 100644 --- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md +++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md @@ -12,7 +12,7 @@ | 参数 | 解释 | 是否必选 | |-------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | -| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对,不支持 MSAdapter 场景;
"grad_probe":梯度监控, 不支持 MSAdapter 场景;
"structure":仅采集模型结构以及调用栈信息,不采集具体数据。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe),
[1.8 task 配置为 structure](#18-task-配置为-structure)。
**配置示例**:"task": "tensor"。 | 否 | +| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对,不支持 MSAdapter 场景;
"grad_probe":梯度监控,不支持 MSAdapter 场景;
"structure":仅采集模型结构以及调用栈信息,不采集具体数据;
"exception_dump":异常 dump,本文档目前仅提供 MindSpore 场景的配置说明与示例。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe),
[1.8 task 配置为 structure](#18-task-配置为-structure),
[1.9 task 配置为 exception_dump](#19-task-配置为-exception_dump)。
**配置示例**:"task": "tensor"。 | 否 | | dump_path | 设置 dump 数据目录路径,str 类型。
**配置示例**:"dump_path": "./dump_path"。 | 是 | | rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型,默认未配置(表示采集所有卡的数据),应配置元素为 ≥0 的整数或类似"4-6"的字符串,且须配置实际可用的 Rank ID。
PyTorch 场景: Rank ID 从 0 开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的 Rank ID,则 dump 数据为空,比如当前环境 Rank ID 为 0 到 7,实际训练运行 0 到 3 卡,此时若配置 Rank ID 为 4 或不存在的 10 等其他值,dump 数据为空。
MindSpore 场景:所有节点的 Rank ID 均从 0 开始计数,最大取值为每个节点可用卡总数-1,config.json 配置一次 rank 参数对所有节点同时生效。静态图 L0 级别 dump 暂不支持指定rank。
注意,单卡训练时,rank必须为[],即空列表,不能指定rank。
**配置示例**:"rank": [1, "4-6"]。 | 否 | | step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。
**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 | @@ -210,3 +210,12 @@ structure 模式仅采集模型结构,无其他特殊配置。 - [PyTorch场景](03.config_examples.md#16-task-配置为-structure) - [MindSpore动态图场景](03.config_examples.md#35-task-配置为-structure) +### 1.9 task 配置为 exception_dump +MindSpore 动态图场景下,"level"须为"L2"; MindSpore 静态图场景下,"level"须为"L2",且模型编译优化等级(jit_level)须为"O0"或"O1"。 + +在运行过程中会在指定目录下生成kernel_graph_exception_dump.json的中间文件,该文件包含异常dump的相关设置。 +除中间文件外的其他 dump 结果文件请参见 MindSpore 官方文档中的[ Ascend 下 O0/O1 模式 Dump 数据对象目录和数据文件介绍](https://www.mindspore.cn/docs/zh-CN/r2.5.0/model_train/debug/dump.html#%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95%E5%92%8C%E6%95%B0%E6%8D%AE%E6%96%87%E4%BB%B6%E4%BB%8B%E7%BB%8D) + +**示例**: + - [MindSpore动态图场景](03.config_examples.md#36-task-配置为-exception_dump) + - [MindSpore静态图场景](03.config_examples.md#24-task-配置为-exception_dump) \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/docs/03.config_examples.md b/debug/accuracy_tools/msprobe/docs/03.config_examples.md index fe4112254..6350099be 100644 --- a/debug/accuracy_tools/msprobe/docs/03.config_examples.md +++ b/debug/accuracy_tools/msprobe/docs/03.config_examples.md @@ -168,6 +168,18 @@ } ``` +### 2.4 task 配置为 exception_dump + +```json +{ + "task": "exception_dump", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L2" +} +``` + ## 3 MindSpore 动态图场景 ### 3.1 task 配置为 statistics @@ -255,3 +267,15 @@ "level": "mix" } ``` + +### 3.6 task 配置为 exception_dump + +```json +{ + "task": "exception_dump", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L2" +} +``` diff --git a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md index 838bd130a..6a51beac6 100644 --- a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md @@ -508,7 +508,7 @@ L2 级别 dump 的目录结构如下所示: 2. 
若原始文件全名长度超过255个字符,则文件基础名会被转换为长度为32位的随机数字字符串,原始文件名与转换后文件名的对应关系会保存在同目录下的`mapping.csv`文件中。 3. acl_dump_{device_id}.json 为在 Dump 接口调用过程中生成的中间文件,一般情况下无需关注。 -其他场景下,除 kernel_kbyk_dump.json(jit_level=O0/O1)、kernel_graph_dump.json(jit_level=O2)等无需关注的中间文件外的其他 dump 结果文件请参见 MindSpore 官方文档中的[ Ascend 下 O0/O1 模式 Dump 数据对象目录和数据文件介绍](https://www.mindspore.cn/docs/zh-CN/r2.5.0/model_train/debug/dump.html#%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95%E5%92%8C%E6%95%B0%E6%8D%AE%E6%96%87%E4%BB%B6%E4%BB%8B%E7%BB%8D)与[ Ascend 下 O2 模式 Dump 数据对象目录和数据文件介绍](https://www.mindspore.cn/docs/zh-CN/r2.5.0/model_train/debug/dump.html#%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95%E5%92%8C%E6%95%B0%E6%8D%AE%E6%96%87%E4%BB%B6%E4%BB%8B%E7%BB%8D-1)。 +其他场景下,除 kernel_kbyk_dump.json(jit_level=O0/O1)、kernel_graph_dump.json(jit_level=O2)等无需关注的中间文件外的其他 dump 结果文件请参见 MindSpore 官方文档中的[ Ascend 下 O0/O1 模式 Dump 数据对象目录和数据文件介绍](https://www.mindspore.cn/docs/zh-CN/r2.5.0/model_train/debug/dump.html#%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95%E5%92%8C%E6%95%B0%E6%8D%AE%E6%96%87%E4%BB%B6%E4%BB%8B%E7%BB%8D) ### 8.2 动态图场景 dump 结果目录结构示例如下: diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py index eca71a475..99aeeba89 100644 --- a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py +++ b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py @@ -43,6 +43,7 @@ from msprobe.mindspore.task_handler_factory import TaskHandlerFactory try: from mindspore._c_expression import _dump_start, _dump_stop, _dump_step, _set_init_iter, _dump_set_dynamic + import mindspore as ms except ImportError: enable_dynamic_kbyk_dump = False else: @@ -166,6 +167,7 @@ class PrecisionDebugger(BasePrecisionDebugger): else: Runtime.is_running = False if enable_dynamic_kbyk_dump and instance.config.level_ori == Const.LEVEL_L2: + ms.runtime.synchronize() _dump_stop() if cls._is_kernel_dump() and 
_msprobe_c: _msprobe_c._PrecisionDebugger().stop() diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_dump/__init__.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/__init__.py new file mode 100644 index 000000000..e69de29bb
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py
new file mode 100644
index 000000000..db55811de
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from msprobe.core.common.log import logger
+from msprobe.mindspore.common.const import Const
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.mindspore.exception_dump.kernel_graph_exception_dump import KernelGraphExceptionDump
+
+
+class ExceptionDumpToolFactory:
+    """Map a dump level and execution mode to the exception-dump handler class."""
+
+    # level -> execution mode -> handler class; None marks an unsupported combination.
+    tools = {
+        Const.CELL: {
+            Const.GRAPH_KBYK_MODE: None,
+            Const.GRAPH_GE_MODE: None,
+            Const.PYNATIVE_MODE: None
+        },
+        Const.API: {
+            Const.GRAPH_KBYK_MODE: None,
+            Const.GRAPH_GE_MODE: None,
+            Const.PYNATIVE_MODE: None
+        },
+        Const.KERNEL: {
+            Const.GRAPH_KBYK_MODE: KernelGraphExceptionDump,
+            Const.GRAPH_GE_MODE: None,
+            Const.PYNATIVE_MODE: KernelGraphExceptionDump
+        }
+    }
+
+    @staticmethod
+    def create(config: DebuggerConfig):
+        """Instantiate the handler registered for ``config.level`` and ``config.execution_mode``.
+
+        Args:
+            config: debugger configuration supplying ``level`` and ``execution_mode``.
+
+        Returns:
+            A one-element tuple containing the handler built from ``config``.
+
+        Raises:
+            Exception: ``config.level`` has no (non-empty) entry in ``tools``.
+            ValueError: no handler is registered for the execution mode at that level.
+        """
+        mode_to_tool = ExceptionDumpToolFactory.tools.get(config.level)
+        if not mode_to_tool:
+            raise Exception("Valid level is needed.")
+        tool_cls = mode_to_tool.get(config.execution_mode)
+        if not tool_cls:
+            # Raise with the same text that is logged so the traceback explains the failure.
+            error_msg = (f"Exception dump is not supported in {config.execution_mode} mode "
+                         f"when level is {config.level}.")
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+        return (tool_cls(config),)
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py new file mode 100644 index 000000000..d9c4e6f72 --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from msprobe.core.common.file_utils import create_directory, save_json
+from msprobe.mindspore.common.log import logger
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+
+
+class KernelGraphExceptionDump:
+    """Generate the MindSpore dump config file used by the exception_dump task."""
+
+    def __init__(self, config: DebuggerConfig):
+        """Build the dump settings from ``config``; ``step`` is not applied (iteration stays "all")."""
+        common = {
+            "dump_mode": 0,
+            "path": config.dump_path,
+            "net_name": "Net",
+            "iteration": "all",
+            "saved_data": "tensor",
+            "input_output": 0,
+            "kernels": [],
+            # A non-empty rank list replaces the default device ids 0-7; None/[] keep the default.
+            "support_device": config.rank or [0, 1, 2, 3, 4, 5, 6, 7],
+            "op_debug_mode": 4,
+            "file_format": "npy",
+        }
+        e2e = {"enable": not config.async_dump, "trans_flag": True}
+        if config.stat_cal_mode and config.device_stat_precision_mode:
+            e2e["stat_calc_mode"] = config.stat_cal_mode
+            e2e["device_stat_precision_mode"] = config.device_stat_precision_mode
+        if config.step:
+            logger.warning("Step would change to all in this task.")
+        self.dump_json = {"common_dump_settings": common, "e2e_dump_settings": e2e}
+
+    def handle(self):
+        """Save the settings as JSON under the dump path and export MINDSPORE_DUMP_CONFIG."""
+        dump_dir = self.dump_json["common_dump_settings"]["path"]
+        create_directory(dump_dir)
+        # File name matches the one documented in 02.config_introduction.md (section 1.9).
+        json_path = os.path.join(dump_dir, "kernel_graph_exception_dump.json")
+        save_json(json_path, self.dump_json, indent=4)
+        logger.info(json_path + " has been created.")
+        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
diff --git a/debug/accuracy_tools/msprobe/mindspore/ms_config.py b/debug/accuracy_tools/msprobe/mindspore/ms_config.py index 2c27b05aa..ab400771f 100644 --- a/debug/accuracy_tools/msprobe/mindspore/ms_config.py +++ b/debug/accuracy_tools/msprobe/mindspore/ms_config.py @@ -79,6 +79,12 @@ class OverflowCheckConfig(BaseConfig): raise Exception("check_mode is invalid") +class ExceptionDumpConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.data_mode = ["all"] + + class FreeBenchmarkConfig(BaseConfig): def __init__(self, task_config): super().__init__(task_config) @@ -128,7 +134,8 @@ TaskDict = { Const.OVERFLOW_CHECK: OverflowCheckConfig, Const.FREE_BENCHMARK: FreeBenchmarkConfig, Const.GRAD_PROBE: GradProbeConfig, - Const.STRUCTURE: StructureConfig + Const.STRUCTURE: StructureConfig, + Const.EXCEPTION_DUMP: ExceptionDumpConfig } diff --git a/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py index 0b3ed6221..bae3104c2 100644 --- a/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +++ b/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py @@ -32,7 +32,7 @@ class OverflowCheckToolFactory: Const.PYNATIVE_MODE: None }, Const.KERNEL: { - Const.GRAPH_KBYK_MODE: None, + Const.GRAPH_KBYK_MODE: KernelGraphOverflowCheck, Const.GRAPH_GE_MODE: KernelGraphOverflowCheck, Const.PYNATIVE_MODE: None } diff --git a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py index 10b74ea22..cad37cebe 100644 --- a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py +++ b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py @@ -18,6 +18,7 @@ from 
msprobe.mindspore.debugger.debugger_config import DebuggerConfig from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory from msprobe.mindspore.free_benchmark.self_check_tool_factory import SelfCheckToolFactory +from msprobe.mindspore.exception_dump.exception_dump_tool_factory import ExceptionDumpToolFactory class TaskHandlerFactory: @@ -25,7 +26,8 @@ class TaskHandlerFactory: Const.TENSOR: DumpToolFactory, Const.STATISTICS: DumpToolFactory, Const.OVERFLOW_CHECK: OverflowCheckToolFactory, - Const.FREE_BENCHMARK: SelfCheckToolFactory + Const.FREE_BENCHMARK: SelfCheckToolFactory, + Const.EXCEPTION_DUMP: ExceptionDumpToolFactory } @staticmethod -- Gitee