diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py
index c430bcf3cd6c3bc2945ae551f95592e024e481c6..70989378dab6b9316c4212dfb1a91f6c7bcc18db 100644
--- a/debug/accuracy_tools/msprobe/core/common/const.py
+++ b/debug/accuracy_tools/msprobe/core/common/const.py
@@ -114,9 +114,10 @@ class Const:
RUN_UT = "run_ut"
GRAD_PROBE = "grad_probe"
STRUCTURE = "structure"
+ EXCEPTION_DUMP = "exception_dump"
DUMP_PRECISION_HIGH = "high"
DUMP_PRECISION_LOW = "low"
- TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE]
+ TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE, EXCEPTION_DUMP]
DUMP_DATA_COLLECTION_LIST = [STATISTICS, TENSOR, STRUCTURE]
DUMP_DATA_MODE_LIST = [ALL, INPUT, OUTPUT, FORWARD, BACKWARD]
DUMP_PRECISION_LIST = [DUMP_PRECISION_LOW, DUMP_PRECISION_HIGH]
diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index 7b70cd859ebca13db471837c469e7661e0a774f2..d592181acb83e6a448bbec969bf2fe0a63f43c57 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -12,7 +12,7 @@
| 参数 | 解释 | 是否必选 |
|-------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
-| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对,不支持 MSAdapter 场景;
"grad_probe":梯度监控, 不支持 MSAdapter 场景;
"structure":仅采集模型结构以及调用栈信息,不采集具体数据。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe),
[1.8 task 配置为 structure](#18-task-配置为-structure)。
**配置示例**:"task": "tensor"。 | 否 |
+| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对,不支持 MSAdapter 场景;
"grad_probe":梯度监控, 不支持 MSAdapter 场景;
"structure":仅采集模型结构以及调用栈信息,不采集具体数据;
"exception_dump":异常 dump,使用限制详见 1.9 节。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe),
[1.8 task 配置为 structure](#18-task-配置为-structure),
[1.9 task 配置为 exception_dump](#19-task-配置为-exception_dump)。
**配置示例**:"task": "tensor"。 | 否 |
| dump_path | 设置 dump 数据目录路径,str 类型。
**配置示例**:"dump_path": "./dump_path"。 | 是 |
| rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型,默认未配置(表示采集所有卡的数据),应配置元素为 ≥0 的整数或类似"4-6"的字符串,且须配置实际可用的 Rank ID。
PyTorch 场景: Rank ID 从 0 开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的 Rank ID,则 dump 数据为空,比如当前环境 Rank ID 为 0 到 7,实际训练运行 0 到 3 卡,此时若配置 Rank ID 为 4 或不存在的 10 等其他值,dump 数据为空。
MindSpore 场景:所有节点的 Rank ID 均从 0 开始计数,最大取值为每个节点可用卡总数-1,config.json 配置一次 rank 参数对所有节点同时生效。静态图 L0 级别 dump 暂不支持指定rank。
注意,单卡训练时,rank必须为[],即空列表,不能指定rank。
**配置示例**:"rank": [1, "4-6"]。 | 否 |
| step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。
**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 |
@@ -223,3 +223,12 @@ structure 模式仅采集模型结构,无其他特殊配置。
- [PyTorch场景](03.config_examples.md#16-task-配置为-structure)
- [MindSpore动态图场景](03.config_examples.md#35-task-配置为-structure)
+### 1.9 task 配置为 exception_dump
+MindSpore 动态图场景下,"level"须为"L2"; MindSpore 静态图场景下,"level"须为"L2",且模型编译优化等级(jit_level)须为"O0"或"O1"。
+
+在运行过程中会在指定目录下生成 kernel_graph_exception_check.json 的中间文件,该文件包含异常 dump 的相关设置。
+除中间文件外的其他 dump 结果文件请参见 MindSpore 官方文档中的[ Ascend 下 O0/O1 模式 Dump 数据对象目录和数据文件介绍](https://www.mindspore.cn/docs/zh-CN/r2.5.0/model_train/debug/dump.html#%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95%E5%92%8C%E6%95%B0%E6%8D%AE%E6%96%87%E4%BB%B6%E4%BB%8B%E7%BB%8D)。
+
+**示例**:
+ - [MindSpore动态图场景](03.config_examples.md#36-task-配置为-exception_dump)
+ - [MindSpore静态图场景](03.config_examples.md#24-task-配置为-exception_dump)
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/docs/03.config_examples.md b/debug/accuracy_tools/msprobe/docs/03.config_examples.md
index fe411225483885574b75750ad45ec8d2fc2b88fe..6350099be5a100a62d2f196c53c56baec7cca6e4 100644
--- a/debug/accuracy_tools/msprobe/docs/03.config_examples.md
+++ b/debug/accuracy_tools/msprobe/docs/03.config_examples.md
@@ -168,6 +168,18 @@
}
```
+### 2.4 task 配置为 exception_dump
+
+```json
+{
+ "task": "exception_dump",
+ "dump_path": "/home/data_dump",
+ "rank": [],
+ "step": [],
+ "level": "L2"
+}
+```
+
## 3 MindSpore 动态图场景
### 3.1 task 配置为 statistics
@@ -255,3 +267,15 @@
"level": "mix"
}
```
+
+### 3.6 task 配置为 exception_dump
+
+```json
+{
+ "task": "exception_dump",
+ "dump_path": "/home/data_dump",
+ "rank": [],
+ "step": [],
+ "level": "L2"
+}
+```
diff --git a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md
index 2b76510282946f00b0022be47be01f53f7422dad..0a89393819a175a1068509cb011cf4c67e8f1923 100644
--- a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md
+++ b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md
@@ -509,8 +509,7 @@ L2 级别 dump 的目录结构如下所示:
3. acl_dump_{device_id}.json 为在 Dump 接口调用过程中生成的中间文件,一般情况下无需关注。
-其他场景下,除 kernel_kbyk_dump.json(jit_level=O0/O1)、kernel_graph_dump.json(jit_level=O2)等无需关注的中间文件外的其他 dump 结果文件请参见 MindSpore 官方文档中的[ Ascend 下 O0/O1 模式 Dump 数据对象目录和数据文件介绍](https://www.mindspore.cn/docs/zh-CN/r2.5.0/model_train/debug/dump.html#%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95%E5%92%8C%E6%95%B0%E6%8D%AE%E6%96%87%E4%BB%B6%E4%BB%8B%E7%BB%8D)与[ Ascend 下 O2 模式 Dump 数据对象目录和数据文件介绍](https://www.mindspore.cn/docs/zh-CN/r2.5.0/model_train/debug/dump.html#%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95%E5%92%8C%E6%95%B0%E6%8D%AE%E6%96%87%E4%BB%B6%E4%BB%8B%E7%BB%8D-1)。
-
+其他场景下,除 kernel_kbyk_dump.json(jit_level=O0/O1)、kernel_graph_dump.json(jit_level=O2)等无需关注的中间文件外的其他 dump 结果文件请参见 MindSpore 官方文档中的[ Ascend 下 O0/O1 模式 Dump 数据对象目录和数据文件介绍](https://www.mindspore.cn/docs/zh-CN/r2.5.0/model_train/debug/dump.html#%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95%E5%92%8C%E6%95%B0%E6%8D%AE%E6%96%87%E4%BB%B6%E4%BB%8B%E7%BB%8D)。
+
### 8.2 动态图场景
dump 结果目录结构示例如下:
diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py
index 34afaa60c2958991e33d351373d0c06cd1e146cc..16dda7a31dd5836af6e3adb5cd17fc39fb17009d 100644
--- a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py
+++ b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py
@@ -43,6 +43,7 @@ from msprobe.mindspore.task_handler_factory import TaskHandlerFactory
try:
from mindspore._c_expression import _dump_start, _dump_stop, _dump_step, _set_init_iter, _dump_set_dynamic
+ import mindspore as ms
except ImportError:
enable_dynamic_kbyk_dump = False
else:
@@ -165,6 +166,7 @@ class PrecisionDebugger(BasePrecisionDebugger):
else:
Runtime.is_running = False
if enable_dynamic_kbyk_dump and instance.config.level_ori == Const.LEVEL_L2:
+ ms.runtime.synchronize()
_dump_stop()
if cls._is_kernel_dump() and _msprobe_c:
_msprobe_c._PrecisionDebugger().stop()
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_dump/__init__.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..db55811de80c4abfa738e50610b5582984b9e08d
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from msprobe.core.common.log import logger
+from msprobe.mindspore.common.const import Const
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.mindspore.exception_dump.kernel_graph_exception_dump import KernelGraphExceptionDump
+
+
+class ExceptionDumpToolFactory:
+ tools = {
+ Const.CELL: {
+ Const.GRAPH_KBYK_MODE: None,
+ Const.GRAPH_GE_MODE: None,
+ Const.PYNATIVE_MODE: None
+ },
+ Const.API: {
+ Const.GRAPH_KBYK_MODE: None,
+ Const.GRAPH_GE_MODE: None,
+ Const.PYNATIVE_MODE: None
+ },
+ Const.KERNEL: {
+ Const.GRAPH_KBYK_MODE: KernelGraphExceptionDump,
+ Const.GRAPH_GE_MODE: None,
+ Const.PYNATIVE_MODE: KernelGraphExceptionDump
+ }
+ }
+
+ @staticmethod
+ def create(config: DebuggerConfig):
+ tool = ExceptionDumpToolFactory.tools.get(config.level)
+ if not tool:
+ raise Exception("Valid level is needed.")
+ tool = tool.get(config.execution_mode)
+ if not tool:
+ logger.error(f"Exception dump is not supported in {config.execution_mode} mode "
+ f"when level is {config.level}.")
+ raise ValueError
+ return (tool(config),)
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9c4e6f72c129dccba86ff85319d7fdc6a139225
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from msprobe.core.common.file_utils import create_directory, save_json
+from msprobe.mindspore.common.log import logger
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+
+
+class KernelGraphExceptionDump:
+
+ def __init__(self, config: DebuggerConfig):
+ self.dump_json = dict()
+ self.dump_json["common_dump_settings"] = dict()
+ self.dump_json["common_dump_settings"]["dump_mode"] = 0
+ self.dump_json["common_dump_settings"]["path"] = ""
+ self.dump_json["common_dump_settings"]["net_name"] = "Net"
+ self.dump_json["common_dump_settings"]["iteration"] = "all"
+ self.dump_json["common_dump_settings"]["saved_data"] = "tensor"
+ self.dump_json["common_dump_settings"]["input_output"] = 0
+ self.dump_json["common_dump_settings"]["kernels"] = []
+ self.dump_json["common_dump_settings"]["support_device"] = [0, 1, 2, 3, 4, 5, 6, 7]
+ self.dump_json["common_dump_settings"]["op_debug_mode"] = 4
+ self.dump_json["common_dump_settings"]["file_format"] = "npy"
+ self.dump_json["e2e_dump_settings"] = dict()
+ self.dump_json["e2e_dump_settings"]["enable"] = not config.async_dump
+ self.dump_json["e2e_dump_settings"]["trans_flag"] = True
+
+ if config.stat_cal_mode and config.device_stat_precision_mode:
+ self.dump_json["e2e_dump_settings"]["stat_calc_mode"] = config.stat_cal_mode
+ self.dump_json["e2e_dump_settings"]["device_stat_precision_mode"] = config.device_stat_precision_mode
+ self.dump_json["common_dump_settings"]["path"] = config.dump_path
+ if len(config.step) > 0:
+ logger.warning("Step would change to all in this task.")
+ if len(config.rank) > 0:
+ self.dump_json["common_dump_settings"]["support_device"] = config.rank
+
+ def handle(self):
+ json_path = self.dump_json["common_dump_settings"]["path"]
+ create_directory(json_path)
+ json_path = os.path.join(json_path, "kernel_graph_exception_check.json")
+ save_json(json_path, self.dump_json, indent=4)
+ logger.info(json_path + " has been created.")
+ os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
diff --git a/debug/accuracy_tools/msprobe/mindspore/ms_config.py b/debug/accuracy_tools/msprobe/mindspore/ms_config.py
index f67754cfde47c89526ecac34d309d262ad8cbc85..e7250332afc366d4f66133ca087374171fbdf812 100644
--- a/debug/accuracy_tools/msprobe/mindspore/ms_config.py
+++ b/debug/accuracy_tools/msprobe/mindspore/ms_config.py
@@ -80,6 +80,12 @@ class OverflowCheckConfig(BaseConfig):
raise Exception("check_mode is invalid")
+class ExceptionDumpConfig(BaseConfig):
+ def __init__(self, json_config):
+ super().__init__(json_config)
+ self.data_mode = ["all"]
+
+
class FreeBenchmarkConfig(BaseConfig):
def __init__(self, task_config):
super().__init__(task_config)
@@ -129,7 +135,8 @@ TaskDict = {
Const.OVERFLOW_CHECK: OverflowCheckConfig,
Const.FREE_BENCHMARK: FreeBenchmarkConfig,
Const.GRAD_PROBE: GradProbeConfig,
- Const.STRUCTURE: StructureConfig
+ Const.STRUCTURE: StructureConfig,
+ Const.EXCEPTION_DUMP: ExceptionDumpConfig
}
diff --git a/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py
index 0b3ed6221c2c4fc2d380072a480f35d5815cb89e..bae3104c249654dee5962b0d509cb53db5666b74 100644
--- a/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py
+++ b/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py
@@ -32,7 +32,7 @@ class OverflowCheckToolFactory:
Const.PYNATIVE_MODE: None
},
Const.KERNEL: {
- Const.GRAPH_KBYK_MODE: None,
+ Const.GRAPH_KBYK_MODE: KernelGraphOverflowCheck,
Const.GRAPH_GE_MODE: KernelGraphOverflowCheck,
Const.PYNATIVE_MODE: None
}
diff --git a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
index 10b74ea22b02d0668d0b3b17a569c5e1a67c1dd8..cad37cebe8b5de39e3954da0eea2edd26b79223e 100644
--- a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
+++ b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
@@ -18,6 +18,7 @@ from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory
from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory
from msprobe.mindspore.free_benchmark.self_check_tool_factory import SelfCheckToolFactory
+from msprobe.mindspore.exception_dump.exception_dump_tool_factory import ExceptionDumpToolFactory
class TaskHandlerFactory:
@@ -25,7 +26,8 @@ class TaskHandlerFactory:
Const.TENSOR: DumpToolFactory,
Const.STATISTICS: DumpToolFactory,
Const.OVERFLOW_CHECK: OverflowCheckToolFactory,
- Const.FREE_BENCHMARK: SelfCheckToolFactory
+ Const.FREE_BENCHMARK: SelfCheckToolFactory,
+ Const.EXCEPTION_DUMP: ExceptionDumpToolFactory
}
@staticmethod