From 2ab1fa59f00f4ea74cd3bbe4ca28113e23587f17 Mon Sep 17 00:00:00 2001
From: qqqhhhbbb
Date: Sat, 26 Jul 2025 10:32:36 +0800
Subject: [PATCH 1/2] add exception dump

---
Review notes (free text after the "---" marker; not part of the commit message):
* Adds the task name Const.EXCEPTION_CHECK = "exception_check" to TASK_LIST and
  maps it to ExceptionCheckConfig (ms_config.TaskDict) and to
  ExceptionCheckToolFactory (TaskHandlerFactory.tasks).
* ExceptionCheckToolFactory.create() returns a 1-tuple holding a
  KernelGraphExceptionCheck only for level Const.KERNEL with GRAPH_KBYK_MODE or
  PYNATIVE_MODE. A level missing from the table raises Exception; a level/mode
  pair mapped to None logs an error and raises ValueError.
* KernelGraphExceptionCheck.__init__ builds the dump settings dict
  (op_debug_mode 4 by default, 1 for check_mode "aicore", 2 for "atomic");
  handle() saves it as kernel_graph_exception_check.json under config.dump_path,
  points MINDSPORE_DUMP_CONFIG at that file and removes MS_ACL_DUMP_CFG_PATH
  from the environment when present.
* OverflowCheckToolFactory: KERNEL / GRAPH_KBYK_MODE changes from None to
  KernelGraphOverflowCheck.
* The unsupported-mode error text in ExceptionCheckToolFactory.create() names
  "Exception dump" (the overflow factory's wording does not apply here), and
  the non-English comment in handle() is written in English; the index lines
  still carry the blob ids of the submitted revision.
* NOTE(review): handle() raises "Must run in graph mode, not kbk mode" when
  GRAPH_OP_RUN == "1", while the factory enables this tool for
  GRAPH_KBYK_MODE - confirm the guard is intended.
* NOTE(review): ExceptionCheckConfig._check_config() validates overflow_nums,
  which looks carried over from OverflowCheckConfig - confirm it is needed.
* NOTE(review): E2E_SETTINGS is defined but the literal "e2e_dump_settings" is
  used when storing e2e_set; the commented-out _msprobe_c block in handle() is
  dead code - consider removing both in a follow-up.

 .../msprobe/core/common/const.py              |  3 +-
 .../mindspore/exception_check/__init__.py     |  0
 .../exception_check_tool_factory.py           | 51 ++++++++++++
 .../kernel_graph_exception_check.py           | 81 +++++++++++++++++++
 .../msprobe/mindspore/ms_config.py            | 16 +++-
 .../overflow_check_tool_factory.py            |  2 +-
 .../msprobe/mindspore/task_handler_factory.py |  4 +-
 7 files changed, 153 insertions(+), 4 deletions(-)
 create mode 100644 debug/accuracy_tools/msprobe/mindspore/exception_check/__init__.py
 create mode 100644 debug/accuracy_tools/msprobe/mindspore/exception_check/exception_check_tool_factory.py
 create mode 100644 debug/accuracy_tools/msprobe/mindspore/exception_check/kernel_graph_exception_check.py

diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py
index 560d939b34..f8d8045e89 100644
--- a/debug/accuracy_tools/msprobe/core/common/const.py
+++ b/debug/accuracy_tools/msprobe/core/common/const.py
@@ -113,9 +113,10 @@ class Const:
     RUN_UT = "run_ut"
     GRAD_PROBE = "grad_probe"
     STRUCTURE = "structure"
+    EXCEPTION_CHECK = "exception_check"
     DUMP_PRECISION_HIGH = "high"
     DUMP_PRECISION_LOW = "low"
-    TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE]
+    TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE, EXCEPTION_CHECK]
     DUMP_DATA_COLLECTION_LIST = [STATISTICS, TENSOR, STRUCTURE]
     DUMP_DATA_MODE_LIST = [ALL, INPUT, OUTPUT, FORWARD, BACKWARD]
     DUMP_PRECISION_LIST = [DUMP_PRECISION_LOW, DUMP_PRECISION_HIGH]
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_check/__init__.py b/debug/accuracy_tools/msprobe/mindspore/exception_check/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_check/exception_check_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/exception_check/exception_check_tool_factory.py
new file mode 100644
index 0000000000..1115b16cee
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/mindspore/exception_check/exception_check_tool_factory.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from msprobe.core.common.log import logger
+from msprobe.mindspore.common.const import Const
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+from msprobe.mindspore.exception_check.kernel_graph_exception_check import KernelGraphExceptionCheck
+
+
+class ExceptionCheckToolFactory:
+    tools = {
+        Const.CELL: {
+            Const.GRAPH_KBYK_MODE: None,
+            Const.GRAPH_GE_MODE: None,
+            Const.PYNATIVE_MODE: None
+        },
+        Const.API: {
+            Const.GRAPH_KBYK_MODE: None,
+            Const.GRAPH_GE_MODE: None,
+            Const.PYNATIVE_MODE: None
+        },
+        Const.KERNEL: {
+            Const.GRAPH_KBYK_MODE: KernelGraphExceptionCheck,
+            Const.GRAPH_GE_MODE: None,
+            Const.PYNATIVE_MODE: KernelGraphExceptionCheck
+        }
+    }
+
+    @staticmethod
+    def create(config: DebuggerConfig):
+        tool = ExceptionCheckToolFactory.tools.get(config.level)
+        if not tool:
+            raise Exception("Valid level is needed.")
+        tool = tool.get(config.execution_mode)
+        if not tool:
+            logger.error(f"Exception dump is not supported in {config.execution_mode} mode "
+                         f"when level is {config.level}.")
+            raise ValueError
+        return (tool(config),)
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_check/kernel_graph_exception_check.py b/debug/accuracy_tools/msprobe/mindspore/exception_check/kernel_graph_exception_check.py
new file mode 100644
index 0000000000..0dd3c03cf2
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/mindspore/exception_check/kernel_graph_exception_check.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from msprobe.core.common.file_utils import create_directory, save_json
+from msprobe.mindspore.common.log import logger
+from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
+
+
+class KernelGraphExceptionCheck:
+    E2E_SETTINGS = "e2e_dump_settings"
+
+    def __init__(self, config: DebuggerConfig):
+        self.dump_json = dict()
+        self.dump_json["common_dump_settings"] = dict()
+        self.dump_json["common_dump_settings"]["dump_mode"] = 0
+        self.dump_json["common_dump_settings"]["path"] = ""
+        self.dump_json["common_dump_settings"]["net_name"] = "Net"
+        self.dump_json["common_dump_settings"]["iteration"] = "all"
+        self.dump_json["common_dump_settings"]["saved_data"] = "full"
+        self.dump_json["common_dump_settings"]["input_output"] = 0
+        self.dump_json["common_dump_settings"]["kernels"] = []
+        self.dump_json["common_dump_settings"]["support_device"] = [0, 1, 2, 3, 4, 5, 6, 7]
+        self.dump_json["common_dump_settings"]["op_debug_mode"] = 4
+        self.dump_json["common_dump_settings"]["file_format"] = "npy"
+
+        if config.stat_cal_mode and config.device_stat_precision_mode:
+            e2e_set = {
+                "enable": not config.async_dump,
+                "trans_flag": True,
+                "stat_calc_mode": config.stat_cal_mode,
+                "device_stat_precision_mode": config.device_stat_precision_mode
+            }
+        else:
+            e2e_set = {
+                "enable": not config.async_dump,
+                "trans_flag": True
+            }
+
+        self.dump_json["common_dump_settings"]["path"] = config.dump_path
+        if len(config.step) > 0:
+            logger.warning("Step would change to all in this task.")
+        if len(config.rank) > 0:
+            self.dump_json["common_dump_settings"]["support_device"] = config.rank
+        if config.check_mode == "aicore":
+            self.dump_json["common_dump_settings"]["op_debug_mode"] = 1
+        elif config.check_mode == "atomic":
+            self.dump_json["common_dump_settings"]["op_debug_mode"] = 2
+
+        self.dump_json["e2e_dump_settings"] = e2e_set
+    def handle(self):
+        # try:
+        #     from msprobe.lib import _msprobe_c
+        #     return
+        # except ImportError:
+        #     # Without _msprobe_c, fall back to the legacy MindSpore flow
+        #     logger.info("Module _msprobe_c has not been installed, use interface in mindspore instead.")
+
+        if os.getenv("GRAPH_OP_RUN") == "1":
+            raise Exception("Must run in graph mode, not kbk mode")
+        json_path = self.dump_json["common_dump_settings"]["path"]
+        create_directory(json_path)
+        json_path = os.path.join(json_path, "kernel_graph_exception_check.json")
+        save_json(json_path, self.dump_json, indent=4)
+        logger.info(json_path + " has been created.")
+        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
+        if "MS_ACL_DUMP_CFG_PATH" in os.environ:
+            del os.environ["MS_ACL_DUMP_CFG_PATH"]
diff --git a/debug/accuracy_tools/msprobe/mindspore/ms_config.py b/debug/accuracy_tools/msprobe/mindspore/ms_config.py
index 2c27b05aa2..616372a010 100644
--- a/debug/accuracy_tools/msprobe/mindspore/ms_config.py
+++ b/debug/accuracy_tools/msprobe/mindspore/ms_config.py
@@ -78,6 +78,19 @@ class OverflowCheckConfig(BaseConfig):
         if self.check_mode and self.check_mode not in ["all", "aicore", "atomic"]:
             raise Exception("check_mode is invalid")
 
+class ExceptionCheckConfig(BaseConfig):
+    def __init__(self, json_config):
+        super().__init__(json_config)
+        self.data_mode = ["all"]
+        self._check_config()
+
+    def _check_config(self):
+        if self.overflow_nums is not None and not is_int(self.overflow_nums):
+            raise Exception("overflow_nums is invalid, it should be an integer")
+        if self.overflow_nums is not None and self.overflow_nums != -1 and self.overflow_nums <= 0:
+            raise Exception("overflow_nums should be -1 or positive integer")
+        if self.check_mode and self.check_mode not in ["all", "aicore", "atomic"]:
+            raise Exception("check_mode is invalid")
 
 class FreeBenchmarkConfig(BaseConfig):
     def __init__(self, task_config):
@@ -128,7 +141,8 @@ TaskDict = {
     Const.OVERFLOW_CHECK: OverflowCheckConfig,
     Const.FREE_BENCHMARK: FreeBenchmarkConfig,
     Const.GRAD_PROBE: GradProbeConfig,
-    Const.STRUCTURE: StructureConfig
+    Const.STRUCTURE: StructureConfig,
+    Const.EXCEPTION_CHECK: ExceptionCheckConfig
 }
 
 
diff --git a/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py
index 0b3ed6221c..bae3104c24 100644
--- a/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py
+++ b/debug/accuracy_tools/msprobe/mindspore/overflow_check/overflow_check_tool_factory.py
@@ -32,7 +32,7 @@ class OverflowCheckToolFactory:
             Const.PYNATIVE_MODE: None
         },
         Const.KERNEL: {
-            Const.GRAPH_KBYK_MODE: None,
+            Const.GRAPH_KBYK_MODE: KernelGraphOverflowCheck,
             Const.GRAPH_GE_MODE: KernelGraphOverflowCheck,
             Const.PYNATIVE_MODE: None
         }
diff --git a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
index 10b74ea22b..cfba6028fa 100644
--- a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
+++ b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
@@ -18,6 +18,7 @@ from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
 from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory
 from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory
 from msprobe.mindspore.free_benchmark.self_check_tool_factory import SelfCheckToolFactory
+from msprobe.mindspore.exception_check.exception_check_tool_factory import ExceptionCheckToolFactory
 
 
 class TaskHandlerFactory:
@@ -25,7 +26,8 @@ class TaskHandlerFactory:
         Const.TENSOR: DumpToolFactory,
         Const.STATISTICS: DumpToolFactory,
         Const.OVERFLOW_CHECK: OverflowCheckToolFactory,
-        Const.FREE_BENCHMARK: SelfCheckToolFactory
+        Const.FREE_BENCHMARK: SelfCheckToolFactory,
+        Const.EXCEPTION_CHECK: ExceptionCheckToolFactory
     }
 
     @staticmethod
-- 
Gitee

From 3cefc0313ed61a8b7f43de4bdf9fe05045277501 Mon Sep 17 00:00:00 2001
From: qqqhhhbbb
Date: Tue, 29 Jul 2025 17:40:52 +0800
Subject: [PATCH 2/2] add synchronize

add synchronize
---
Review notes (free text after the "---" marker; not part of the commit message):
* Renames the exception_check package, modules and classes to exception_dump
  (ExceptionDumpToolFactory, KernelGraphExceptionDump, ExceptionDumpConfig) and
  replaces Const.EXCEPTION_CHECK with Const.EXCEPTION_DUMP = "exception_dump".
* KernelGraphExceptionDump: common_dump_settings.saved_data changes from
  "full" to "tensor".
* PrecisionDebugger: "import mindspore as ms" is added inside the existing
  try/except ImportError block, and ms.runtime.synchronize() is called right
  before _dump_stop() when enable_dynamic_kbyk_dump is set.
* NOTE(review): ms.runtime.synchronize presumably needs a MindSpore release
  that ships mindspore.runtime; a missing attribute would surface at stop()
  time rather than through the ImportError guard - confirm the minimum
  supported version.
* NOTE(review): handle() still writes "kernel_graph_exception_check.json";
  the file name is not touched by this rename - confirm that is intended.

 debug/accuracy_tools/msprobe/core/common/const.py      |  4 ++--
 .../msprobe/mindspore/debugger/precision_debugger.py   |  2 ++
 .../{exception_check => exception_dump}/__init__.py    |  0
 .../exception_dump_tool_factory.py}                    | 10 +++++-----
 .../kernel_graph_exception_dump.py}                    |  4 ++--
 debug/accuracy_tools/msprobe/mindspore/ms_config.py    |  4 ++--
 .../msprobe/mindspore/task_handler_factory.py          |  4 ++--
 7 files changed, 15 insertions(+), 13 deletions(-)
 rename debug/accuracy_tools/msprobe/mindspore/{exception_check => exception_dump}/__init__.py (100%)
 rename debug/accuracy_tools/msprobe/mindspore/{exception_check/exception_check_tool_factory.py => exception_dump/exception_dump_tool_factory.py} (83%)
 rename debug/accuracy_tools/msprobe/mindspore/{exception_check/kernel_graph_exception_check.py => exception_dump/kernel_graph_exception_dump.py} (98%)

diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py
index f8d8045e89..32f206fca9 100644
--- a/debug/accuracy_tools/msprobe/core/common/const.py
+++ b/debug/accuracy_tools/msprobe/core/common/const.py
@@ -113,10 +113,10 @@ class Const:
     RUN_UT = "run_ut"
     GRAD_PROBE = "grad_probe"
     STRUCTURE = "structure"
-    EXCEPTION_CHECK = "exception_check"
+    EXCEPTION_DUMP = "exception_dump"
     DUMP_PRECISION_HIGH = "high"
     DUMP_PRECISION_LOW = "low"
-    TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE, EXCEPTION_CHECK]
+    TASK_LIST = [TENSOR, STATISTICS, OVERFLOW_CHECK, FREE_BENCHMARK, RUN_UT, GRAD_PROBE, STRUCTURE, EXCEPTION_DUMP]
     DUMP_DATA_COLLECTION_LIST = [STATISTICS, TENSOR, STRUCTURE]
     DUMP_DATA_MODE_LIST = [ALL, INPUT, OUTPUT, FORWARD, BACKWARD]
     DUMP_PRECISION_LIST = [DUMP_PRECISION_LOW, DUMP_PRECISION_HIGH]
diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py
index ddde912946..269b1fae65 100644
--- a/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py
+++ b/debug/accuracy_tools/msprobe/mindspore/debugger/precision_debugger.py
@@ -43,6 +43,7 @@ from msprobe.mindspore.task_handler_factory import TaskHandlerFactory
 
 try:
     from mindspore._c_expression import _dump_start, _dump_stop, _dump_step, _set_init_iter, _dump_set_dynamic
+    import mindspore as ms
 except ImportError:
     enable_dynamic_kbyk_dump = False
 else:
@@ -166,6 +167,7 @@ class PrecisionDebugger(BasePrecisionDebugger):
         else:
             Runtime.is_running = False
             if enable_dynamic_kbyk_dump:
+                ms.runtime.synchronize()
                 _dump_stop()
             if cls._is_kernel_dump() and _msprobe_c:
                 _msprobe_c._PrecisionDebugger().stop()
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_check/__init__.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/__init__.py
similarity index 100%
rename from debug/accuracy_tools/msprobe/mindspore/exception_check/__init__.py
rename to debug/accuracy_tools/msprobe/mindspore/exception_dump/__init__.py
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_check/exception_check_tool_factory.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py
similarity index 83%
rename from debug/accuracy_tools/msprobe/mindspore/exception_check/exception_check_tool_factory.py
rename to debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py
index 1115b16cee..d2c0db8dd3 100644
--- a/debug/accuracy_tools/msprobe/mindspore/exception_check/exception_check_tool_factory.py
+++ b/debug/accuracy_tools/msprobe/mindspore/exception_dump/exception_dump_tool_factory.py
@@ -16,10 +16,10 @@
 from msprobe.core.common.log import logger
 from msprobe.mindspore.common.const import Const
 from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
-from msprobe.mindspore.exception_check.kernel_graph_exception_check import KernelGraphExceptionCheck
+from msprobe.mindspore.exception_dump.kernel_graph_exception_dump import KernelGraphExceptionDump
 
 
-class ExceptionCheckToolFactory:
+class ExceptionDumpToolFactory:
     tools = {
         Const.CELL: {
             Const.GRAPH_KBYK_MODE: None,
@@ -32,15 +32,15 @@ class ExceptionCheckToolFactory:
             Const.PYNATIVE_MODE: None
         },
         Const.KERNEL: {
-            Const.GRAPH_KBYK_MODE: KernelGraphExceptionCheck,
+            Const.GRAPH_KBYK_MODE: KernelGraphExceptionDump,
             Const.GRAPH_GE_MODE: None,
-            Const.PYNATIVE_MODE: KernelGraphExceptionCheck
+            Const.PYNATIVE_MODE: KernelGraphExceptionDump
         }
     }
 
     @staticmethod
     def create(config: DebuggerConfig):
-        tool = ExceptionCheckToolFactory.tools.get(config.level)
+        tool = ExceptionDumpToolFactory.tools.get(config.level)
         if not tool:
             raise Exception("Valid level is needed.")
         tool = tool.get(config.execution_mode)
diff --git a/debug/accuracy_tools/msprobe/mindspore/exception_check/kernel_graph_exception_check.py b/debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py
similarity index 98%
rename from debug/accuracy_tools/msprobe/mindspore/exception_check/kernel_graph_exception_check.py
rename to debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py
index 0dd3c03cf2..72f1a55a76 100644
--- a/debug/accuracy_tools/msprobe/mindspore/exception_check/kernel_graph_exception_check.py
+++ b/debug/accuracy_tools/msprobe/mindspore/exception_dump/kernel_graph_exception_dump.py
@@ -20,7 +20,7 @@ from msprobe.mindspore.common.log import logger
 from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
 
 
-class KernelGraphExceptionCheck:
+class KernelGraphExceptionDump:
     E2E_SETTINGS = "e2e_dump_settings"
 
     def __init__(self, config: DebuggerConfig):
@@ -30,7 +30,7 @@ class KernelGraphExceptionCheck:
         self.dump_json["common_dump_settings"]["path"] = ""
         self.dump_json["common_dump_settings"]["net_name"] = "Net"
         self.dump_json["common_dump_settings"]["iteration"] = "all"
-        self.dump_json["common_dump_settings"]["saved_data"] = "full"
+        self.dump_json["common_dump_settings"]["saved_data"] = "tensor"
         self.dump_json["common_dump_settings"]["input_output"] = 0
         self.dump_json["common_dump_settings"]["kernels"] = []
         self.dump_json["common_dump_settings"]["support_device"] = [0, 1, 2, 3, 4, 5, 6, 7]
diff --git a/debug/accuracy_tools/msprobe/mindspore/ms_config.py b/debug/accuracy_tools/msprobe/mindspore/ms_config.py
index 616372a010..410701ec49 100644
--- a/debug/accuracy_tools/msprobe/mindspore/ms_config.py
+++ b/debug/accuracy_tools/msprobe/mindspore/ms_config.py
@@ -78,7 +78,7 @@ class OverflowCheckConfig(BaseConfig):
         if self.check_mode and self.check_mode not in ["all", "aicore", "atomic"]:
             raise Exception("check_mode is invalid")
 
-class ExceptionCheckConfig(BaseConfig):
+class ExceptionDumpConfig(BaseConfig):
     def __init__(self, json_config):
         super().__init__(json_config)
         self.data_mode = ["all"]
@@ -142,7 +142,7 @@ TaskDict = {
     Const.FREE_BENCHMARK: FreeBenchmarkConfig,
     Const.GRAD_PROBE: GradProbeConfig,
     Const.STRUCTURE: StructureConfig,
-    Const.EXCEPTION_CHECK: ExceptionCheckConfig
+    Const.EXCEPTION_DUMP: ExceptionDumpConfig
 }
 
 
diff --git a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
index cfba6028fa..cad37cebe8 100644
--- a/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
+++ b/debug/accuracy_tools/msprobe/mindspore/task_handler_factory.py
@@ -18,7 +18,7 @@ from msprobe.mindspore.debugger.debugger_config import DebuggerConfig
 from msprobe.mindspore.dump.dump_tool_factory import DumpToolFactory
 from msprobe.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory
 from msprobe.mindspore.free_benchmark.self_check_tool_factory import SelfCheckToolFactory
-from msprobe.mindspore.exception_check.exception_check_tool_factory import ExceptionCheckToolFactory
+from msprobe.mindspore.exception_dump.exception_dump_tool_factory import ExceptionDumpToolFactory
 
 
 class TaskHandlerFactory:
@@ -27,7 +27,7 @@ class TaskHandlerFactory:
         Const.STATISTICS: DumpToolFactory,
         Const.OVERFLOW_CHECK: OverflowCheckToolFactory,
         Const.FREE_BENCHMARK: SelfCheckToolFactory,
-        Const.EXCEPTION_CHECK: ExceptionCheckToolFactory
+        Const.EXCEPTION_DUMP: ExceptionDumpToolFactory
     }
 
     @staticmethod
-- 
Gitee