From e84ca6bc7390efefbb1be36a66f008370e7a86f4 Mon Sep 17 00:00:00 2001
From: curry3 <485078529@qq.com>
Date: Sat, 12 Jul 2025 16:10:22 +0800
Subject: [PATCH] [bugfix] Fix inaccurate data collected by dump
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../data_processor/mindspore_processor.py     |  4 +-
 .../data_processor/pytorch_processor.py       |  4 +-
 .../msprobe/docs/02.config_introduction.md    |  2 +-
 .../docs/28.debugger_save_instruction.md      |  2 +-
 .../mindspore/debugger/debugger_config.py     | 24 +++++--
 .../pytorch/debugger/debugger_config.py       | 30 +++++----
 .../save/test_debugger_save_mindspore.py      | 65 -------------------
 .../debugger/test_pt_debugger_config.py       | 10 +++
 .../test_debugger_save_pytorch.py             | 31 ---------
 9 files changed, 50 insertions(+), 122 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
index c33ca941ab..f5c2b89a1b 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
@@ -73,7 +73,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
             tensor_stat.max = np.max(data_np).item()
             tensor_stat.min = np.min(data_np).item()
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.copy()
         elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             data_abs = np.abs(data.asnumpy())
             tensor_stat.max = np.max(data_abs).item()
@@ -97,7 +97,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
             tensor_stat.max = mint.any(data)
             tensor_stat.min = mint.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.copy()
         elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             logger.warning("Async dump do not support complex data!")
             return tensor_stat
diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
index cc170d32ed..a414175498 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
@@ -104,7 +104,7 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.max = torch.any(data)
             tensor_stat.min = torch.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.clone()
         else:
             if data.dtype == torch.float64 or not data.is_floating_point():
                 data = data.float()
@@ -127,7 +127,7 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.max = torch.any(data)
             tensor_stat.min = torch.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.clone()
         else:
             if data.dtype == torch.float64 or not data.is_floating_point():
                 data = data.float()
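The two processor changes above stop storing a bare reference to a zero-dimensional tensor in the stat fields and store a copy/clone instead. The sketch below is a minimal illustration of the likely failure mode, not msprobe code (function names are invented): with async_dump the statistics are flushed only after the step finishes, so a stat that still references the live scalar can be mutated in place before it is written, while a snapshot taken at collection time stays accurate. The MindSpore branch applies the same idea with data.copy() instead of clone().

import torch

def record_stat_by_reference(scalar: torch.Tensor) -> torch.Tensor:
    # Pre-fix behaviour: keep a reference to the live tensor.
    return scalar

def record_stat_by_clone(scalar: torch.Tensor) -> torch.Tensor:
    # Post-fix behaviour: snapshot the value at collection time.
    return scalar.clone()

x = torch.tensor(1.0)          # 0-d tensor, i.e. `not data.shape` is True
by_ref = record_stat_by_reference(x)
by_clone = record_stat_by_clone(x)

x.add_(41.0)                   # the training step keeps mutating the tensor in place

# By the time an asynchronous writer flushes at the end of the step,
# the referenced value has drifted while the cloned one has not.
print(by_ref.item())           # 42.0 -> wrong statistic
print(by_clone.item())         # 1.0  -> value at collection time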
diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index 84f16429db..d388d697fa 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -18,7 +18,7 @@
 | step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。<br>**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 |
 | level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:<br>"L0":dump 模块级精度数据,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明)。<br>"L1":dump API 级精度数据,默认值,仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持。<br>"L2":dump kernel 级精度数据,PyTorch 场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);MindSpore 动态图场景详细介绍见 [MindSpore 动态图场景的 kernel dump 说明](./28.kernel_dump_MindSpore.md);MindSpore 静态图场景详细介绍见《MindSpore 场景的数据采集》中的 ["**8.1 静态图场景**"](./06.data_dump_MindSpore.md#81-静态图场景)小节。<br>"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持。<br>"debug":单点保存功能,详见[单点保存工具](./28.debugger_save_instruction.md)。<br>**配置示例**:"level": "L1"。 | 否 |
 | enable_dataloader | 自动控制开关,bool 类型,仅 PyTorch 场景支持。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后自动识别 step 参数指定的迭代,并在该迭代执行完成后退出训练,此时 start、stop 和 step 函数可不配置,开启该开关要求训练脚本是通过 torch.utils.data.dataloader 方式加载数据。仅支持 PyTorch 单卡训练使用,分布式训练场景下存在数据 dump 不全问题。 **这个特性下个版本将被废弃** | 否 |
-| async_dump | 异步 dump 开关,bool 类型, 支持 task 为 tensor 或 statistic 模式, level 支持 L0、 L1、 mix、 debug 模式。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式暂不支持复数类型 tensor 的统计量计算。<br> | 否 |
+| async_dump | 异步 dump 开关,bool 类型, 支持 task 为 tensor 或 statistic 模式, level 支持 L0、 L1、 mix、 debug 模式。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式下,summary_mode 不支持 md5 值,也不支持复数类型 tensor 的统计量计算。<br> | 否 |
 
 #### 1.1.1 模块级精度数据 dump 说明
 
diff --git a/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md b/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
index 3b85320fa3..7f00aab045 100644
--- a/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
+++ b/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
@@ -27,7 +27,7 @@ L0, L1, mix级别的dump能力存在盲区,网络中的非API或module的输
 | dump_path | 设置 dump 数据目录路径,str 类型。 | 是 |
 | rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型。 | 否 |
 | step | 指定采集某个 Step 的数据,list[Union[int, str]] 类型。 | 否 |
-| async_dump | 异步 dump 开关,bool 类型。 | 否 |
+| async_dump | 异步 dump 开关,bool 类型。该模式下,summary_mode 不支持 md5 值,也不支持复数类型 tensor 的统计量计算。 | 否 |
 
 "statistics" 任务子配置项:
 | 参数 | 解释 | 是否必选 |
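For reference, a config.json enabling the asynchronous statistics dump described in the two documents above might be written as in the sketch below. The top-level field names follow 02.config_introduction.md; the "statistics" sub-keys, paths and empty lists are illustrative assumptions rather than a definitive template.

import json

# Illustrative async statistics dump configuration (summary_mode deliberately not "md5").
config = {
    "task": "statistics",
    "dump_path": "./dump_output",
    "rank": [],
    "step": [],
    "level": "L1",
    "async_dump": True,
    "statistics": {
        "scope": [],
        "list": [],
        "summary_mode": "statistics"
    }
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=4)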
diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
index 90b84c9002..e57d8d6f38 100644
--- a/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
+++ b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
@@ -49,7 +49,7 @@ class DebuggerConfig:
         self.summary_mode = task_config.summary_mode
         self.stat_cal_mode = task_config.stat_cal_mode if hasattr(task_config, 'stat_cal_mode') else None
         self.device_stat_precision_mode = task_config.device_stat_precision_mode \
-            if hasattr(task_config, 'device_stat_precision_mode') else None
+            if hasattr(task_config, 'device_stat_precision_mode') else None
         self.async_dump = common_config.async_dump if common_config.async_dump else False
         self.check()
         self._check_statistics_config(task_config)
@@ -115,18 +115,28 @@ class DebuggerConfig:
             self.check_mode = "all"
         if not isinstance(self.async_dump, bool):
             raise Exception("The parameters async_dump should be bool.")
-        if self.async_dump and self.task == Const.TENSOR:
-            if self.level_ori == Const.LEVEL_DEBUG:
-                self.list = []  # async_dump + debug level case ignore list
-            if not self.list and self.level_ori != Const.LEVEL_DEBUG:
-                raise Exception("The parameters async_dump is true in tensor task,"
-                                " the parameters list cannot be empty.")
         if self.task == Const.STRUCTURE and self.level_ori not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             logger.warning_on_rank_0(
                 f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
                 f"If not, the default level is {Const.LEVEL_MIX}."
             )
             self.level_ori = Const.LEVEL_MIX
+        if self.async_dump:
+            if self.task == Const.TENSOR:
+                if self.level_ori == Const.LEVEL_DEBUG:
+                    self.list = []  # async_dump + debug level case ignore list
+                if not self.list and self.level_ori != Const.LEVEL_DEBUG:
+                    raise MsprobeException(
+                        MsprobeException.INVALID_PARAM_ERROR,
+                        "The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+                    )
+            is_unsupported_mode = self.summary_mode == Const.MD5 or \
+                isinstance(self.summary_mode, list) and Const.MD5 in self.summary_mode
+            if is_unsupported_mode:
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR,
+                    f"The parameters async_dump is true, the parameters summary_mode cannot be/contain md5."
+                )
         return True
 
     def check_config_with_l2(self, is_graph_config):
diff --git a/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
index ed5204c0d4..8e3bfbcaf8 100644
--- a/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
+++ b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
@@ -65,7 +65,7 @@ class DebuggerConfig:
         self.is_backward_kernel_dump = False
         self._check_and_adjust_config_with_l2()
 
-    def check_kwargs(self):
+    def check(self):
         if self.task and self.task not in Const.TASK_LIST:
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The task <{self.task}> is not in the {Const.TASK_LIST}.")
@@ -78,22 +78,26 @@ class DebuggerConfig:
         if not isinstance(self.async_dump, bool):
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The parameters async_dump should be bool.")
-        if self.async_dump and self.task == Const.TENSOR:
-            if self.level == Const.LEVEL_DEBUG:
-                self.list = []  # async_dump + debug level case ignore list
-            if not self.list and self.level != Const.LEVEL_DEBUG:
-                raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                                       f"The parameters async_dump is true in tensor task, the parameters list cannot be "
-                                       f"empty.")
         if self.task == Const.STRUCTURE and self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             logger.warning_on_rank_0(
                 f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
                 f"If not, the default level is {Const.LEVEL_MIX}."
             )
             self.level = Const.LEVEL_MIX
-
-    def check(self):
-        self.check_kwargs()
+        if self.async_dump:
+            if self.task == Const.TENSOR:
+                if self.level == Const.LEVEL_DEBUG:
+                    self.list = []  # async_dump + debug level case ignore list
+                if not self.list and self.level != Const.LEVEL_DEBUG:
+                    raise MsprobeException(
+                        MsprobeException.INVALID_PARAM_ERROR,
+                        f"The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+                    )
+            if self.summary_mode == Const.MD5:
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR,
+                    f"The parameters async_dump is true, the parameters summary_mode cannot be md5."
+                )
         return True
 
     def check_model(self, instance, start_model, token_range=None):
@@ -102,7 +106,7 @@ class DebuggerConfig:
         if token_range and not instance.model:
            error_info = "The 'model' parameter must be provided when token_range is not None"
            raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, error_info)
-
+
         if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX] and token_range is None:
             return
 
@@ -123,7 +127,7 @@ class DebuggerConfig:
                 break
         if error_model is not None:
             error_info = (f"The 'model' parameter must be a torch.nn.Module or list[torch.nn.Module] "
-                          f"type, currently there is an unsupported {type(error_model)} type.")
+                          f"type, currently there is an unsupported {type(error_model)} type.")
             raise MsprobeException(
                 MsprobeException.INVALID_PARAM_ERROR, error_info)
         else:
diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
index 1a7d04f9c6..d79fe48a75 100644
--- a/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
+++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
@@ -221,71 +221,6 @@ class TestDebuggerSave(unittest.TestCase):
         debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
         assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
 
-    @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
-    def test_async_save_tensor(self, _):
-        data = {"a": mindspore.Tensor([1., 2.])}
-        step = []
-        async_dump = True
-        mode = "tensor"
-        dump_path = os.path.join(test_dir, "debug_save")
-        config_file_path = os.path.join(test_dir, "config.json")
-        self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
-        debugger = PrecisionDebugger(config_file_path)
-        PrecisionDebugger.save(data, "data_dict", save_backward=False)
-        PrecisionDebugger.step()
-        # check npy file
-        npy_path = os.path.join(dump_path, "step0", "rank", "dump_tensor_data", "data_dict.0.debug.a.npy")
-        assert self.check_real_npy(npy_path, data["a"])
-        # check debug json
-        target_debug_info = {
-            "a": {
-                "type": "mindspore.Tensor",
-                "dtype": "Float32",
-                "shape": [
-                    2
-                ],
-                "data_name": "data_dict.0.debug.a.npy",
-                "Max": 2.0,
-                "Min": 1.0,
-                "Mean": 1.5,
-                "Norm": 2.2360680103302
-            }
-        }
-        debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
-        debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
-        assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
-    @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
-    def test_async_save_md5(self, _):
-        # async_dump case, md5 configuration not working,only save statistics
-        data = {"a": mindspore.Tensor([1., 2.])}
-        step = []
-        async_dump = True
-        mode = "md5"
-        dump_path = os.path.join(test_dir, "debug_save")
-        config_file_path = os.path.join(test_dir, "config.json")
-        self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
-        debugger = PrecisionDebugger(config_file_path)
-        PrecisionDebugger.save(data, "data_dict", save_backward=False)
-        PrecisionDebugger.step()
-        # check debug json
-        target_debug_info = {
-            "a": {
-                "type": "mindspore.Tensor",
-                "dtype": "Float32",
-                "shape": [
-                    2
-                ],
-                "Max": 2.0,
-                "Min": 1.0,
-                "Mean": 1.5,
-                "Norm": 2.2360680103302
-            }
-        }
-        debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
-        debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
-        assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
     @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
     def test_save_multiple_times(self, _):
         data = {"a": mindspore.Tensor([1., 2.])}
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
index 1a0a97b95e..f086c61c90 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
@@ -84,6 +84,7 @@ class TestDebuggerConfig(unittest.TestCase):
         self.common_config.task = Const.TENSOR
         self.common_config.level = Const.LEVEL_MIX
         self.task_config.list = []
+        self.task_config.summary_mode = Const.SUMMARY_MODE
         with self.assertRaises(MsprobeException) as context:
             DebuggerConfig(self.common_config, self.task_config, None, None, None)
         self.assertIn(f"the parameters list cannot be empty.", str(context.exception))
@@ -94,6 +95,15 @@ class TestDebuggerConfig(unittest.TestCase):
             config = DebuggerConfig(self.common_config, self.task_config, None, None, None)
         self.assertEqual(config.level, Const.LEVEL_MIX)
 
+    def test_check_async_dump_and_md5(self):
+        self.common_config.async_dump = True
+        self.common_config.task = Const.STATISTICS
+        self.common_config.level = Const.LEVEL_L1
+        self.task_config.summary_mode = Const.MD5
+        with self.assertRaises(MsprobeException) as context:
+            DebuggerConfig(self.common_config, self.task_config, None, None, None)
+        self.assertIn(f"the parameters summary_mode cannot be md5.", str(context.exception))
+
     def test_check_model_with_model_is_none(self):
         self.common_config.level = Const.LEVEL_L0
         instance = MagicMock()
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
index 8d2f4f3fbc..ea23502952 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
@@ -244,37 +244,6 @@ class TestDebuggerSave(unittest.TestCase):
         debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
         assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
 
-    def test_async_save_md5(self):
-        # async_dump case, md5 configuration not working,only save statistics
-        data = {"a": torch.Tensor([1., 2.])}
-        step = []
-        async_dump = True
-        mode = "md5"
-        dump_path = os.path.join(test_dir, "debug_save")
-        config_file_path = os.path.join(test_dir, "config.json")
-        self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
-        debugger = PrecisionDebugger(config_file_path)
-        PrecisionDebugger.save(data, "data_dict", save_backward=False)
-        PrecisionDebugger.step()
-        # check debug json
-        target_debug_info = {
-            "a": {
-                "type": "torch.Tensor",
-                "dtype": "torch.float32",
-                "shape": [
-                    2
-                ],
-                "Max": 2.0,
-                "Min": 1.0,
-                "Mean": 1.5,
-                "Norm": 2.2360680103302,
-                "requires_grad": False,
-            }
-        }
-        debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
-        debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
-        assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
     def test_save_multiple_times(self):
         data = {"a": torch.Tensor([1., 2.])}
         step = []
-- 
Gitee
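Taken together with the removed test_async_save_md5 cases above (which assumed that md5 silently degraded to plain statistics under async_dump), the new checks in both debugger_config.py files make the invalid combination fail fast instead. A hypothetical config that is now rejected, shown for illustration only (field names follow the docs changes earlier in this patch):

# Illustrative only: async_dump combined with an md5 summary_mode.
bad_config = {
    "task": "statistics",
    "dump_path": "./dump_output",
    "level": "L1",
    "async_dump": True,
    "statistics": {
        "summary_mode": "md5"
    }
}
# Feeding a config like this to PrecisionDebugger (as the deleted tests did via
# write_config_json) now raises MsprobeException with a message along the lines of
# "The parameters async_dump is true, the parameters summary_mode cannot be md5."
# (the MindSpore variant reads "cannot be/contain md5").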