From e84ca6bc7390efefbb1be36a66f008370e7a86f4 Mon Sep 17 00:00:00 2001
From: curry3 <485078529@qq.com>
Date: Sat, 12 Jul 2025 16:10:22 +0800
Subject: [PATCH] [bugfix] Fix inaccurate data collected by dump
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../data_processor/mindspore_processor.py     |  4 +-
 .../data_processor/pytorch_processor.py       |  4 +-
 .../msprobe/docs/02.config_introduction.md    |  2 +-
 .../docs/28.debugger_save_instruction.md      |  2 +-
 .../mindspore/debugger/debugger_config.py     | 24 +++++--
 .../pytorch/debugger/debugger_config.py       | 30 +++++----
 .../save/test_debugger_save_mindspore.py      | 65 -------------------
 .../debugger/test_pt_debugger_config.py       | 10 +++
 .../test_debugger_save_pytorch.py             | 31 ---------
 9 files changed, 50 insertions(+), 122 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
index c33ca941ab..f5c2b89a1b 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
@@ -73,7 +73,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
             tensor_stat.max = np.max(data_np).item()
             tensor_stat.min = np.min(data_np).item()
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.copy()
         elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             data_abs = np.abs(data.asnumpy())
             tensor_stat.max = np.max(data_abs).item()
@@ -97,7 +97,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
             tensor_stat.max = mint.any(data)
             tensor_stat.min = mint.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.copy()
         elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             logger.warning("Async dump do not support complex data!")
             return tensor_stat
diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
index cc170d32ed..a414175498 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
@@ -104,7 +104,7 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.max = torch.any(data)
             tensor_stat.min = torch.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.clone()
         else:
             if data.dtype == torch.float64 or not data.is_floating_point():
                 data = data.float()
@@ -127,7 +127,7 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.max = torch.any(data)
             tensor_stat.min = torch.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.clone()
         else:
             if data.dtype == torch.float64 or not data.is_floating_point():
                 data = data.float()
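The two processor changes above stop storing a bare reference to a zero-dimensional tensor in the stat fields and store a copy/clone instead. The sketch below is a minimal illustration of the likely failure mode, not msprobe code (function names are invented): with async_dump the statistics are flushed only after the step finishes, so a stat that still references the live scalar can be mutated in place before it is written, while a snapshot taken at collection time stays accurate. The MindSpore branch applies the same idea with data.copy() instead of clone().

import torch

def record_stat_by_reference(scalar: torch.Tensor) -> torch.Tensor:
    # Pre-fix behaviour: keep a reference to the live tensor.
    return scalar

def record_stat_by_clone(scalar: torch.Tensor) -> torch.Tensor:
    # Post-fix behaviour: snapshot the value at collection time.
    return scalar.clone()

x = torch.tensor(1.0)          # 0-d tensor, i.e. `not data.shape` is True
by_ref = record_stat_by_reference(x)
by_clone = record_stat_by_clone(x)

x.add_(41.0)                   # the training step keeps mutating the tensor in place

# By the time an asynchronous writer flushes at the end of the step,
# the referenced value has drifted while the cloned one has not.
print(by_ref.item())           # 42.0 -> wrong statistic
print(by_clone.item())         # 1.0  -> value at collection time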
diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index 84f16429db..d388d697fa 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -18,7 +18,7 @@
 | step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。<br>**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 |
 | level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:<br>"L0":dump 模块级精度数据,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明)。<br>"L1":dump API 级精度数据,默认值,仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持。<br>"L2":dump kernel 级精度数据,PyTorch 场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);MindSpore 动态图场景详细介绍见 [MindSpore 动态图场景的 kernel dump 说明](./28.kernel_dump_MindSpore.md);MindSpore 静态图场景详细介绍见《MindSpore 场景的数据采集》中的 ["**8.1 静态图场景**"](./06.data_dump_MindSpore.md#81-静态图场景)小节。<br>"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持。<br>"debug":单点保存功能,详见[单点保存工具](./28.debugger_save_instruction.md)。<br>**配置示例**:"level": "L1"。 | 否 |
 | enable_dataloader | 自动控制开关,bool 类型,仅 PyTorch 场景支持。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后自动识别 step 参数指定的迭代,并在该迭代执行完成后退出训练,此时 start、stop 和 step 函数可不配置,开启该开关要求训练脚本是通过 torch.utils.data.dataloader 方式加载数据。仅支持 PyTorch 单卡训练使用,分布式训练场景下存在数据 dump 不全问题。 **这个特性下个版本将被废弃** | 否 |
-| async_dump | 异步 dump 开关,bool 类型, 支持 task 为 tensor 或 statistic 模式, level 支持 L0、 L1、 mix、 debug 模式。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式暂不支持复数类型 tensor 的统计量计算。<br> | 否 |
+| async_dump | 异步 dump 开关,bool 类型, 支持 task 为 tensor 或 statistic 模式, level 支持 L0、 L1、 mix、 debug 模式。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式下,summary_mode 不支持 md5 值,也不支持复数类型 tensor 的统计量计算。<br> | 否 |
 
 #### 1.1.1 模块级精度数据 dump 说明
 
diff --git a/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md b/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
index 3b85320fa3..7f00aab045 100644
--- a/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
+++ b/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
@@ -27,7 +27,7 @@ L0, L1, mix级别的dump能力存在盲区,网络中的非API或module的输
 | dump_path | 设置 dump 数据目录路径,str 类型。 | 是 |
 | rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型。 | 否 |
 | step | 指定采集某个 Step 的数据,list[Union[int, str]] 类型。 | 否 |
-| async_dump | 异步 dump 开关,bool 类型。 | 否 |
+| async_dump | 异步 dump 开关,bool 类型。该模式下,summary_mode 不支持 md5 值,也不支持复数类型 tensor 的统计量计算。 | 否 |
 
 "statistics" 任务子配置项:
 | 参数 | 解释 | 是否必选 |
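For reference, a config.json enabling the asynchronous statistics dump described in the two documents above might be written as in the sketch below. The top-level field names follow 02.config_introduction.md; the "statistics" sub-keys, paths and empty lists are illustrative assumptions rather than a definitive template.

import json

# Illustrative async statistics dump configuration (summary_mode deliberately not "md5").
config = {
    "task": "statistics",
    "dump_path": "./dump_output",
    "rank": [],
    "step": [],
    "level": "L1",
    "async_dump": True,
    "statistics": {
        "scope": [],
        "list": [],
        "summary_mode": "statistics"
    }
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=4)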
diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
index 90b84c9002..e57d8d6f38 100644
--- a/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
+++ b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
@@ -49,7 +49,7 @@ class DebuggerConfig:
         self.summary_mode = task_config.summary_mode
         self.stat_cal_mode = task_config.stat_cal_mode if hasattr(task_config, 'stat_cal_mode') else None
         self.device_stat_precision_mode = task_config.device_stat_precision_mode \
-            if hasattr(task_config, 'device_stat_precision_mode') else None
+            if hasattr(task_config, 'device_stat_precision_mode') else None
         self.async_dump = common_config.async_dump if common_config.async_dump else False
         self.check()
         self._check_statistics_config(task_config)
@@ -115,18 +115,28 @@ class DebuggerConfig:
             self.check_mode = "all"
         if not isinstance(self.async_dump, bool):
             raise Exception("The parameters async_dump should be bool.")
-        if self.async_dump and self.task == Const.TENSOR:
-            if self.level_ori == Const.LEVEL_DEBUG:
-                self.list = []  # async_dump + debug level case ignore list
-            if not self.list and self.level_ori != Const.LEVEL_DEBUG:
-                raise Exception("The parameters async_dump is true in tensor task,"
-                                " the parameters list cannot be empty.")
         if self.task == Const.STRUCTURE and self.level_ori not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             logger.warning_on_rank_0(
                 f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
                 f"If not, the default level is {Const.LEVEL_MIX}."
             )
             self.level_ori = Const.LEVEL_MIX
+        if self.async_dump:
+            if self.task == Const.TENSOR:
+                if self.level_ori == Const.LEVEL_DEBUG:
+                    self.list = []  # async_dump + debug level case ignore list
+                if not self.list and self.level_ori != Const.LEVEL_DEBUG:
+                    raise MsprobeException(
+                        MsprobeException.INVALID_PARAM_ERROR,
+                        "The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+                    )
+            is_unsupported_mode = self.summary_mode == Const.MD5 or \
+                isinstance(self.summary_mode, list) and Const.MD5 in self.summary_mode
+            if is_unsupported_mode:
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR,
+                    f"The parameters async_dump is true, the parameters summary_mode cannot be/contain md5."
+                )
         return True
 
     def check_config_with_l2(self, is_graph_config):
diff --git a/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
index ed5204c0d4..8e3bfbcaf8 100644
--- a/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
+++ b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
@@ -65,7 +65,7 @@ class DebuggerConfig:
         self.is_backward_kernel_dump = False
         self._check_and_adjust_config_with_l2()
 
-    def check_kwargs(self):
+    def check(self):
         if self.task and self.task not in Const.TASK_LIST:
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The task <{self.task}> is not in the {Const.TASK_LIST}.")
@@ -78,22 +78,26 @@ class DebuggerConfig:
         if not isinstance(self.async_dump, bool):
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The parameters async_dump should be bool.")
-        if self.async_dump and self.task == Const.TENSOR:
-            if self.level == Const.LEVEL_DEBUG:
-                self.list = []  # async_dump + debug level case ignore list
-            if not self.list and self.level != Const.LEVEL_DEBUG:
-                raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                                       f"The parameters async_dump is true in tensor task, the parameters list cannot be "
-                                       f"empty.")
         if self.task == Const.STRUCTURE and self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             logger.warning_on_rank_0(
                 f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
                 f"If not, the default level is {Const.LEVEL_MIX}."
             )
             self.level = Const.LEVEL_MIX
-
-    def check(self):
-        self.check_kwargs()
+        if self.async_dump:
+            if self.task == Const.TENSOR:
+                if self.level == Const.LEVEL_DEBUG:
+                    self.list = []  # async_dump + debug level case ignore list
+                if not self.list and self.level != Const.LEVEL_DEBUG:
+                    raise MsprobeException(
+                        MsprobeException.INVALID_PARAM_ERROR,
+                        f"The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+                    )
+            if self.summary_mode == Const.MD5:
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR,
+                    f"The parameters async_dump is true, the parameters summary_mode cannot be md5."
+                )
         return True
 
     def check_model(self, instance, start_model, token_range=None):
@@ -102,7 +106,7 @@ class DebuggerConfig:
         if token_range and not instance.model:
            error_info = "The 'model' parameter must be provided when token_range is not None"
            raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, error_info)
-
+
         if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX] and token_range is None:
             return
 
@@ -123,7 +127,7 @@ class DebuggerConfig:
                 break
         if error_model is not None:
             error_info = (f"The 'model' parameter must be a torch.nn.Module or list[torch.nn.Module] "
-                          f"type, currently there is an unsupported {type(error_model)} type.")
+                          f"type, currently there is an unsupported {type(error_model)} type.")
             raise MsprobeException(
                 MsprobeException.INVALID_PARAM_ERROR, error_info)
         else:
diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
index 1a7d04f9c6..d79fe48a75 100644
--- a/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
+++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
@@ -221,71 +221,6 @@ class TestDebuggerSave(unittest.TestCase):
         debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
         assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
 
-    @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
-    def test_async_save_tensor(self, _):
-        data = {"a": mindspore.Tensor([1., 2.])}
-        step = []
-        async_dump = True
-        mode = "tensor"
-        dump_path = os.path.join(test_dir, "debug_save")
-        config_file_path = os.path.join(test_dir, "config.json")
-        self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
-        debugger = PrecisionDebugger(config_file_path)
-        PrecisionDebugger.save(data, "data_dict", save_backward=False)
-        PrecisionDebugger.step()
-        # check npy file
-        npy_path = os.path.join(dump_path, "step0", "rank", "dump_tensor_data", "data_dict.0.debug.a.npy")
-        assert self.check_real_npy(npy_path, data["a"])
-        # check debug json
-        target_debug_info = {
-            "a": {
-                "type": "mindspore.Tensor",
-                "dtype": "Float32",
-                "shape": [
-                    2
-                ],
-                "data_name": "data_dict.0.debug.a.npy",
-                "Max": 2.0,
-                "Min": 1.0,
-                "Mean": 1.5,
-                "Norm": 2.2360680103302
-            }
-        }
-        debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
-        debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
-        assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
-    @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
-    def test_async_save_md5(self, _):
-        # async_dump case, md5 configuration not working,only save statistics
-        data = {"a": mindspore.Tensor([1., 2.])}
-        step = []
-        async_dump = True
-        mode = "md5"
-        dump_path = os.path.join(test_dir, "debug_save")
-        config_file_path = os.path.join(test_dir, "config.json")
-        self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
-        debugger = PrecisionDebugger(config_file_path)
-        PrecisionDebugger.save(data, "data_dict", save_backward=False)
-        PrecisionDebugger.step()
-        # check debug json
-        target_debug_info = {
-            "a": {
-                "type": "mindspore.Tensor",
-                "dtype": "Float32",
-                "shape": [
-                    2
-                ],
-                "Max": 2.0,
-                "Min": 1.0,
-                "Mean": 1.5,
-                "Norm": 2.2360680103302
-            }
-        }
-        debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
-        debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
-        assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
     @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
     def test_save_multiple_times(self, _):
         data = {"a": mindspore.Tensor([1., 2.])}
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
index 1a0a97b95e..f086c61c90 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
@@ -84,6 +84,7 @@ class TestDebuggerConfig(unittest.TestCase):
         self.common_config.task = Const.TENSOR
         self.common_config.level = Const.LEVEL_MIX
         self.task_config.list = []
+        self.task_config.summary_mode = Const.SUMMARY_MODE
         with self.assertRaises(MsprobeException) as context:
             DebuggerConfig(self.common_config, self.task_config, None, None, None)
         self.assertIn(f"the parameters list cannot be empty.", str(context.exception))
@@ -94,6 +95,15 @@ class TestDebuggerConfig(unittest.TestCase):
             config = DebuggerConfig(self.common_config, self.task_config, None, None, None)
         self.assertEqual(config.level, Const.LEVEL_MIX)
 
+    def test_check_async_dump_and_md5(self):
+        self.common_config.async_dump = True
+        self.common_config.task = Const.STATISTICS
+        self.common_config.level = Const.LEVEL_L1
+        self.task_config.summary_mode = Const.MD5
+        with self.assertRaises(MsprobeException) as context:
+            DebuggerConfig(self.common_config, self.task_config, None, None, None)
+        self.assertIn(f"the parameters summary_mode cannot be md5.", str(context.exception))
+
     def test_check_model_with_model_is_none(self):
         self.common_config.level = Const.LEVEL_L0
         instance = MagicMock()
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
index 8d2f4f3fbc..ea23502952 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
@@ -244,37 +244,6 @@ class TestDebuggerSave(unittest.TestCase):
         debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
         assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
 
-    def test_async_save_md5(self):
-        # async_dump case, md5 configuration not working,only save statistics
-        data = {"a": torch.Tensor([1., 2.])}
-        step = []
-        async_dump = True
-        mode = "md5"
-        dump_path = os.path.join(test_dir, "debug_save")
-        config_file_path = os.path.join(test_dir, "config.json")
-        self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
-        debugger = PrecisionDebugger(config_file_path)
-        PrecisionDebugger.save(data, "data_dict", save_backward=False)
-        PrecisionDebugger.step()
-        # check debug json
-        target_debug_info = {
-            "a": {
-                "type": "torch.Tensor",
-                "dtype": "torch.float32",
-                "shape": [
-                    2
-                ],
-                "Max": 2.0,
-                "Min": 1.0,
-                "Mean": 1.5,
-                "Norm": 2.2360680103302,
-                "requires_grad": False,
-            }
-        }
-        debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
-        debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
-        assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
     def test_save_multiple_times(self):
         data = {"a": torch.Tensor([1., 2.])}
         step = []
-- 
Gitee
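Taken together with the removed test_async_save_md5 cases above (which assumed that md5 silently degraded to plain statistics under async_dump), the new checks in both debugger_config.py files make the invalid combination fail fast instead. A hypothetical config that is now rejected, shown for illustration only (field names follow the docs changes earlier in this patch):

# Illustrative only: async_dump combined with an md5 summary_mode.
bad_config = {
    "task": "statistics",
    "dump_path": "./dump_output",
    "level": "L1",
    "async_dump": True,
    "statistics": {
        "summary_mode": "md5"
    }
}
# Feeding a config like this to PrecisionDebugger (as the deleted tests did via
# write_config_json) now raises MsprobeException with a message along the lines of
# "The parameters async_dump is true, the parameters summary_mode cannot be md5."
# (the MindSpore variant reads "cannot be/contain md5").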