diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
index c33ca941abb7954c7dc3066598bffba60ff4cdb9..f5c2b89a1b49e881e601e895bdfcdb9e59a51acf 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
@@ -73,7 +73,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
tensor_stat.max = np.max(data_np).item()
tensor_stat.min = np.min(data_np).item()
elif not data.shape:
- tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+ tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.copy()
elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
data_abs = np.abs(data.asnumpy())
tensor_stat.max = np.max(data_abs).item()
@@ -97,7 +97,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
tensor_stat.max = mint.any(data)
tensor_stat.min = mint.all(data)
elif not data.shape:
- tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+ tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.copy()
elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
logger.warning("Async dump do not support complex data!")
return tensor_stat
diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
index cc170d32ed0393250246fea7a92e52059745ba1a..a4141754981be151fca995bfc91955530bc8d933 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
@@ -104,7 +104,7 @@ class PytorchDataProcessor(BaseDataProcessor):
tensor_stat.max = torch.any(data)
tensor_stat.min = torch.all(data)
elif not data.shape:
- tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+ tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.clone()
else:
if data.dtype == torch.float64 or not data.is_floating_point():
data = data.float()
@@ -127,7 +127,7 @@ class PytorchDataProcessor(BaseDataProcessor):
tensor_stat.max = torch.any(data)
tensor_stat.min = torch.all(data)
elif not data.shape:
- tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+ tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.clone()
else:
if data.dtype == torch.float64 or not data.is_floating_point():
data = data.float()
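
A minimal sketch (not msprobe code) of why the scalar branch now stores a snapshot: with asynchronous dump the recorded statistics are only flushed at step end, so keeping a bare reference to the live 0-dim tensor would pick up any later in-place updates. `Tensor.copy()` in MindSpore and `Tensor.clone()` in PyTorch decouple the recorded value; the helper name below is hypothetical.

```python
# Illustrative sketch only: a stats recorder that stays correct under async dump.
# `record_scalar_stat` is a hypothetical helper, not part of msprobe.
import torch

stats = {}

def record_scalar_stat(name, data):
    # Snapshot the 0-dim tensor; a plain reference would still alias the
    # training tensor and could change before the step-end flush.
    stats[name] = data.detach().clone()

x = torch.tensor(1.0)
record_scalar_stat("loss_scale", x)
x.add_(1.0)                      # training mutates the original in place
print(stats["loss_scale"])       # tensor(1.) -- unaffected by the later update
```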
diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index 84f16429db347ae8cd5ca1a6879e2011ce733693..d388d697fada86ae8378fb4b277e414b90c665bf 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -18,7 +18,7 @@
| step | Specifies which steps to collect data for, list[Union[int, str]] type. Not configured by default, which means data is collected for every step. When collecting specific steps, they must be steps that actually occur in the training script; they can be configured one by one or given as a range.
**Configuration example**: "step": [0, 1, 2, "4-6"]. | No |
| level | Dump level, str type; different levels collect different data. Optional values:
"L0": dump module-level precision data; see [1.1.1 Module-level precision data dump](#111-模块级精度数据-dump-说明) for background.
"L1": dump API-level precision data. This is the default and is supported only in PyTorch, MSAdapter, and MindSpore dynamic-graph scenarios.
"L2": dump kernel-level precision data. For PyTorch, see [kernel dump for PyTorch](./04.kernel_dump_PyTorch.md); for MindSpore dynamic graph, see [kernel dump for MindSpore dynamic graph](./28.kernel_dump_MindSpore.md); for MindSpore static graph, see section ["**8.1 Static-graph scenario**"](./06.data_dump_MindSpore.md#81-静态图场景) of "Data Collection for MindSpore".
"mix": dump both module-level and API-level precision data, i.e. "L0" + "L1"; supported only in PyTorch, MSAdapter, and MindSpore dynamic-graph scenarios.
"debug": single-point save, see the [single-point save tool](./28.debugger_save_instruction.md).
**Configuration example**: "level": "L1". | No |
| enable_dataloader | Automatic control switch, bool type, supported only in the PyTorch scenario. Optional values are true (enabled) or false (disabled); the default is false. When set to true, the iterations specified by the step parameter are recognized automatically and training exits after those iterations complete, so the start, stop, and step functions need not be called. Enabling this switch requires the training script to load data through torch.utils.data.dataloader. Only single-card PyTorch training is supported; in distributed training the dumped data may be incomplete. **This feature will be deprecated in the next release.** | No |
-| async_dump | Asynchronous dump switch, bool type. Supported when task is tensor or statistics and level is L0, L1, mix, or debug. Optional values are true (enabled) or false (disabled); the default is false. When set to true, asynchronous dump is enabled: the collected precision data is written to disk together after the current training step ends, and the tool triggers no synchronization during training. Because this mode carries a risk of **device memory overflow**, when task is set to tensor (asynchronous dump of real data) the [list](#13-task-配置为-tensor) parameter must be configured to specify the tensors to dump. This mode does not yet support statistics computation for complex-type tensors.
| No |
+| async_dump | Asynchronous dump switch, bool type. Supported when task is tensor or statistics and level is L0, L1, mix, or debug. Optional values are true (enabled) or false (disabled); the default is false. When set to true, asynchronous dump is enabled: the collected precision data is written to disk together after the current training step ends, and the tool triggers no synchronization during training. Because this mode carries a risk of **device memory overflow**, when task is set to tensor (asynchronous dump of real data) the [list](#13-task-配置为-tensor) parameter must be configured to specify the tensors to dump. In this mode, summary_mode does not support the md5 value, and statistics computation for complex-type tensors is not supported either.
| No |
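
For reference, a hedged example of what a config.json enabling asynchronous real-data dump might look like; the field layout follows the task sub-configuration tables in this document, while the dump path and the API name in `list` are placeholders. With `"task": "statistics"` instead, `summary_mode` must not be md5 when `async_dump` is true.

```python
# Hypothetical config.json for async tensor dump; values are placeholders.
import json

config = {
    "task": "tensor",
    "dump_path": "./msprobe_dump",      # placeholder output directory
    "level": "L1",
    "async_dump": True,                 # data is flushed after the step ends
    "tensor": {
        # async_dump with the tensor task requires a non-empty list
        "list": ["Functional.linear.0.forward"]
    }
}

with open("config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)
```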
#### 1.1.1 Module-level precision data dump
diff --git a/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md b/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
index 3b85320fa35f7ce151889e14f9d2b934b725ffa5..7f00aab045565b9df7f3de9add60cb85fb16a4f2 100644
--- a/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
+++ b/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
@@ -27,7 +27,7 @@ L0, L1, mix级别的dump能力存在盲区,网络中的非API或module的输
| dump_path | Sets the directory path for dump data, str type. | Yes |
| rank | Specifies which card (rank) to collect data from, list[Union[int, str]] type. | No |
| step | Specifies which step's data to collect, list[Union[int, str]] type. | No |
-| async_dump | Asynchronous dump switch, bool type. | No |
+| async_dump | Asynchronous dump switch, bool type. In this mode, summary_mode does not support the md5 value, and statistics computation for complex-type tensors is not supported either. | No |
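
A short usage sketch of the single-point save flow with async_dump enabled, mirroring the save/step pattern used in the unit tests later in this change. The config path is a placeholder, and the import shown assumes the PyTorch entry point (use the MindSpore entry point in MindSpore scripts).

```python
# Usage sketch with placeholder paths; config.json sets level "debug" and async_dump true.
import torch
from msprobe.pytorch import PrecisionDebugger

debugger = PrecisionDebugger("./config.json")
data = {"a": torch.tensor([1.0, 2.0])}
PrecisionDebugger.save(data, "data_dict", save_backward=False)
PrecisionDebugger.step()   # asynchronously collected data is flushed when the step ends
```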
"statistics" 任务子配置项:
| 参数 | 解释 | 是否必选 |
diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
index 90b84c90026da770798e4425f104b6d759f58943..be6a6da41cd775532e5e7610668fdf07a8d7ea3e 100644
--- a/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
+++ b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
@@ -49,7 +49,7 @@ class DebuggerConfig:
self.summary_mode = task_config.summary_mode
self.stat_cal_mode = task_config.stat_cal_mode if hasattr(task_config, 'stat_cal_mode') else None
self.device_stat_precision_mode = task_config.device_stat_precision_mode \
- if hasattr(task_config, 'device_stat_precision_mode') else None
+ if hasattr(task_config, 'device_stat_precision_mode') else None
self.async_dump = common_config.async_dump if common_config.async_dump else False
self.check()
self._check_statistics_config(task_config)
@@ -115,18 +115,28 @@ class DebuggerConfig:
self.check_mode = "all"
if not isinstance(self.async_dump, bool):
raise Exception("The parameters async_dump should be bool.")
- if self.async_dump and self.task == Const.TENSOR:
- if self.level_ori == Const.LEVEL_DEBUG:
- self.list = [] # async_dump + debug level case ignore list
- if not self.list and self.level_ori != Const.LEVEL_DEBUG:
- raise Exception("The parameters async_dump is true in tensor task,"
- " the parameters list cannot be empty.")
if self.task == Const.STRUCTURE and self.level_ori not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
logger.warning_on_rank_0(
f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
f"If not, the default level is {Const.LEVEL_MIX}."
)
self.level_ori = Const.LEVEL_MIX
+ if self.async_dump:
+ if self.task == Const.TENSOR:
+ if self.level_ori == Const.LEVEL_DEBUG:
+ self.list = [] # async_dump + debug level case ignore list
+ if not self.list and self.level_ori != Const.LEVEL_DEBUG:
+ raise MsprobeException(
+ MsprobeException.INVALID_PARAM_ERROR,
+ "The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+ )
+ is_unsupported_mode = isinstance(self.summary_mode, str) and self.summary_mode == Const.MD5 or \
+ isinstance(self.summary_mode, list) and Const.MD5 in self.summary_mode
+ if is_unsupported_mode:
+ raise MsprobeException(
+ MsprobeException.INVALID_PARAM_ERROR,
+ f"The parameters async_dump is true, the parameters summary_mode cannot be/contain md5."
+ )
return True
def check_config_with_l2(self, is_graph_config):
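
A standalone sketch of the md5 detection in the `check()` method above: in the MindSpore statistics task, `summary_mode` may be either a single string or a list of modes, so both forms are inspected. The literal value assumes `Const.MD5 == "md5"`.

```python
# Standalone sketch of the check above; assumes Const.MD5 == "md5".
MD5 = "md5"

def summary_mode_uses_md5(summary_mode):
    # True when summary_mode is the md5 string, or a list containing it.
    return (isinstance(summary_mode, str) and summary_mode == MD5) or \
           (isinstance(summary_mode, list) and MD5 in summary_mode)

assert summary_mode_uses_md5("md5")
assert summary_mode_uses_md5(["max", "min", "md5"])
assert not summary_mode_uses_md5(["max", "min"])
```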
diff --git a/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
index ed5204c0d484b2d5165f5ab35799959dae538683..8e3bfbcaf83e3a17a31b6a9484fbf983802b765e 100644
--- a/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
+++ b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
@@ -65,7 +65,7 @@ class DebuggerConfig:
self.is_backward_kernel_dump = False
self._check_and_adjust_config_with_l2()
- def check_kwargs(self):
+ def check(self):
if self.task and self.task not in Const.TASK_LIST:
raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
f"The task <{self.task}> is not in the {Const.TASK_LIST}.")
@@ -78,22 +78,26 @@ class DebuggerConfig:
if not isinstance(self.async_dump, bool):
raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
f"The parameters async_dump should be bool.")
- if self.async_dump and self.task == Const.TENSOR:
- if self.level == Const.LEVEL_DEBUG:
- self.list = [] # async_dump + debug level case ignore list
- if not self.list and self.level != Const.LEVEL_DEBUG:
- raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
- f"The parameters async_dump is true in tensor task, the parameters list cannot be "
- f"empty.")
if self.task == Const.STRUCTURE and self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
logger.warning_on_rank_0(
f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
f"If not, the default level is {Const.LEVEL_MIX}."
)
self.level = Const.LEVEL_MIX
-
- def check(self):
- self.check_kwargs()
+ if self.async_dump:
+ if self.task == Const.TENSOR:
+ if self.level == Const.LEVEL_DEBUG:
+ self.list = [] # async_dump + debug level case ignore list
+ if not self.list and self.level != Const.LEVEL_DEBUG:
+ raise MsprobeException(
+ MsprobeException.INVALID_PARAM_ERROR,
+ f"The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+ )
+ if self.summary_mode == Const.MD5:
+ raise MsprobeException(
+ MsprobeException.INVALID_PARAM_ERROR,
+ f"The parameters async_dump is true, the parameters summary_mode cannot be md5."
+ )
return True
def check_model(self, instance, start_model, token_range=None):
@@ -102,7 +106,7 @@ class DebuggerConfig:
if token_range and not instance.model:
error_info = "The 'model' parameter must be provided when token_range is not None"
raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, error_info)
-
+
if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX] and token_range is None:
return
@@ -123,7 +127,7 @@ class DebuggerConfig:
break
if error_model is not None:
error_info = (f"The 'model' parameter must be a torch.nn.Module or list[torch.nn.Module] "
- f"type, currently there is an unsupported {type(error_model)} type.")
+ f"type, currently there is an unsupported {type(error_model)} type.")
raise MsprobeException(
MsprobeException.INVALID_PARAM_ERROR, error_info)
else:
diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
index 1a7d04f9c6b316ef3eb9eb2336cb293191932c45..d79fe48a75c65e6156efdcdfce627d85e98c77ff 100644
--- a/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
+++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
@@ -221,71 +221,6 @@ class TestDebuggerSave(unittest.TestCase):
debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
- @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
- def test_async_save_tensor(self, _):
- data = {"a": mindspore.Tensor([1., 2.])}
- step = []
- async_dump = True
- mode = "tensor"
- dump_path = os.path.join(test_dir, "debug_save")
- config_file_path = os.path.join(test_dir, "config.json")
- self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
- debugger = PrecisionDebugger(config_file_path)
- PrecisionDebugger.save(data, "data_dict", save_backward=False)
- PrecisionDebugger.step()
- # check npy file
- npy_path = os.path.join(dump_path, "step0", "rank", "dump_tensor_data", "data_dict.0.debug.a.npy")
- assert self.check_real_npy(npy_path, data["a"])
- # check debug json
- target_debug_info = {
- "a": {
- "type": "mindspore.Tensor",
- "dtype": "Float32",
- "shape": [
- 2
- ],
- "data_name": "data_dict.0.debug.a.npy",
- "Max": 2.0,
- "Min": 1.0,
- "Mean": 1.5,
- "Norm": 2.2360680103302
- }
- }
- debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
- debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
- assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
- @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
- def test_async_save_md5(self, _):
- # async_dump case, md5 configuration not working,only save statistics
- data = {"a": mindspore.Tensor([1., 2.])}
- step = []
- async_dump = True
- mode = "md5"
- dump_path = os.path.join(test_dir, "debug_save")
- config_file_path = os.path.join(test_dir, "config.json")
- self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
- debugger = PrecisionDebugger(config_file_path)
- PrecisionDebugger.save(data, "data_dict", save_backward=False)
- PrecisionDebugger.step()
- # check debug json
- target_debug_info = {
- "a": {
- "type": "mindspore.Tensor",
- "dtype": "Float32",
- "shape": [
- 2
- ],
- "Max": 2.0,
- "Min": 1.0,
- "Mean": 1.5,
- "Norm": 2.2360680103302
- }
- }
- debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
- debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
- assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
@patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
def test_save_multiple_times(self, _):
data = {"a": mindspore.Tensor([1., 2.])}
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
index 1a0a97b95ec53ceb062987aa66ac7d78a53c8861..89dc690e27cf06cf976626c12965abc998879b6d 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
@@ -84,6 +84,7 @@ class TestDebuggerConfig(unittest.TestCase):
self.common_config.task = Const.TENSOR
self.common_config.level = Const.LEVEL_MIX
self.task_config.list = []
+ self.task_config.summary_mode = Const.SUMMARY_MODE
with self.assertRaises(MsprobeException) as context:
DebuggerConfig(self.common_config, self.task_config, None, None, None)
self.assertIn(f"the parameters list cannot be empty.", str(context.exception))
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
index 8d2f4f3fbcf571e7a61fcef1f6a01118cb2db6df..ea2350295206fb475e106f03e13afdeeba25289c 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
@@ -244,37 +244,6 @@ class TestDebuggerSave(unittest.TestCase):
debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
- def test_async_save_md5(self):
- # async_dump case, md5 configuration not working,only save statistics
- data = {"a": torch.Tensor([1., 2.])}
- step = []
- async_dump = True
- mode = "md5"
- dump_path = os.path.join(test_dir, "debug_save")
- config_file_path = os.path.join(test_dir, "config.json")
- self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
- debugger = PrecisionDebugger(config_file_path)
- PrecisionDebugger.save(data, "data_dict", save_backward=False)
- PrecisionDebugger.step()
- # check debug json
- target_debug_info = {
- "a": {
- "type": "torch.Tensor",
- "dtype": "torch.float32",
- "shape": [
- 2
- ],
- "Max": 2.0,
- "Min": 1.0,
- "Mean": 1.5,
- "Norm": 2.2360680103302,
- "requires_grad": False,
- }
- }
- debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
- debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
- assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
def test_save_multiple_times(self):
data = {"a": torch.Tensor([1., 2.])}
step = []