diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
index c33ca941abb7954c7dc3066598bffba60ff4cdb9..f5c2b89a1b49e881e601e895bdfcdb9e59a51acf 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py
@@ -73,7 +73,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
             tensor_stat.max = np.max(data_np).item()
             tensor_stat.min = np.min(data_np).item()
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.copy()
         elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             data_abs = np.abs(data.asnumpy())
             tensor_stat.max = np.max(data_abs).item()
@@ -97,7 +97,7 @@ class MindsporeDataProcessor(BaseDataProcessor):
             tensor_stat.max = mint.any(data)
             tensor_stat.min = mint.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.copy()
         elif data.dtype == ms.complex64 or data.dtype == ms.complex128:
             logger.warning("Async dump do not support complex data!")
             return tensor_stat
diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
index cc170d32ed0393250246fea7a92e52059745ba1a..a4141754981be151fca995bfc91955530bc8d933 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
@@ -104,7 +104,7 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.max = torch.any(data)
             tensor_stat.min = torch.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.clone()
         else:
             if data.dtype == torch.float64 or not data.is_floating_point():
                 data = data.float()
@@ -127,7 +127,7 @@ class PytorchDataProcessor(BaseDataProcessor):
             tensor_stat.max = torch.any(data)
             tensor_stat.min = torch.all(data)
         elif not data.shape:
-            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data
+            tensor_stat.max = tensor_stat.min = tensor_stat.mean = tensor_stat.norm = data.clone()
         else:
             if data.dtype == torch.float64 or not data.is_floating_point():
                 data = data.float()
diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index 84f16429db347ae8cd5ca1a6879e2011ce733693..d388d697fada86ae8378fb4b277e414b90c665bf 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -18,7 +18,7 @@
 | step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。<br>**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 |
 | level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:<br>"L0":dump 模块级精度数据,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明)。<br>"L1":dump API 级精度数据,默认值,仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持。<br>"L2":dump kernel 级精度数据,PyTorch 场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);MindSpore 动态图场景详细介绍见 [MindSpore 动态图场景的 kernel dump 说明](./28.kernel_dump_MindSpore.md);MindSpore 静态图场景详细介绍见《MindSpore 场景的数据采集》中的 ["**8.1 静态图场景**"](./06.data_dump_MindSpore.md#81-静态图场景)小节。<br>"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持。<br>"debug":单点保存功能,详见[单点保存工具](./28.debugger_save_instruction.md)。<br>**配置示例**:"level": "L1"。 | 否 |
 | enable_dataloader | 自动控制开关,bool 类型,仅 PyTorch 场景支持。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后自动识别 step 参数指定的迭代,并在该迭代执行完成后退出训练,此时 start、stop 和 step 函数可不配置,开启该开关要求训练脚本是通过 torch.utils.data.dataloader 方式加载数据。仅支持 PyTorch 单卡训练使用,分布式训练场景下存在数据 dump 不全问题。 **这个特性下个版本将被废弃** | 否 |
-| async_dump | 异步 dump 开关,bool 类型, 支持 task 为 tensor 或 statistic 模式, level 支持 L0、 L1、 mix、 debug 模式。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式暂不支持复数类型 tensor 的统计量计算。<br>| 否 |
+| async_dump | 异步 dump 开关,bool 类型, 支持 task 为 tensor 或 statistic 模式, level 支持 L0、 L1、 mix、 debug 模式。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式下,summary_mode 不支持 md5 值,也不支持复数类型 tensor 的统计量计算。<br>| 否 |
 
 #### 1.1.1 模块级精度数据 dump 说明
 
diff --git a/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md b/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
index 3b85320fa35f7ce151889e14f9d2b934b725ffa5..7f00aab045565b9df7f3de9add60cb85fb16a4f2 100644
--- a/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
+++ b/debug/accuracy_tools/msprobe/docs/28.debugger_save_instruction.md
@@ -27,7 +27,7 @@ L0, L1, mix级别的dump能力存在盲区,网络中的非API或module的输
 | dump_path | 设置 dump 数据目录路径,str 类型。 | 是 |
 | rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型。 | 否 |
 | step | 指定采集某个 Step 的数据,list[Union[int, str]] 类型。 | 否 |
-| async_dump | 异步 dump 开关,bool 类型。 | 否 |
+| async_dump | 异步 dump 开关,bool 类型。该模式下,summary_mode 不支持 md5 值,也不支持复数类型 tensor 的统计量计算。 | 否 |
 
 "statistics" 任务子配置项:
 | 参数 | 解释 | 是否必选 |
diff --git a/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
index 90b84c90026da770798e4425f104b6d759f58943..be6a6da41cd775532e5e7610668fdf07a8d7ea3e 100644
--- a/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
+++ b/debug/accuracy_tools/msprobe/mindspore/debugger/debugger_config.py
@@ -49,7 +49,7 @@ class DebuggerConfig:
         self.summary_mode = task_config.summary_mode
         self.stat_cal_mode = task_config.stat_cal_mode if hasattr(task_config, 'stat_cal_mode') else None
         self.device_stat_precision_mode = task_config.device_stat_precision_mode \
-            if hasattr(task_config, 'device_stat_precision_mode') else None
+            if hasattr(task_config, 'device_stat_precision_mode') else None
         self.async_dump = common_config.async_dump if common_config.async_dump else False
         self.check()
         self._check_statistics_config(task_config)
@@ -115,18 +115,28 @@ class DebuggerConfig:
             self.check_mode = "all"
         if not isinstance(self.async_dump, bool):
             raise Exception("The parameters async_dump should be bool.")
-        if self.async_dump and self.task == Const.TENSOR:
-            if self.level_ori == Const.LEVEL_DEBUG:
-                self.list = []  # async_dump + debug level case ignore list
-            if not self.list and self.level_ori != Const.LEVEL_DEBUG:
-                raise Exception("The parameters async_dump is true in tensor task,"
-                                " the parameters list cannot be empty.")
         if self.task == Const.STRUCTURE and self.level_ori not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             logger.warning_on_rank_0(
                 f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
                 f"If not, the default level is {Const.LEVEL_MIX}."
             )
             self.level_ori = Const.LEVEL_MIX
+        if self.async_dump:
+            if self.task == Const.TENSOR:
+                if self.level_ori == Const.LEVEL_DEBUG:
+                    self.list = []  # async_dump + debug level case ignore list
+                if not self.list and self.level_ori != Const.LEVEL_DEBUG:
+                    raise MsprobeException(
+                        MsprobeException.INVALID_PARAM_ERROR,
+                        "The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+                    )
+            is_unsupported_mode = isinstance(self.summary_mode, str) and self.summary_mode == Const.MD5 or \
+                isinstance(self.summary_mode, list) and Const.MD5 in self.summary_mode
+            if is_unsupported_mode:
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR,
+                    f"The parameters async_dump is true, the parameters summary_mode cannot be/contain md5."
+                )
         return True
 
     def check_config_with_l2(self, is_graph_config):
diff --git a/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
index ed5204c0d484b2d5165f5ab35799959dae538683..8e3bfbcaf83e3a17a31b6a9484fbf983802b765e 100644
--- a/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
+++ b/debug/accuracy_tools/msprobe/pytorch/debugger/debugger_config.py
@@ -65,7 +65,7 @@ class DebuggerConfig:
         self.is_backward_kernel_dump = False
         self._check_and_adjust_config_with_l2()
 
-    def check_kwargs(self):
+    def check(self):
         if self.task and self.task not in Const.TASK_LIST:
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The task <{self.task}> is not in the {Const.TASK_LIST}.")
@@ -78,22 +78,26 @@ class DebuggerConfig:
         if not isinstance(self.async_dump, bool):
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
                                    f"The parameters async_dump should be bool.")
-        if self.async_dump and self.task == Const.TENSOR:
-            if self.level == Const.LEVEL_DEBUG:
-                self.list = []  # async_dump + debug level case ignore list
-            if not self.list and self.level != Const.LEVEL_DEBUG:
-                raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                                       f"The parameters async_dump is true in tensor task, the parameters list cannot be "
-                                       f"empty.")
         if self.task == Const.STRUCTURE and self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX]:
             logger.warning_on_rank_0(
                 f"When the task is set to structure, the level should be one of {[Const.LEVEL_L0, Const.LEVEL_MIX]}. "
                 f"If not, the default level is {Const.LEVEL_MIX}."
             )
             self.level = Const.LEVEL_MIX
-
-    def check(self):
-        self.check_kwargs()
+        if self.async_dump:
+            if self.task == Const.TENSOR:
+                if self.level == Const.LEVEL_DEBUG:
+                    self.list = []  # async_dump + debug level case ignore list
+                if not self.list and self.level != Const.LEVEL_DEBUG:
+                    raise MsprobeException(
+                        MsprobeException.INVALID_PARAM_ERROR,
+                        f"The parameters async_dump is true in tensor task, the parameters list cannot be empty."
+                    )
+            if self.summary_mode == Const.MD5:
+                raise MsprobeException(
+                    MsprobeException.INVALID_PARAM_ERROR,
+                    f"The parameters async_dump is true, the parameters summary_mode cannot be md5."
+                )
         return True
 
     def check_model(self, instance, start_model, token_range=None):
@@ -102,7 +106,7 @@ class DebuggerConfig:
         if token_range and not instance.model:
             error_info = "The 'model' parameter must be provided when token_range is not None"
             raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, error_info)
-
+
         if self.level not in [Const.LEVEL_L0, Const.LEVEL_MIX] and token_range is None:
             return
 
@@ -123,7 +127,7 @@ class DebuggerConfig:
                 break
         if error_model is not None:
             error_info = (f"The 'model' parameter must be a torch.nn.Module or list[torch.nn.Module] "
-                          f"type, currently there is an unsupported {type(error_model)} type.")
+                          f"type, currently there is an unsupported {type(error_model)} type.")
             raise MsprobeException(
                 MsprobeException.INVALID_PARAM_ERROR, error_info)
         else:
diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
index 1a7d04f9c6b316ef3eb9eb2336cb293191932c45..d79fe48a75c65e6156efdcdfce627d85e98c77ff 100644
--- a/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
+++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/save/test_debugger_save_mindspore.py
@@ -221,71 +221,6 @@ class TestDebuggerSave(unittest.TestCase):
         debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
         assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
 
-    @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
-    def test_async_save_tensor(self, _):
-        data = {"a": mindspore.Tensor([1., 2.])}
-        step = []
-        async_dump = True
-        mode = "tensor"
-        dump_path = os.path.join(test_dir, "debug_save")
-        config_file_path = os.path.join(test_dir, "config.json")
-        self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
-        debugger = PrecisionDebugger(config_file_path)
-        PrecisionDebugger.save(data, "data_dict", save_backward=False)
-        PrecisionDebugger.step()
-        # check npy file
-        npy_path = os.path.join(dump_path, "step0", "rank", "dump_tensor_data", "data_dict.0.debug.a.npy")
-        assert self.check_real_npy(npy_path, data["a"])
-        # check debug json
-        target_debug_info = {
-            "a": {
-                "type": "mindspore.Tensor",
-                "dtype": "Float32",
-                "shape": [
-                    2
-                ],
-                "data_name": "data_dict.0.debug.a.npy",
-                "Max": 2.0,
-                "Min": 1.0,
-                "Mean": 1.5,
-                "Norm": 2.2360680103302
-            }
-        }
-        debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
-        debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
-        assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
-    @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
-    def test_async_save_md5(self, _):
-        # async_dump case, md5 configuration not working,only save statistics
-        data = {"a": mindspore.Tensor([1., 2.])}
-        step = []
-        async_dump = True
-        mode = "md5"
-        dump_path = os.path.join(test_dir, "debug_save")
-        config_file_path = os.path.join(test_dir, "config.json")
-        self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
-        debugger = PrecisionDebugger(config_file_path)
-        PrecisionDebugger.save(data, "data_dict", save_backward=False)
-        PrecisionDebugger.step()
-        # check debug json
-        target_debug_info = {
-            "a": {
-                "type": "mindspore.Tensor",
-                "dtype": "Float32",
-                "shape": [
-                    2
-                ],
-                "Max": 2.0,
-                "Min": 1.0,
-                "Mean": 1.5,
-                "Norm": 2.2360680103302
-            }
-        }
-        debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
-        debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
-        assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
     @patch("msprobe.mindspore.debugger.precision_debugger.set_register_backward_hook_functions")
     def test_save_multiple_times(self, _):
         data = {"a": mindspore.Tensor([1., 2.])}
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
index 1a0a97b95ec53ceb062987aa66ac7d78a53c8861..89dc690e27cf06cf976626c12965abc998879b6d 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger/test_pt_debugger_config.py
@@ -84,6 +84,7 @@ class TestDebuggerConfig(unittest.TestCase):
         self.common_config.task = Const.TENSOR
         self.common_config.level = Const.LEVEL_MIX
         self.task_config.list = []
+        self.task_config.summary_mode = Const.SUMMARY_MODE
         with self.assertRaises(MsprobeException) as context:
             DebuggerConfig(self.common_config, self.task_config, None, None, None)
         self.assertIn(f"the parameters list cannot be empty.", str(context.exception))
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
index 8d2f4f3fbcf571e7a61fcef1f6a01118cb2db6df..ea2350295206fb475e106f03e13afdeeba25289c 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/debugger_save/test_debugger_save_pytorch.py
@@ -244,37 +244,6 @@ class TestDebuggerSave(unittest.TestCase):
         debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
         assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
 
-    def test_async_save_md5(self):
-        # async_dump case, md5 configuration not working,only save statistics
-        data = {"a": torch.Tensor([1., 2.])}
-        step = []
-        async_dump = True
-        mode = "md5"
-        dump_path = os.path.join(test_dir, "debug_save")
-        config_file_path = os.path.join(test_dir, "config.json")
-        self.write_config_json(step, async_dump, mode, dump_path, config_file_path)
-        debugger = PrecisionDebugger(config_file_path)
-        PrecisionDebugger.save(data, "data_dict", save_backward=False)
-        PrecisionDebugger.step()
-        # check debug json
-        target_debug_info = {
-            "a": {
-                "type": "torch.Tensor",
-                "dtype": "torch.float32",
-                "shape": [
-                    2
-                ],
-                "Max": 2.0,
-                "Min": 1.0,
-                "Mean": 1.5,
-                "Norm": 2.2360680103302,
-                "requires_grad": False,
-            }
-        }
-        debug_json_path = os.path.join(dump_path, "step0", "rank", "debug.json")
-        debug_json_dict = self.read_debug_json_into_dict(debug_json_path)
-        assert deep_compare(debug_json_dict["data"]["data_dict.0.debug"], target_debug_info)
-
     def test_save_multiple_times(self):
         data = {"a": torch.Tensor([1., 2.])}
         step = []