From 7f57ddb49e29c9228b33f60e2061fb66b5e4ee46 Mon Sep 17 00:00:00 2001
From: l30036321
Date: Tue, 7 Nov 2023 20:34:06 +0800
Subject: [PATCH 1/3] dump distributed in-place api input

---
 .../src/python/ptdbg_ascend/common/utils.py      |  9 +++++++++
 .../src/python/ptdbg_ascend/dump/dump.py         | 13 +++++++++++--
 .../ptdbg_ascend/hook_module/hook_module.py      | 18 +++++++++---------
 .../hook_module/wrap_distributed.py              |  4 +++-
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
index f2361a90b7..7c1dc362bf 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
@@ -72,6 +72,7 @@ class Const:
     OFF = 'OFF'
     BACKWARD = 'backward'
     FORWARD = 'forward'
+    PRE_FORWARD = "pre_forward"

     # dump mode
     ALL = "all"
@@ -103,6 +104,8 @@ class Const:

     MAX_SEED_VALUE = 2**32 - 1

+    INPLACE_LIST = ["broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter"]
+

 class CompareConst:
     """
@@ -669,3 +672,9 @@ def check_path_before_create(path):
     if not re.match(Const.FILE_PATTERN, os.path.realpath(path)):
         print_error_log('The file path {} contains special characters.'.format(path))
         raise CompareException(CompareException.INVALID_PATH_ERROR)
+
+
+def check_inplace_op(prefix):
+    match_op = re.findall(r"Distributed_(.+?)_\d", prefix)
+    op_name = match_op[0] if match_op else None
+    return op_name in Const.INPLACE_LIST
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 5238754cce..68c29ef0a4 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -32,7 +32,7 @@ else:
     is_gpu = False

 from .utils import DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist
-from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path
+from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path, check_inplace_op
 from ..dump.utils import check_writable
 from ..common.file_check_util import FileOpen, change_mode, FileCheckConst, check_path_pattern_vaild, check_path_length

@@ -131,6 +131,12 @@ def dump_tensor(x, prefix, dump_step, dump_file_name):


 def dump_data(dump_file_name, dump_step, prefix, data_info):
+    if check_inplace_op(prefix):
+        if Const.PRE_FORWARD in prefix:
+            prefix = prefix.replace(Const.PRE_FORWARD, Const.FORWARD)
+        else:
+            prefix = prefix.replace("input", "output")
+
     global api_list
     thread_lock.acquire()
     try:
@@ -150,6 +156,9 @@ def dump_data(dump_file_name, dump_step, prefix, data_info):


 def dump_stack_info(name_template, dump_file):
+    if check_inplace_op(name_template) and Const.PRE_FORWARD in name_template:
+        return
+
     stack_str = []
     try:
         for (_, path, line, func, code, _) in inspect.stack()[4:]:
@@ -336,7 +345,7 @@ def acc_cmp_dump(name, **kwargs):
     if not pid:
         return RuntimeError("Not get the specified process pid.")

-    def acc_cmp_hook(module, in_feat, out_feat):
+    def acc_cmp_hook(module, in_feat, out_feat=None):
         if pid == os.getpid():
             dump_acc_cmp(name, in_feat, out_feat, dump_step, module)
         if hasattr(module, "input_args"):
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py
index 83f7dcacef..55b97208a6 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py
@@ -33,21 +33,21 @@ class HOOKModule(nn.Module):
         self.has_overflow = False
         self.input_args = tuple()
         self.input_kwargs = dict()
+        self.prefix = ""

         if not g_stop_hook:
-            prefix = ""
             if hasattr(self, "prefix_op_name_"):
-                prefix = self.prefix_op_name_
+                self.prefix = self.prefix_op_name_

-            if prefix not in HOOKModule.module_count:
-                HOOKModule.module_count[prefix] = 1
-                prefix += '0_'
+            if self.prefix not in HOOKModule.module_count:
+                HOOKModule.module_count[self.prefix] = 1
+                self.prefix += '0_'
             else:
-                HOOKModule.module_count[prefix] += 1
-                prefix = prefix + str(HOOKModule.module_count[prefix] - 1) + '_'
+                HOOKModule.module_count[self.prefix] += 1
+                self.prefix = self.prefix + str(HOOKModule.module_count[self.prefix] - 1) + '_'

-            self.register_forward_hook(hook(prefix + "forward"))
-            self.register_backward_hook(hook(prefix + "backward"))
+            self.register_forward_hook(hook(self.prefix + "forward"))
+            self.register_backward_hook(hook(self.prefix + "backward"))

     def __call__(self, *input, **kwargs):
         changed = False
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py
index 7aa21770ce..ed28dbe5d6 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py
@@ -21,7 +21,7 @@ import torch.distributed as dist
 import yaml

 from .hook_module import HOOKModule
-from ..common.utils import torch_device_guard
+from ..common.utils import torch_device_guard, Const
 from ..common.file_check_util import FileOpen


@@ -51,6 +51,8 @@ class DistributedOPTemplate(HOOKModule):
         self.op_name_ = op_name
         self.prefix_op_name_ = "Distributed_" + str(op_name) + "_"
         super().__init__(hook)
+        if self.op_name_ in Const.INPLACE_LIST:
+            self.register_forward_pre_hook(hook(self.prefix + Const.PRE_FORWARD))

     @torch_device_guard
     def forward(self, *args, **kwargs):
--
Gitee
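
Note: patch 1 tags the in-place distributed collectives by their dump prefix and registers an
extra forward pre-hook for them, so their inputs can be captured before the call overwrites them.
For reference only, a minimal standalone sketch (not part of the patch; the sample prefixes are
illustrative) of how the new check_inplace_op() helper classifies prefixes:

    import re

    # Same list and regex the patch adds to Const and common/utils.py.
    INPLACE_LIST = ["broadcast", "all_reduce", "reduce", "all_gather", "gather",
                    "scatter", "reduce_scatter"]

    def check_inplace_op(prefix):
        # HOOKModule builds prefixes like "Distributed_<api>_<call index>_<phase>".
        match_op = re.findall(r"Distributed_(.+?)_\d", prefix)
        op_name = match_op[0] if match_op else None
        return op_name in INPLACE_LIST

    print(check_inplace_op("Distributed_all_reduce_0_forward"))  # True  -> handled as in-place
    print(check_inplace_op("Distributed_isend_2_forward"))       # False -> not in INPLACE_LIST
    print(check_inplace_op("Tensor_add__1_forward"))             # False -> not a Distributed op

The pre-hook name carries the new Const.PRE_FORWARD marker ("pre_forward"), which dump_data()
later rewrites to "forward" so the captured input is filed under the usual forward entry.
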
From 0bd1e7eed47f006e5a1dcc76ad08e342f8ab4aba Mon Sep 17 00:00:00 2001
From: l30036321
Date: Wed, 8 Nov 2023 10:11:58 +0800
Subject: [PATCH 2/3] dump distributed in-place api input

---
 .../src/python/ptdbg_ascend/dump/dump.py         | 34 +++++++++----------
 .../ptdbg_ascend/overflow_check/utils.py         | 20 +++++------
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 68c29ef0a4..0c603782a3 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -104,11 +104,11 @@ def json_dump_condition(prefix):
     return (Const.BACKWARD in prefix and backward_threading_id == cur_threading_id) or 'forward' in prefix


-def dump_tensor(x, prefix, dump_step, dump_file_name):
+def dump_tensor(x, prefix, dump_step):
     global data_info
     if isinstance(x, (tuple, list)) and x:
         for i, item in enumerate(x):
-            dump_tensor(item, "{}.{}".format(prefix, i), dump_step, dump_file_name)
+            dump_tensor(item, "{}.{}".format(prefix, i), dump_step)
         return
     elif isinstance(x, torch.Tensor):
         if x.is_meta:
@@ -117,20 +117,20 @@ def dump_tensor(x, prefix, dump_step, dump_file_name):
         if x.numel() == 0 or len(x.shape) == 0 or not x.is_floating_point():
             if DumpUtil.dump_filter_switch == Const.OFF:
                 data_info = get_not_float_tensor_info(x)
-                dump_data(dump_file_name, dump_step, prefix, data_info)
+                dump_data(dump_step, prefix, data_info)
             else:
                 return
         else:
             data_info = get_float_tensor_info(x)
-            dump_data(dump_file_name, dump_step, prefix, data_info)
+            dump_data(dump_step, prefix, data_info)
     elif DumpUtil.dump_filter_switch == Const.OFF:
         if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float):
             data_info = get_scalar_data_info(x)
-            dump_data(dump_file_name, dump_step, prefix, data_info)
+            dump_data(dump_step, prefix, data_info)



-def dump_data(dump_file_name, dump_step, prefix, data_info):
+def dump_data(dump_step, prefix, data_info):
     if check_inplace_op(prefix):
         if Const.PRE_FORWARD in prefix:
             prefix = prefix.replace(Const.PRE_FORWARD, Const.FORWARD)
@@ -155,7 +155,7 @@ def dump_data(dump_step, prefix, data_info):
         thread_lock.release()


-def dump_stack_info(name_template, dump_file):
+def dump_stack_info(name_template):
     if check_inplace_op(name_template) and Const.PRE_FORWARD in name_template:
         return

@@ -181,17 +181,17 @@ def dump_stack_info(name_template):
         api_list.append([prefix, stack_str])


-def dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file):
+def dump_api_tensor(dump_step, in_feat, name_template, out_feat):
     if Const.BACKWARD in name_template and Const.BACKWARD in DumpUtil.dump_mode:
         if 'input' in DumpUtil.dump_mode:
-            dump_tensor(out_feat, name_template.format("input"), dump_step, dump_file)
+            dump_tensor(out_feat, name_template.format("input"), dump_step)
         if 'output' in DumpUtil.dump_mode:
-            dump_tensor(in_feat, name_template.format("output"), dump_step, dump_file)
+            dump_tensor(in_feat, name_template.format("output"), dump_step)
     elif Const.BACKWARD not in name_template and Const.FORWARD in DumpUtil.dump_mode:
         if 'input' in DumpUtil.dump_mode:
-            dump_tensor(in_feat, name_template.format("input"), dump_step, dump_file)
+            dump_tensor(in_feat, name_template.format("input"), dump_step)
         if 'output' in DumpUtil.dump_mode:
-            dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file)
+            dump_tensor(out_feat, name_template.format("output"), dump_step)


 def rename_():
@@ -256,16 +256,16 @@ def dump_acc_cmp(name, in_feat, out_feat, dump_step, module):
         name_prefix = name
         name_template = f"{name_prefix}" + "_{}"
         if DumpUtil.dump_switch_mode in [Const.ALL, Const.API_LIST]:
-            dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file)
+            dump_api_tensor(dump_step, in_feat, name_template, out_feat)
         elif DumpUtil.dump_switch_mode == Const.API_STACK:
-            dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file)
-            dump_stack_info(name_template, dump_file)
+            dump_api_tensor(dump_step, in_feat, name_template, out_feat)
+            dump_stack_info(name_template)
         elif DumpUtil.check_switch_scope(name_prefix):
             if DumpUtil.dump_switch_mode == Const.ACL:
                 acl_dump(module, name, name_prefix)
             elif DumpUtil.dump_switch_mode != Const.STACK:
-                dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file)
-                dump_stack_info(name_template, dump_file)
+                dump_api_tensor(dump_step, in_feat, name_template, out_feat)
+                dump_stack_info(name_template)


 def acl_dump(module, module_name, name_prefix):
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py
index 1a26c5ac04..9eef5d6432 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py
@@ -44,30 +44,30 @@ def set_overflow_check_switch(switch, filter_switch=Const.OFF):
 def dump_overflow(module_name, in_feat, out_feat, dump_file):
     name_template = f"{module_name}" + "_{}"
     DumpUtil.dump_data_dir = make_dump_data_dir(dump_file)
-    dump_stack_info(name_template, dump_file)
+    dump_stack_info(name_template)
     if "forward" in name_template:
-        _dump_tensor_completely(in_feat, name_template.format("input"), dump_file)
-        _dump_tensor_completely(out_feat, name_template.format("output"), dump_file)
+        _dump_tensor_completely(in_feat, name_template.format("input"))
+        _dump_tensor_completely(out_feat, name_template.format("output"))
     else:
-        _dump_tensor_completely(in_feat, name_template.format("output"), dump_file)
-        _dump_tensor_completely(out_feat, name_template.format("input"), dump_file)
+        _dump_tensor_completely(in_feat, name_template.format("output"))
+        _dump_tensor_completely(out_feat, name_template.format("input"))



-def _dump_tensor_completely(x, prefix, dump_file_name):
+def _dump_tensor_completely(x, prefix):
     dump_flag = Const.DUMP_RATIO_MAX + 1
     if isinstance(x, (tuple, list)) and x:
         for i, item in enumerate(x):
-            _dump_tensor_completely(item, "{}.{}".format(prefix, i), dump_file_name)
+            _dump_tensor_completely(item, "{}.{}".format(prefix, i))
     elif isinstance(x, torch.Tensor):
         if x.numel() == 0 or len(x.shape) == 0 or not x.is_floating_point():
             if OverFlowUtil.overflow_filter_switch == Const.OFF:
                 data_info = get_not_float_tensor_info(x)
-                dump_data(dump_file_name, dump_flag, prefix, data_info)
+                dump_data(dump_flag, prefix, data_info)
         else:
             data_info = get_float_tensor_info(x)
-            dump_data(dump_file_name, dump_flag, prefix, data_info)
+            dump_data(dump_flag, prefix, data_info)
     elif OverFlowUtil.overflow_filter_switch == Const.OFF:
         if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float):
             data_info = get_scalar_data_info(x)
-            dump_data(dump_file_name, dump_flag, prefix, data_info)
+            dump_data(dump_flag, prefix, data_info)
--
Gitee
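
Why the pre-hook matters: collectives such as dist.all_reduce and dist.broadcast write their
result into the tensor they are given, so a normal forward hook only ever sees the
post-communication value. The toy below is illustrative only and needs no process group;
InplaceOp is a made-up stand-in for an in-place collective. It shows that only a forward
pre-hook still sees the original input:

    import torch
    import torch.nn as nn

    class InplaceOp(nn.Module):
        """Hypothetical stand-in: forward() mutates its argument, like an in-place collective."""
        def forward(self, x):
            x.add_(1.0)      # overwrite the input buffer in place
            return None      # no fresh output tensor, as with dist.all_reduce(async_op=False)

    op = InplaceOp()
    op.register_forward_pre_hook(lambda mod, inp: print("pre_forward sees:", inp[0].tolist()))
    op.register_forward_hook(lambda mod, inp, out: print("forward sees    :", inp[0].tolist()))

    op(torch.zeros(3))
    # pre_forward sees: [0.0, 0.0, 0.0]   <- original input, only available before the call
    # forward sees    : [1.0, 1.0, 1.0]   <- the post-forward hook already sees the overwritten buffer

Patch 2 also removes the dump_file/dump_file_name parameters from the dump helpers, which is why
every call site in dump.py and overflow_check/utils.py changes in lockstep.
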
From 96c7ce4d5401d6790ce27385844dfe3547206ad0 Mon Sep 17 00:00:00 2001
From: l30036321
Date: Thu, 9 Nov 2023 10:55:45 +0800
Subject: [PATCH 3/3] dump distributed in-place api input

---
 .../src/python/ptdbg_ascend/common/utils.py         |  4 +++-
 .../src/python/ptdbg_ascend/dump/dump.py            | 13 +++++++------
 .../src/python/ptdbg_ascend/overflow_check/utils.py |  9 ++++++++-
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
index 7c1dc362bf..6e63b7cf88 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
@@ -97,7 +97,7 @@ class Const:
     FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$'
     FILE_NAME_LENGTH = 255
     DIRECTORY_LENGTH = 4096
-
+    DISTRIBUTED_PREFIX_LENGTH = 60
     # env dump path
     ASCEND_WORK_PATH = "ASCEND_WORK_PATH"
     DUMP_DIR = "dump_data"
@@ -675,6 +675,8 @@ def check_path_before_create(path):


 def check_inplace_op(prefix):
+    if len(prefix) > Const.DISTRIBUTED_PREFIX_LENGTH:
+        return False
     match_op = re.findall(r"Distributed_(.+?)_\d", prefix)
     op_name = match_op[0] if match_op else None
     return op_name in Const.INPLACE_LIST
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 0c603782a3..2a5ad3658c 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -131,12 +131,6 @@ def dump_tensor(x, prefix, dump_step):


 def dump_data(dump_step, prefix, data_info):
-    if check_inplace_op(prefix):
-        if Const.PRE_FORWARD in prefix:
-            prefix = prefix.replace(Const.PRE_FORWARD, Const.FORWARD)
-        else:
-            prefix = prefix.replace("input", "output")
-
     global api_list
     thread_lock.acquire()
     try:
@@ -182,6 +176,13 @@ def dump_stack_info(name_template):


 def dump_api_tensor(dump_step, in_feat, name_template, out_feat):
+    if check_inplace_op(name_template):
+        if Const.PRE_FORWARD in name_template:
+            name_template = name_template.replace(Const.PRE_FORWARD, Const.FORWARD)
+        else:
+            dump_tensor(in_feat, name_template.format("output"), dump_step)
+            return
+
     if Const.BACKWARD in name_template and Const.BACKWARD in DumpUtil.dump_mode:
         if 'input' in DumpUtil.dump_mode:
             dump_tensor(out_feat, name_template.format("input"), dump_step)
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py
index 9eef5d6432..6af477753d 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py
@@ -1,6 +1,6 @@
 import torch

-from ..common.utils import Const, check_switch_valid
+from ..common.utils import Const, check_switch_valid, check_inplace_op
 from ..dump.dump import dump_stack_info, get_scalar_data_info, dump_data, \
     get_not_float_tensor_info, get_float_tensor_info
 from ..dump.utils import DumpUtil, make_dump_data_dir
@@ -45,6 +45,13 @@ def dump_overflow(module_name, in_feat, out_feat, dump_file):
     name_template = f"{module_name}" + "_{}"
     DumpUtil.dump_data_dir = make_dump_data_dir(dump_file)
     dump_stack_info(name_template)
+    if check_inplace_op(name_template):
+        if Const.PRE_FORWARD in name_template:
+            name_template = name_template.replace(Const.PRE_FORWARD, Const.FORWARD)
+        else:
+            _dump_tensor_completely(in_feat, name_template.format("output"))
+            return
+
     if "forward" in name_template:
         _dump_tensor_completely(in_feat, name_template.format("input"))
         _dump_tensor_completely(out_feat, name_template.format("output"))
--
Gitee
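
Patch 3 makes two refinements: check_inplace_op() now rejects any prefix longer than the new
Const.DISTRIBUTED_PREFIX_LENGTH (60) before running the regex, and the in-place handling moves
from dump_data() up into dump_api_tensor() (and dump_overflow()), so the decision is made once
per API call instead of once per dumped item. The net effect on the dump entries is emulated
below as an illustrative toy only, not the patch code; the prefix strings and the dict-based
dump callback are made up for the example:

    PRE_FORWARD = "pre_forward"
    FORWARD = "forward"

    def record_inplace_collective(name_template, in_feat, dump):
        # Toy version of the branch patch 3 adds to dump_api_tensor(): the pre-forward
        # call stores the argument as the op's input, the later forward call stores the
        # (now overwritten) argument as its output.
        if PRE_FORWARD in name_template:
            dump(name_template.replace(PRE_FORWARD, FORWARD).format("input"), in_feat)
        else:
            dump(name_template.format("output"), in_feat)

    entries = {}
    buf = [0.0, 0.0]                    # stand-in for the tensor handed to all_reduce
    record_inplace_collective("Distributed_all_reduce_0_pre_forward_{}", list(buf), entries.__setitem__)
    buf[:] = [2.0, 2.0]                 # the collective overwrites it in place
    record_inplace_collective("Distributed_all_reduce_0_forward_{}", list(buf), entries.__setitem__)
    print(entries)
    # {'Distributed_all_reduce_0_forward_input': [0.0, 0.0],
    #  'Distributed_all_reduce_0_forward_output': [2.0, 2.0]}

So for an in-place collective the dump ends up with a matching forward input/output pair even
though the underlying API returns nothing and reuses the caller's buffer.
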