def get_md5_for_tensor(x):
    """Return the hexadecimal MD5 digest of a tensor's values.

    The tensor is detached, copied to the CPU and cast to float32 before its
    raw bytes are hashed, so the digest describes the float32 representation
    of the data rather than the bytes of the original dtype.

    Args:
        x: Tensor-like object supporting ``detach()``, ``cpu()``, ``float()``
            and ``numpy()``.

    Returns:
        str: The 32-character hexadecimal MD5 digest of the float32 bytes.
    """
    # Imported locally so this helper does not depend on a file-level import.
    import hashlib

    # Detach first so the device-to-host copy is never tracked by autograd.
    tensor_bytes = x.detach().cpu().float().numpy().tobytes()
    try:
        # MD5 is used here only as a content checksum, not as a security
        # primitive; declaring that keeps it usable on FIPS-restricted builds.
        md5_hash = hashlib.md5(tensor_bytes, usedforsecurity=False)
    except TypeError:
        # Python < 3.9 does not accept the ``usedforsecurity`` keyword.
        md5_hash = hashlib.md5(tensor_bytes)
    return md5_hash.hexdigest()
hook_dict.get(hook_name, lambda: ValueError("hook name {} is not in ['dump', 'overflow_check']".format(hook_name))) def configure_full_dump(self, mode='api_stack', scope=None, api_list=None, filter_switch=Const.OFF, - input_output_mode=[Const.ALL], acl_config=None, backward_input=None, summary_only=False): + input_output_mode=[Const.ALL], acl_config=None, backward_input=None, summary_only=False, summary_mode="all"): scope = scope or [] api_list = api_list or [] backward_input = backward_input or [] + + if summary_only: + print_warn_log("The argument 'summary_only' will be deprecated, it would be better to use 'summary_mode'") + summary_mode = "summary" + set_dump_switch_config(mode=mode, scope=scope, api_list=api_list, - filter_switch=filter_switch, dump_mode=input_output_mode, summary_only=summary_only) + filter_switch=filter_switch, dump_mode=input_output_mode, summary_only=summary_only, + summary_mode=summary_mode) if mode == 'acl': DumpUtil.set_acl_config(acl_config) if not scope or not isinstance(scope, list) or len(scope) != 1: diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py index f0e041a4a4515eef021afcd22f496ab07be40dbb..b7f14304a48085484adc986480f792548679d85d 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py @@ -33,7 +33,8 @@ else: is_gpu = False from .utils import DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist -from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path, check_inplace_op, CompareConst +from ..common.utils import (print_warn_log, Const, print_info_log, modify_dump_path, check_inplace_op, CompareConst, + get_md5_for_tensor) from ..dump.utils import check_writable from ..common.file_check_util import FileOpen, change_mode, FileCheckConst, 
class DataInfo(object):
    """Plain record bundling the pieces of information kept for one dumped value.

    Every constructor argument is stored unchanged on the instance under the
    same name; no validation or conversion is performed.
    """

    def __init__(self, save_data, summary_data, dtype, shape, md5=""):
        # Payload and its summary values, kept exactly as supplied.
        self.save_data, self.summary_data = save_data, summary_data
        # Descriptive metadata about the value.
        self.dtype, self.shape = dtype, shape
        # Optional digest; stays an empty string when the caller supplies none.
        self.md5 = md5
{}".format(e)) @@ -240,7 +244,7 @@ def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): if DumpUtil.dump_init_enable: DumpUtil.dump_init_enable = False DumpUtil.dump_data_dir = make_dump_data_dir(dump_file) \ - if DumpUtil.dump_switch_mode not in [Const.STACK, Const.ACL] and not DumpUtil.summary_only else "" + if DumpUtil.dump_switch_mode not in [Const.STACK, Const.ACL] and DumpUtil.summary_mode == "all" else "" if os.path.exists(dump_file) and not os.path.isdir(dump_file): check_writable(dump_file) try: diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index 5ab2152479801e99268f67027db375e266178e4c..c81055504a946775ecd4f33dc9de56f01a26ce1c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -9,7 +9,7 @@ import torch.distributed as dist from ..dump import dump from ..common.utils import print_error_log, CompareException, DumpException, Const, get_time, print_info_log, \ check_mode_valid, get_api_name_from_matcher, check_switch_valid, check_dump_mode_valid, check_summary_only_valid, generate_compare_script, \ - check_is_npu, check_file_valid, make_dump_path_if_not_exists, check_path_before_create + check_is_npu, check_file_valid, make_dump_path_if_not_exists, check_path_before_create, print_warn_log from ..common.file_check_util import FileChecker, FileCheckConst, check_path_length, check_path_pattern_vaild from ..common.version import __version__ @@ -74,6 +74,7 @@ class DumpUtil(object): target_rank = None summary_only = False need_replicate = False + summary_mode = "all" @staticmethod def set_dump_path(save_path): @@ -90,7 +91,8 @@ class DumpUtil(object): DumpUtil.dump_config = acl_config @staticmethod - def set_dump_switch(switch, mode=None, scope=None, api_list=None, filter_switch=None, dump_mode=None, summary_only=False): + def 
set_dump_switch(switch, mode=None, scope=None, api_list=None, filter_switch=None, dump_mode=None, + summary_only=False, summary_mode="all"): DumpUtil.dump_switch = switch if mode is not None: DumpUtil.dump_switch_mode = mode @@ -106,7 +108,9 @@ class DumpUtil(object): if mode == Const.ACL: DumpUtil.dump_switch_scope = [api_name.replace("backward", "forward") for api_name in scope] + DumpUtil.summary_only = summary_only + DumpUtil.summary_mode = summary_mode check_mapper = { Const.LIST: check_list_or_acl_mode, @@ -227,7 +231,7 @@ def set_dump_switch(switch, mode=Const.ALL, scope=None, api_list=None, filter_sw def set_dump_switch_config(mode=Const.ALL, scope=None, api_list=None, filter_switch=Const.OFF, dump_mode=None, - summary_only=False): + summary_only=False, summary_mode="all"): if scope is None: scope = [] if api_list is None: @@ -244,7 +248,7 @@ def set_dump_switch_config(mode=Const.ALL, scope=None, api_list=None, filter_swi raise CompareException(CompareException.INVALID_PARAM_ERROR) from err switch = DumpUtil.dump_switch DumpUtil.set_dump_switch("OFF", mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, - dump_mode=dump_mode, summary_only=summary_only) + dump_mode=dump_mode, summary_only=summary_only, summary_mode=summary_mode) DumpUtil.dump_switch = switch