diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
index f2361a90b76acead7c432271dbbcc22cd073c812..72ad3432f2366e839b00653d7678212e21f2879e 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
@@ -23,6 +23,7 @@ import stat
 import subprocess
 import sys
 import time
+import hashlib
 from datetime import datetime, timezone
 from functools import wraps
 from pathlib import Path
@@ -101,7 +102,7 @@ class Const:
     ASCEND_WORK_PATH = "ASCEND_WORK_PATH"
     DUMP_DIR = "dump_data"
 
-    MAX_SEED_VALUE = 2**32 - 1
+    MAX_SEED_VALUE = 2 ** 32 - 1
 
 
 class CompareConst:
@@ -199,6 +200,7 @@ class CompareException(Exception):
     def __str__(self):
         return self.error_info
 
+
 class DumpException(CompareException):
     pass
 
@@ -264,11 +266,18 @@ def check_mode_valid(mode, scope=None, api_list=None):
         raise ValueError("api_list param set invalid, it's must be a list.")
     mode_check = {
         Const.ALL: lambda: None,
-        Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end].") if len(scope) != 2 else None,
-        Const.LIST: lambda: ValueError("set_dump_switch, scope param set invalid, it's should not be an empty list.") if len(scope) == 0 else None,
-        Const.STACK: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end] or [].") if len(scope) > 2 else None,
-        Const.ACL: lambda: ValueError("set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len(scope) != 1 else None,
-        Const.API_LIST: lambda: ValueError("Current dump mode is 'api_list', but the content of api_list parameter is empty or valid.") if len(api_list) < 1 else None,
+        Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end].") if len(
+            scope) != 2 else None,
+        Const.LIST: lambda: ValueError(
+            "set_dump_switch, scope param set invalid, it's should not be an empty list.") if len(scope) == 0 else None,
+        Const.STACK: lambda: ValueError(
+            "set_dump_switch, scope param set invalid, it's must be [start, end] or [].") if len(scope) > 2 else None,
+        Const.ACL: lambda: ValueError(
+            "set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len(
+            scope) != 1 else None,
+        Const.API_LIST: lambda: ValueError(
+            "Current dump mode is 'api_list', but the content of api_list parameter is empty or valid.") if len(
+            api_list) < 1 else None,
         Const.API_STACK: lambda: None,
     }
     if mode not in Const.DUMP_MODE:
@@ -291,7 +300,8 @@ def check_dump_mode_valid(dump_mode):
         print_warn_log("Please set dump_mode as a list.")
         dump_mode = [dump_mode]
     if not all(mode in ["all", "forward", "backward", "input", "output"] for mode in dump_mode):
-        raise ValueError("Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.")
+        raise ValueError(
+            "Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.")
     if 'input' not in dump_mode and 'output' not in dump_mode:
         dump_mode.extend(['input', 'output'])
     if 'forward' not in dump_mode and 'backward' not in dump_mode:
@@ -531,11 +541,13 @@ def format_value(value):
 def torch_device_guard(func):
     if is_gpu or torch_without_guard_version:
         return func
+    # Parse args/kwargs matched torch.device objects
     @torch_npu_device_guard
     def wrapper(*args, **kwargs):
         return func(*args, **kwargs)
+
     return wrapper
@@ -579,16 +591,15 @@ def get_process_rank(model):
         return 0, False
     if device.type == 'cpu':
         print_warn_log("Warning: the debugger is unable to get the rank id. "
-            "This may cause the dumpped data to be corrupted in the "
-            "case of distributed training. (You may ignore this if you are using only one card.) "
-            "Transfer the model to npu or gpu before register_hook() to avoid this warning.")
+                       "This may cause the dumpped data to be corrupted in the "
+                       "case of distributed training. (You may ignore this if you are using only one card.) "
+                       "Transfer the model to npu or gpu before register_hook() to avoid this warning.")
         return 0, False
     else:
         return device.index, True
 
 
 def parameter_adapter(func):
-
     @wraps(func)
     def inner(self, *args, **kwargs):
         if self.op_name_ == "__getitem__" and len(args) > 1 and isinstance(args[1], torch.Tensor):
@@ -612,6 +623,7 @@ def parameter_adapter(func):
                     res = [input[tensor_index] for tensor_index in indices]
                     return getattr(torch._C._VariableFunctionsClass, "stack")(res, 0)
         return func(self, *args, **kwargs)
+
     return inner
 
 
@@ -623,7 +635,7 @@ def generate_compare_script(dump_path, pkl_file_path, dump_switch_mode):
 
     try:
         with FileOpen(template_path, 'r') as ftemp, \
-             os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout:
+                os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout:
             code_temp = ftemp.read()
             fout.write(code_temp % (pkl_file_path, dump_path, is_api_stack))
     except OSError:
@@ -669,3 +681,9 @@ def check_path_before_create(path):
     if not re.match(Const.FILE_PATTERN, os.path.realpath(path)):
         print_error_log('The file path {} contains special characters.'.format(path))
         raise CompareException(CompareException.INVALID_PATH_ERROR)
+
+
+def get_md5_for_tensor(x):
+    tensor_bytes = x.cpu().detach().float().numpy().tobytes()
+    md5_hash = hashlib.md5(tensor_bytes)
+    return md5_hash.hexdigest()
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index dcf2a8b2da6e3dbe6e3a5ca3d917f498d56cc27b..1ef224e0f3c9e914d05330770cd90927d9aa757a 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -32,7 +32,7 @@ else:
     is_gpu = False
 
 from .utils import DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist
-from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path
+from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path, get_md5_for_tensor
 from ..dump.utils import check_writable
 from ..common.file_check_util import FileOpen, change_mode, FileCheckConst, check_path_pattern_vaild, check_path_length
 
@@ -69,7 +69,8 @@ def get_not_float_tensor_info(data):
     tensor_max = torch._C._VariableFunctionsClass.max(data).cpu().detach().float().numpy().tolist()
     tensor_min = torch._C._VariableFunctionsClass.min(data).cpu().detach().float().numpy().tolist()
     tensor_mean = torch._C._VariableFunctionsClass.mean(data.float()).cpu().detach().float().numpy().tolist()
-    return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean)
+    data_md5 = get_md5_for_tensor(data)
+    return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean, data_md5)
 
 
 def get_scalar_data_info(data):
@@ -81,17 +82,18 @@ def get_float_tensor_info(data):
     tensor_max = torch._C._VariableFunctionsClass.max(data).cpu().detach().float().numpy().tolist()
     tensor_min = torch._C._VariableFunctionsClass.min(data).cpu().detach().float().numpy().tolist()
     tensor_mean = torch._C._VariableFunctionsClass.mean(data).cpu().detach().float().numpy().tolist()
-    return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean)
+    data_md5 = get_md5_for_tensor(data)
+    return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean, data_md5)
 
 
-def get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean):
+def get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean, data_md5):
     summary_data = []
     saved_tensor = data.contiguous().cpu().detach()
     if data.dtype == torch.bfloat16:
         saved_numpy = saved_tensor.to(torch.float32).numpy()
     else:
         saved_numpy = saved_tensor.numpy()
-    summary_data.extend([tensor_max, tensor_min, tensor_mean])
+    summary_data.extend([tensor_max, tensor_min, tensor_mean, data_md5])
     return DataInfo(data, saved_numpy, summary_data, str(data.dtype), tuple(data.shape))
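
Note (not part of the patch): the new `get_md5_for_tensor` helper fingerprints a tensor by hashing the raw bytes of its float32 CPU copy, and the digest is appended to `summary_data` alongside max/min/mean, so a later comparison can detect any change in the saved values rather than only shifts in the summary statistics. Because the tensor is cast with `.float()` before hashing, tensors with equal values but different integer dtypes yield the same digest, and MD5 here serves purely as a change detector, not as a cryptographic guarantee. The sketch below is a minimal standalone illustration of that behaviour; the `__main__` driver and the sample tensors `a`, `b`, `c` are illustrative only and do not appear in the repository.

```python
import hashlib

import torch


def get_md5_for_tensor(x):
    # Mirrors the helper added in the patch: hash the raw bytes of the
    # tensor's float32 CPU copy and return the hex digest.
    tensor_bytes = x.cpu().detach().float().numpy().tobytes()
    return hashlib.md5(tensor_bytes).hexdigest()


if __name__ == "__main__":
    a = torch.arange(8, dtype=torch.float32).reshape(2, 4)
    b = a.clone()      # identical values -> identical digest
    c = a + 1e-3       # any perturbation changes the digest

    assert get_md5_for_tensor(a) == get_md5_for_tensor(b)
    assert get_md5_for_tensor(a) != get_md5_for_tensor(c)
    print(get_md5_for_tensor(a))
```

In this sketch, comparing the digests of two dumps of the same API call is enough to decide whether the tensors match bit-for-bit (after the float32 cast), without loading the full saved arrays.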