From 2afcc2a85fb4a26dfb3551b00cf951db0d8aa039 Mon Sep 17 00:00:00 2001 From: s30048155 Date: Wed, 1 Nov 2023 11:22:25 +0800 Subject: [PATCH 1/4] clearcode --- .../api_accuracy_checker/common/base_api.py | 6 +++--- .../api_accuracy_checker/common/config.py | 3 ++- .../api_accuracy_checker/common/utils.py | 17 +++++++++++++---- .../api_accuracy_checker/compare/algorithm.py | 15 ++++++++++++++- .../api_accuracy_checker/compare/compare.py | 9 +++++---- .../api_accuracy_checker/dump/api_info.py | 3 ++- .../api_accuracy_checker/dump/dump.py | 2 ++ .../api_accuracy_checker/dump/dump_scope.py | 1 + .../api_accuracy_checker/dump/info_dump.py | 4 +++- .../api_accuracy_checker/run_ut/run_ut.py | 4 +++- 10 files changed, 48 insertions(+), 16 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/common/base_api.py b/debug/accuracy_tools/api_accuracy_checker/common/base_api.py index 2c3086184c..64ce9e7170 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/base_api.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/base_api.py @@ -48,8 +48,8 @@ class BaseAPIInfo: single_arg.update({'type' : 'torch.Tensor'}) single_arg.update({'dtype' : str(arg.dtype)}) single_arg.update({'shape' : arg.shape}) - single_arg.update({'Max' : self.transfer_types(self.get_tensor_extremum(arg,'max'), str(arg.dtype))}) - single_arg.update({'Min' : self.transfer_types(self.get_tensor_extremum(arg,'min'), str(arg.dtype))}) + single_arg.update({'Max' : self.transfer_types(self.get_tensor_extremum(arg, 'max'), str(arg.dtype))}) + single_arg.update({'Min' : self.transfer_types(self.get_tensor_extremum(arg, 'min'), str(arg.dtype))}) single_arg.update({'requires_grad': arg.requires_grad}) else: @@ -87,7 +87,7 @@ class BaseAPIInfo: return float(data) def is_builtin_class(self, element): - if element is None or isinstance(element, (bool,int,float,str,slice)): + if element is None or isinstance(element, (bool, int, float, str, slice)): return True return False diff --git a/debug/accuracy_tools/api_accuracy_checker/common/config.py b/debug/accuracy_tools/api_accuracy_checker/common/config.py index c47911e213..36df4bb014 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/config.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/config.py @@ -1,8 +1,9 @@ -import yaml import os +import yaml from api_accuracy_checker.common.utils import check_file_or_directory_path from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileOpen + class Config: def __init__(self, yaml_file): check_file_or_directory_path(yaml_file, False) diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index b031b92a18..8c0cceebed 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -29,9 +29,6 @@ import numpy as np import torch import csv -from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileCheckConst, FileChecker, FileOpen -from ptdbg_ascend.src.python.ptdbg_ascend.common import file_check_util - try: import torch_npu except ImportError: @@ -39,6 +36,9 @@ except ImportError: else: IS_GPU = False +from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileCheckConst, FileChecker, FileOpen +from ptdbg_ascend.src.python.ptdbg_ascend.common import file_check_util + torch_without_guard_version_list = ['2.1'] for version in torch_without_guard_version_list: if torch.__version__.startswith(version): @@ -65,7 +65,7 @@ class Const: DOT = "." DUMP_RATIO_MAX = 100 SUMMERY_DATA_NUMS = 256 - ONE_HUNDRED_MB = 100*1024*1024 + ONE_HUNDRED_MB = 100 * 1024 * 1024 FLOAT_EPSILON = np.finfo(float).eps SUPPORT_DUMP_MODE = ['api', 'acl'] ON = 'ON' @@ -103,6 +103,7 @@ class Const: "int32_to_int64": ["cross_entropy"] } + class CompareConst: """ Class for compare module const @@ -191,19 +192,23 @@ class CompareException(Exception): def __str__(self): return self.error_info + class DumpException(CompareException): pass + def read_json(file): with FileOpen(file, 'r') as f: obj = json.load(f) return obj + def write_csv(data, filepath): with FileOpen(filepath, 'a') as f: writer = csv.writer(f) writer.writerows(data) + def _print_log(level, msg): current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) pid = os.getgid() @@ -296,6 +301,7 @@ def check_file_or_directory_path(path, isdir=False): 'The path {} does not have permission to read. Please check the path permission'.format(path)) raise CompareException(CompareException.INVALID_PATH_ERROR) + def _check_pkl(pkl_file_handle, file_name): tensor_line = pkl_file_handle.readline() if len(tensor_line) == 0: @@ -573,6 +579,7 @@ def check_need_convert(api_name): convert_type = key return convert_type + def api_info_preprocess(api_name, api_info_dict): """ Function Description: @@ -589,6 +596,7 @@ def api_info_preprocess(api_name, api_info_dict): api_info_dict = cross_entropy_process(api_info_dict) return convert_type, api_info_dict + def cross_entropy_process(api_info_dict): """ Function Description: @@ -603,6 +611,7 @@ def cross_entropy_process(api_info_dict): api_info_dict['args'][1]['Min'] = 0 #The second argument in cross_entropy should be -100 or not less than 0. return api_info_dict + def initialize_save_path(save_path, dir_name): data_path = os.path.join(save_path, dir_name) if os.path.exists(data_path): diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index cca521a296..a8792225d0 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -1,5 +1,4 @@ # 定义比对算法及比对标准 - import torch import numpy as np from api_accuracy_checker.compare.compare_utils import CompareConst, check_dtype_comparable @@ -14,6 +13,7 @@ def compare_torch_tensor(cpu_output, npu_output, compare_alg): return compare_bool_tensor(cpu_output, npu_output) return compare_alg(cpu_output, npu_output) + def compare_bool_tensor(cpu_output, npu_output): cpu_shape = cpu_output.shape npu_shape = npu_output.shape @@ -23,6 +23,7 @@ def compare_bool_tensor(cpu_output, npu_output): error_rate = float(error_nums / cpu_output.size) return error_rate, error_rate == 0, "" + def get_msg_and_handle_value(b_value, n_value): msg = "" if not isinstance(b_value, np.ndarray) or not isinstance(n_value, np.ndarray): @@ -46,6 +47,7 @@ def get_msg_and_handle_value(b_value, n_value): b_value[zero_mask] += np.finfo(float).eps return b_value, n_value, msg + def get_max_rel_err(b_value, n_value): b_value, n_value, msg = get_msg_and_handle_value(b_value, n_value) rel_err = np.abs((n_value - b_value) / b_value).max() @@ -55,15 +57,18 @@ def get_max_rel_err(b_value, n_value): bool_result = rel_err < 0.001 return rel_err, bool_result, msg + def get_max_abs_err(b_value, n_value): b_value, n_value, msg = get_msg_and_handle_value(b_value, n_value) abs_err = np.abs(b_value - n_value).max() bool_result = abs_err < 0.001 return abs_err, bool_result, msg + def get_rel_err_ratio_thousandth(b_value, n_value): return get_rel_err_ratio(b_value, n_value, 0.001) + def get_rel_err_ratio_ten_thousandth(b_value, n_value): ratio, bool_result, msg = get_rel_err_ratio(b_value, n_value, 0.0001) if n_value.dtype == np.float16: @@ -71,6 +76,7 @@ def get_rel_err_ratio_ten_thousandth(b_value, n_value): return ratio, True, msg return ratio, bool_result, msg + def get_rel_err_ratio(b_value, n_value, thresholding): b_value, n_value, msg = get_msg_and_handle_value(b_value, n_value) rel_errs = np.abs((n_value - b_value) / b_value) @@ -78,14 +84,17 @@ def get_rel_err_ratio(b_value, n_value, thresholding): bool_result = ratio > (1 - thresholding) return ratio, bool_result, msg + def max_rel_err_standard(max_rel_errs): bool_result = np.array(max_rel_errs) < 0.001 return np.all(bool_result), bool_result + def cosine_standard(compare_result): bool_result = np.array(compare_result) > 0.99 return np.all(bool_result), bool_result + def cosine_sim(cpu_output, npu_output): msg = "" n_value = npu_output.reshape(-1) @@ -116,12 +125,14 @@ def cosine_sim(cpu_output, npu_output): msg = "Dump data has NaN when comparing with Cosine Similarity." return cos, cos > 0.99, msg + def compare_uint8_data(b_value, n_value): if (b_value == n_value).all(): return 1, True else: return 0, False + def compare_builtin_type(bench_out, npu_out): if not isinstance(bench_out, (bool, int, float, str)): return CompareConst.NA, True, "" @@ -129,6 +140,7 @@ def compare_builtin_type(bench_out, npu_out): return CompareConst.NAN, False, "" return True, True, "" + def flatten_compare_result(result): flatten_result = [] for result_i in result: @@ -138,6 +150,7 @@ def flatten_compare_result(result): flatten_result.append(result_i) return flatten_result + # 本函数用alg比对bench_out 和npu_out,返回详细比对结果compare_result和标志比对是否通过的布尔变量test_success def compare_core(bench_out, npu_out, alg): msg = "" diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index f3e8a4cf49..b33626321f 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -8,6 +8,7 @@ from api_accuracy_checker.common.utils import get_json_contents, print_info_log, from api_accuracy_checker.compare.compare_utils import CompareConst from api_accuracy_checker.common.config import msCheckerConfig + class Comparator: TEST_FILE_NAME = "accuracy_checking_result.csv" DETAIL_TEST_FILE_NAME = "accuracy_checking_details.csv" @@ -174,14 +175,14 @@ class Comparator: if name == "Max Absolute Error": max_abs_error_success = test_success if detailed_result_total: - for i in range(len(detailed_result_total)): - detailed_result_total[i] += detailed_result[i] + for i, detailed_result_item in enumerate(detailed_result): + detailed_result_total[i] += detailed_result_item else: detailed_result_total = detailed_result test_success_total = test_success_total or max_abs_error_success # dtype加到所有指标的前面, 是否pass放到所有指标的后面 - for i in range(len(detailed_result_total)): - detailed_result = list(detailed_result_total[i]) + for i, detailed_result in enumerate(detailed_result_total): + detailed_result = list(detailed_result) detailed_result.insert(0, bench_dtype_total[i]) detailed_result.insert(1, npu_dtype_total[i]) detailed_result.insert(2, shape_total[i]) diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/api_info.py b/debug/accuracy_tools/api_accuracy_checker/dump/api_info.py index 2a86699d83..ca0d4021a5 100644 --- a/debug/accuracy_tools/api_accuracy_checker/dump/api_info.py +++ b/debug/accuracy_tools/api_accuracy_checker/dump/api_info.py @@ -25,7 +25,8 @@ class ForwardAPIInfo(APIInfo): def analyze_api_call_stack(self): stack_str = [] for (_, path, line, func, code, _) in inspect.stack()[3:]: - if not code: continue + if not code: + continue stack_line = " ".join([ "File", ", ".join([path, " ".join(["line", str(line)]), " ".join(["in", func]), " ".join(["\n", code[0].strip()])])]) diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py index 2a69e226cd..0120c10ddc 100644 --- a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py +++ b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py @@ -31,6 +31,7 @@ def set_dump_switch(switch): initialize_output_json() DumpUtil.set_dump_switch(switch) + class DumpUtil(object): dump_switch = None call_num = 0 @@ -74,6 +75,7 @@ def pretest_info_dump(name, out_feat, module, phase): write_api_info_json(api_info) + def pretest_hook(name, phase): def pretest_info_dump_hook(module, in_feat, out_feat): pretest_info_dump(name, out_feat, module, phase) diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/dump_scope.py b/debug/accuracy_tools/api_accuracy_checker/dump/dump_scope.py index 17f94da193..85f555ed75 100644 --- a/debug/accuracy_tools/api_accuracy_checker/dump/dump_scope.py +++ b/debug/accuracy_tools/api_accuracy_checker/dump/dump_scope.py @@ -4,6 +4,7 @@ from torch.utils.data.dataloader import _BaseDataLoaderIter from api_accuracy_checker.dump.dump import DumpUtil from api_accuracy_checker.common.config import msCheckerConfig + def iter_tracer(func): def func_wrapper(*args, **kwargs): DumpUtil.dump_switch = "OFF" diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/info_dump.py b/debug/accuracy_tools/api_accuracy_checker/dump/info_dump.py index 7eeeeb590d..05354226f3 100644 --- a/debug/accuracy_tools/api_accuracy_checker/dump/info_dump.py +++ b/debug/accuracy_tools/api_accuracy_checker/dump/info_dump.py @@ -11,6 +11,7 @@ from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileOpen lock = threading.Lock() + def write_api_info_json(api_info): dump_path = msCheckerConfig.dump_path rank = api_info.rank @@ -26,8 +27,9 @@ def write_api_info_json(api_info): else: raise ValueError(f"Invalid api_info type {type(api_info)}") + def write_json(file_path, data, indent=None): - check_file_or_directory_path(os.path.dirname(file_path),True) + check_file_or_directory_path(os.path.dirname(file_path), True) if not os.path.exists(file_path): with FileOpen(file_path, 'w') as f: f.write("{\n}") diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 536d607dd6..5f9e27216c 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -67,8 +67,10 @@ def generate_npu_params(input_args, input_kwargs, need_backward): npu_kwargs = {key: recursive_arg_to_npu(value) for key, value in input_kwargs.items()} return npu_args, npu_kwargs + def generate_cpu_params(input_args, input_kwargs, need_backward): first_dtype = None + def recursive_arg_to_cpu(arg_in): nonlocal first_dtype if isinstance(arg_in, (list, tuple)): @@ -99,6 +101,7 @@ def generate_cpu_params(input_args, input_kwargs, need_backward): cpu_kwargs = {key: recursive_arg_to_cpu(value) for key, value in input_kwargs.items()} return cpu_args, cpu_kwargs + def run_ut(forward_file, backward_file, out_path, save_error_data): print_info_log("start UT test") forward_content = get_json_contents(forward_file) @@ -140,7 +143,6 @@ def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) UtAPIInfo(api_full_name + '.backward.output.npu', data_info.npu_grad_out) - def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict): in_fwd_data_list = [] [api_type, api_name, _] = api_full_name.split("*") -- Gitee From 9abe240fe4882955907f19d8052533fde8518f6b Mon Sep 17 00:00:00 2001 From: s30048155 Date: Wed, 1 Nov 2023 11:55:36 +0800 Subject: [PATCH 2/4] clearcode --- .../src/python/ptdbg_ascend/__init__.py | 1 + .../src/python/ptdbg_ascend/common/log.py | 1 + .../src/python/ptdbg_ascend/common/utils.py | 3 ++ .../compare/distributed_compare.py | 3 +- .../debugger/precision_debugger.py | 1 + .../src/python/ptdbg_ascend/dump/dump.py | 10 ++-- .../src/python/ptdbg_ascend/dump/utils.py | 4 +- .../ptdbg_ascend/hook_module/hook_module.py | 2 + .../hook_module/wrap_npu_custom.py | 1 + .../ptdbg_ascend/online_dispatch/dispatch.py | 6 ++- .../online_dispatch/dump_compare.py | 4 +- .../ptdbg_ascend/overflow_check/info_dump.py | 7 +-- .../overflow_check/overflow_check.py | 17 ++++--- .../parse_tool/lib/parse_exception.py | 4 +- .../ptdbg_ascend/src/python/setup.py | 49 ++++++++++--------- 15 files changed, 69 insertions(+), 44 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py index 920deafde9..4c3228c478 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py @@ -30,6 +30,7 @@ from .common.utils import seed_all, torch_without_guard_version, print_info_log from .debugger.precision_debugger import PrecisionDebugger seed_all() + def jit_script(obj, optimize=None, _frames_up=0, _rcb=None, example_input=None): print_info_log("The torch_npu earlier than 2.1 does not support torch.jit.script. " "Therefore, to ensure that the dump data of the GPU and NPU is consistent, " diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py index a7b419866d..32c3423551 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py @@ -2,6 +2,7 @@ import os import time import sys + def _print_log(level, msg, end='\n'): current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) pid = os.getgid() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py index 30fe40a95b..dfdaa0fc00 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -281,6 +281,7 @@ def check_switch_valid(switch): print_error_log("Please set switch with 'ON' or 'OFF'.") raise CompareException(CompareException.INVALID_PARAM_ERROR) + def check_dump_mode_valid(dump_mode): if not isinstance(dump_mode, list): print_warn_log("Please set dump_mode as a list.") @@ -295,12 +296,14 @@ def check_dump_mode_valid(dump_mode): return ["forward", "backward", "input", "output"] return dump_mode + def check_summary_only_valid(summary_only): if not isinstance(summary_only, bool): print_error_log("Params summary_only only support True or False.") raise CompareException(CompareException.INVALID_PARAM_ERROR) return summary_only + def check_compare_param(input_parma, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False): # 添加默认值来让不传参时能通过参数检查 if not (isinstance(input_parma, dict) and isinstance(output_path, str) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py index d92b0a145b..c58db903dc 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -import os, sys +import os +import sys import re from ..common.utils import print_error_log, CompareException, check_compare_param from .acc_compare import compare_core diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py index 79066d6ce3..b2cfdaba44 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py @@ -108,6 +108,7 @@ class PrecisionDebugger: PrecisionDebugger.step() PrecisionDebugger.start() + def iter_tracer(func): def func_wrapper(*args, **kwargs): PrecisionDebugger.stop() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py index dcf2a8b2da..5238754cce 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py @@ -18,12 +18,12 @@ import inspect import json import os -import numpy as np -import torch import threading - from pathlib import Path +import numpy as np +import torch + try: import torch_npu except ImportError: @@ -47,6 +47,7 @@ pkl_name = "" rank = os.getpid() multi_output_apis = ["_sort_", "npu_flash_attention"] + class DataInfo(object): def __init__(self, data, save_data, summary_data, dtype, shape): self.data = data @@ -183,6 +184,7 @@ def dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file): if 'output' in DumpUtil.dump_mode: dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file) + def rename_(): global rank global pkl_name @@ -198,6 +200,7 @@ def rename_(): os.rename(dir_name, new_name) pkl_name = os.path.join(new_name, file_name) + def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): dump_file = DumpUtil.get_dump_path() dump_file = modify_dump_path(dump_file, DumpUtil.dump_switch_mode) @@ -356,5 +359,6 @@ def write_to_disk(): change_mode(pkl_name, FileCheckConst.DATA_FILE_AUTHORITY) api_list = [] + def get_pkl_file_path(): return pkl_name diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index 35f5ad4a90..4a8a03620e 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -73,10 +73,12 @@ class DumpUtil(object): def check_list_or_acl_mode(name_prefix): global dump_count + result = False for item in DumpUtil.dump_switch_scope: if name_prefix.startswith(item): dump_count = dump_count + 1 - return True + result = True + return result def check_range_mode(name_prefix): global range_begin_flag diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py index a3cb10bf4f..83f7dcacef 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py @@ -24,8 +24,10 @@ import torch.utils.hooks as full_hooks g_stop_hook = False + class HOOKModule(nn.Module): module_count = {} + def __init__(self, hook) -> None: super(HOOKModule, self).__init__() self.has_overflow = False diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py index a507805f0b..f2e1e8f9d3 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py @@ -48,6 +48,7 @@ class NpuOPTemplate(HOOKModule): else: return getattr(torch_npu._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + def wrap_npu_op(op_name, hook): def npu_op_template(*args, **kwargs): diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py index 53f104d45e..1b16502fb7 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py @@ -1,10 +1,12 @@ import os import time -import yaml -import json from pathlib import Path from multiprocessing import Manager, Pool + +import yaml +import json import torch + from torch.utils._python_dispatch import TorchDispatchMode try: diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py index a4740fef88..2f0818ffe8 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py @@ -44,11 +44,11 @@ class TimeStatistics: def __enter__(self): if self.debug: - self.time = datetime.now() + self.time = datetime.now().astimezone() def __exit__(self, exc_type, exc_val, exc_tb): if self.debug: - cost_time = datetime.now() - self.time + cost_time = datetime.now().astimezone() - self.time time_cost = f'Time[{self.tag}]: Dev[{self.device}], Pid[{os.getpid()}], Fun[{self.fun}], ' \ f'Id[{self.index}], time[{cost_time}]' hot_time_cost = "Hotspot " + time_cost diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py index 22040a62aa..204833161c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py @@ -1,11 +1,11 @@ import inspect import fcntl -import json import os -import torch import threading +import json import numpy as np +import torch from ..common.utils import print_error_log, get_time from ..common.file_check_util import FileOpen @@ -170,7 +170,8 @@ class ForwardAPIInfo(APIInfo): def analyze_api_call_stack(self): stack_str = [] for (_, path, line, func, code, _) in inspect.stack()[3:]: - if not code: continue + if not code: + continue stack_line = " ".join([ "File", ", ".join([path, " ".join(["line", str(line)]), " ".join(["in", func]), " ".join(["\n", code[0].strip()])])]) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py index f389081807..d1e9c9637e 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py @@ -1,12 +1,7 @@ import os -import torch from pathlib import Path -from ..common.utils import print_warn_log, get_time, print_info_log -from ..dump.dump import forward_init_status, forward_acl_dump -from .utils import OverFlowUtil, dump_overflow -from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist -from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo -from ..dump import dump + +import torch try: import torch_npu @@ -15,6 +10,13 @@ except ImportError: else: is_gpu = False +from ..common.utils import print_warn_log, get_time, print_info_log +from ..dump.dump import forward_init_status, forward_acl_dump +from .utils import OverFlowUtil, dump_overflow +from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist +from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo +from ..dump import dump + backward_init_status = False api_overflow = [] forward_api_info = {} @@ -75,6 +77,7 @@ def check_data_overflow(x): def check_path(apis, path): return any(api in path for api in apis) + def overflow_check(name, **kwargs): overflow_nums = OverFlowUtil.overflow_nums pid = kwargs.get('pid') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py index a82a5106ff..380d84cb2c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py @@ -39,12 +39,14 @@ def catch_exception(func): def inner(*args, **kwargs): log = logging.getLogger() line = args[-1] if len(args) == 2 else "" + result = None try: - return func(*args, **kwargs) + result = func(*args, **kwargs) except OSError: log.error("%s: command not found" % line) except ParseException: log.error("Command execution failed") except SystemExit: log.warning("Please enter the correct command") + return result return inner diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py index fecc868429..02755e8e1f 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py @@ -15,36 +15,37 @@ # limitations under the License. """ -import setuptools -from pathlib import Path -import stat import os +import stat +from pathlib import Path +import setuptools VERSION = '3.0' + def generate_ptdbg_ascend_version(): - ptdbg_ascend_root = Path(__file__).parent - version_path = ptdbg_ascend_root / "ptdbg_ascend" / "common" / "version.py" - if version_path.exists(): - version_path.unlink() - flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL - modes = stat.S_IWUSR | stat.S_IRUSR - with os.fdopen(os.open(version_path, flags, modes), 'w') as f: - f.write("__version__ = '{version}'\n".format(version = VERSION)) + ptdbg_ascend_root = Path(__file__).parent + version_path = ptdbg_ascend_root / "ptdbg_ascend" / "common" / "version.py" + if version_path.exists(): + version_path.unlink() + flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL + modes = stat.S_IWUSR | stat.S_IRUSR + with os.fdopen(os.open(version_path, flags, modes), 'w') as f: + f.write("__version__ = '{version}'\n".format(version = VERSION)) generate_ptdbg_ascend_version() setuptools.setup(name='ptdbg_ascend', - version=VERSION, - description='This is a pytorch precision comparison tools', - long_description='This is a pytorch precision comparison tools, include overflow detect tool', - packages=setuptools.find_packages(), - install_requires = [ - "wheel", - "numpy", - "pandas >= 1.3.5", - "pyyaml" - ], - include_package_data=True, - ext_modules=[], - zip_safe=False) + version=VERSION, + description='This is a pytorch precision comparison tools', + long_description='This is a pytorch precision comparison tools, include overflow detect tool', + packages=setuptools.find_packages(), + install_requires = [ + "wheel", + "numpy", + "pandas >= 1.3.5", + "pyyaml" + ], + include_package_data=True, + ext_modules=[], + zip_safe=False) -- Gitee From aa1a4c63e11187201aabe8f518b644c28fb94327 Mon Sep 17 00:00:00 2001 From: sunyiming Date: Wed, 1 Nov 2023 06:41:20 +0000 Subject: [PATCH 3/4] Revert "Merge branch 'master' of gitee.com:ascend/att into master" This reverts commit 9a2239226f9cfe1379bdba5605a284f93b40cea0. --- ...0\203\275\350\257\264\346\230\216_v4.1.md" | 10 +- .../src/python/ptdbg_ascend/common/utils.py | 20 ++-- .../src/python/ptdbg_ascend/dump/utils.py | 102 ++++++++---------- .../online_dispatch/dump_compare.py | 4 +- .../ptdbg_ascend/overflow_check/info_dump.py | 17 ++- .../overflow_check/overflow_check.py | 4 +- .../parse_tool/lib/visualization.py | 4 +- .../ptdbg_ascend/test/ut/test_hooks.py | 2 +- 8 files changed, 69 insertions(+), 94 deletions(-) diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" index 393b6c241d..50220ae2fc 100644 --- "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" @@ -1208,14 +1208,10 @@ dump结果目录结构示例如下: **溢出检测dump场景** -register_hook设置了overflow_check时,检测API溢出,dump结果的文件名形式为{api_type}_{api_name}_{api调用次数}_{正反向}_{当前溢出次数},dump结果如下: +register_hook设置了overflow_check时,检测API溢出,dump结果的文件名固定为Overflow_info_{timestamp},dump结果如下: -* {api_type}_{api_name}_{api调用次数}_{正反向}_{当前溢出次数}.pkl -* {api_type}_{api_name}_{api调用次数}_{正反向}_{当前溢出次数}目录 - -例如: -* Tensor___add___1_forward_1.pkl -* Tensor___add___1_forward_1目录 +* Overflow_info_{timestamp}.pkl +* Overflow_info_{timestamp}目录 ## CPU或GPU与NPU精度数据比对 diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py index f2361a90b7..dfdaa0fc00 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -210,7 +210,7 @@ def make_dump_path_if_not_exists(dump_path): except OSError as ex: print_error_log( 'Failed to create {}.Please check the path permission or disk space .{}'.format(dump_path, str(ex))) - raise CompareException(CompareException.INVALID_PATH_ERROR) from ex + raise CompareException(CompareException.INVALID_PATH_ERROR) else: if not os.path.isdir(dump_path): print_error_log('{} already exists and is not a directory.'.format(dump_path)) @@ -253,11 +253,7 @@ def print_warn_log(warn_msg): _print_log("WARNING", warn_msg) -def check_mode_valid(mode, scope=None, api_list=None): - if scope is None: - scope = [] - if api_list is None: - api_list = [] +def check_mode_valid(mode, scope=[], api_list=[]): if not isinstance(scope, list): raise ValueError("scope param set invalid, it's must be a list.") if not isinstance(api_list, list): @@ -276,8 +272,8 @@ def check_mode_valid(mode, scope=None, api_list=None): (mode, Const.DUMP_MODE) raise CompareException(CompareException.INVALID_DUMP_MODE, msg) - if mode_check.get(mode)() is not None: - raise mode_check.get(mode)() + if mode_check[mode]() is not None: + raise mode_check[mode]() def check_switch_valid(switch): @@ -355,8 +351,8 @@ def _check_pkl(pkl_file_handle, file_name): pkl_file_handle.seek(0, 0) -def is_starts_with(string, prefix_list): - return any(string.startswith(prefix) for prefix in prefix_list) +def is_starts_with(string, prefixes): + return any(string.startswith(prefix) for prefix in prefixes) def check_file_mode(npu_pkl, bench_pkl, stack_mode): @@ -401,9 +397,9 @@ def remove_path(path): os.remove(path) else: shutil.rmtree(path) - except PermissionError as err: + except PermissionError: print_error_log("Failed to delete {}. Please check the permission.".format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) from err + raise CompareException(CompareException.INVALID_PATH_ERROR) def get_dump_data_path(dump_dir): diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index a7cdedb235..4a8a03620e 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -17,41 +17,6 @@ dump_count = 0 range_begin_flag, range_end_flag = False, False -def check_list_or_acl_mode(name_prefix): - global dump_count - for item in DumpUtil.dump_switch_scope: - if name_prefix.startswith(item): - dump_count = dump_count + 1 - return True - - -def check_range_mode(name_prefix): - global range_begin_flag - global range_end_flag - if name_prefix.startswith(DumpUtil.dump_switch_scope[0]): - range_begin_flag = True - return True - if name_prefix.startswith(DumpUtil.dump_switch_scope[1]): - range_end_flag = True - return True - if range_begin_flag and not range_end_flag: - return True - return False - - -def check_stack_mode(name_prefix): - if len(DumpUtil.dump_switch_scope) == 0: - return True - elif len(DumpUtil.dump_switch_scope) == 1: - return name_prefix.startswith(DumpUtil.dump_switch_scope[0]) - elif len(DumpUtil.dump_switch_scope) == 2: - return check_range_mode(name_prefix) - else: - print_error_log("dump scope is invalid, Please set the scope mode in" - " set_dump_switch with 'all', 'list', 'range', 'stack', 'acl', 'api_list'!") - return False - - class DumpUtil(object): dump_root = None dump_data_dir = None @@ -106,6 +71,40 @@ class DumpUtil(object): DumpUtil.dump_switch_scope = [api_name.replace("backward", "forward") for api_name in scope] DumpUtil.summary_only = summary_only + def check_list_or_acl_mode(name_prefix): + global dump_count + result = False + for item in DumpUtil.dump_switch_scope: + if name_prefix.startswith(item): + dump_count = dump_count + 1 + result = True + return result + + def check_range_mode(name_prefix): + global range_begin_flag + global range_end_flag + if name_prefix.startswith(DumpUtil.dump_switch_scope[0]): + range_begin_flag = True + return True + if name_prefix.startswith(DumpUtil.dump_switch_scope[1]): + range_end_flag = True + return True + if range_begin_flag and not range_end_flag: + return True + return False + + def check_stack_mode(name_prefix): + if len(DumpUtil.dump_switch_scope) == 0: + return True + elif len(DumpUtil.dump_switch_scope) == 1: + return name_prefix.startswith(DumpUtil.dump_switch_scope[0]) + elif len(DumpUtil.dump_switch_scope) == 2: + return DumpUtil.check_range_mode(name_prefix) + else: + print_error_log("dump scope is invalid, Please set the scope mode in" + " set_dump_switch with 'all', 'list', 'range', 'stack', 'acl', 'api_list'!") + return False + check_mapper = { Const.LIST: check_list_or_acl_mode, Const.ACL: check_list_or_acl_mode, @@ -199,14 +198,7 @@ def generate_dump_path_str(): return dump_path -def set_dump_switch(switch, mode=Const.ALL, scope=None, api_list=None, filter_switch=Const.OFF, dump_mode=None, - summary_only=False): - if scope is None: - scope = [] - if api_list is None: - api_list = [] - if dump_mode is None: - dump_mode = [Const.ALL] +def set_dump_switch(switch, mode=Const.ALL, scope=[], api_list=[], filter_switch=Const.ON, dump_mode=[Const.ALL], summary_only=False): check_switch_valid(switch) if not DumpUtil.dump_path: set_dump_path() @@ -217,18 +209,10 @@ def set_dump_switch(switch, mode=Const.ALL, scope=None, api_list=None, filter_sw if check_is_npu() and DumpUtil.dump_switch_mode in [Const.ALL, Const.API_STACK, Const.LIST, Const.RANGE]: generate_compare_script(DumpUtil.dump_data_dir, dump.get_pkl_file_path(), DumpUtil.dump_switch_mode) set_dump_switch_print_info(switch, mode, dump_path_str) - set_dump_switch_config(mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, dump_mode=dump_mode, - summary_only=summary_only) - - -def set_dump_switch_config(mode=Const.ALL, scope=None, api_list=None, filter_switch=Const.OFF, dump_mode=None, - summary_only=False): - if scope is None: - scope = [] - if api_list is None: - api_list = [] - if dump_mode is None: - dump_mode = [Const.ALL] + set_dump_switch_config(mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, dump_mode=dump_mode,summary_only=summary_only) + + +def set_dump_switch_config(mode=Const.ALL, scope=[], api_list=[], filter_switch=Const.ON, dump_mode=[Const.ALL], summary_only=False): try: check_mode_valid(mode, scope, api_list) check_switch_valid(filter_switch) @@ -236,10 +220,10 @@ def set_dump_switch_config(mode=Const.ALL, scope=None, api_list=None, filter_swi summary_only = check_summary_only_valid(summary_only) except (CompareException, AssertionError) as err: print_error_log(str(err)) - raise CompareException(CompareException.INVALID_PARAM_ERROR) from err + raise CompareException(CompareException.INVALID_PARAM_ERROR) switch = DumpUtil.dump_switch DumpUtil.set_dump_switch("OFF", mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, - dump_mode=dump_mode, summary_only=summary_only) + dump_mode=dump_mode, summary_only=summary_only) DumpUtil.dump_switch = switch @@ -308,9 +292,9 @@ def load_env_dump_path(dump_path): if dump_path: try: dump_path = os.path.join(str(dump_path), Const.DUMP_DIR) - except TypeError as err: + except TypeError: print_error_log("Generating dump path from environment variables ASCEND_WORK_PATH failed.") - raise DumpException(DumpException.INVALID_PATH_ERROR) from err + raise DumpException(DumpException.INVALID_PATH_ERROR) else: print_error_log("Dump path is None, you can configure it in the following ways:\n" "1. Configure set_dump_path function.\n" diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py index 902098b957..2f0818ffe8 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py @@ -140,8 +140,8 @@ def save_summery(run_param, npu_data, cpu_data, prefix, summery_list, compute_fl data_dict[CompareConst.COSINE], data_dict[CompareConst.MAX_ABS_ERR], data_dict[CompareConst.MAX_RELATIVE_ERR], \ data_dict[CompareConst.ERROR_MESSAGE] = get_compare_result(npu_data, cpu_data) - data_dict[CompareConst.ACCURACY] = check_accuracy(data_dict.get(CompareConst.COSINE), - data_dict.get(CompareConst.MAX_ABS_ERR)) + data_dict[CompareConst.ACCURACY] = check_accuracy(data_dict[CompareConst.COSINE], + data_dict[CompareConst.MAX_ABS_ERR]) else: data_dict[CompareConst.COSINE] = 1 data_dict[CompareConst.MAX_ABS_ERR] = 0 diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py index 5a4b0bf3be..204833161c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py @@ -42,29 +42,28 @@ class APIInfo: out = [] for item in element: out.append(self.analyze_element(item)) - return out elif isinstance(element, dict): - out_dict = {} + out = {} for key, value in element.items(): if key in self.torch_object_key.keys(): fun = self.torch_object_key[key] - out_dict[key] = fun(value) + out[key] = fun(value) elif key in special_torch_object: continue else: - out_dict[key] = self.analyze_element(value) - return out_dict + out[key] = self.analyze_element(value) + elif isinstance(element, torch.Tensor): - out_tensor = self.analyze_tensor(element, self.save_real_data) - return out_tensor + out = self.analyze_tensor(element, self.save_real_data) + elif self.is_builtin_class(element): - out_builtin = self.analyze_builtin(element) - return out_builtin + out = self.analyze_builtin(element) else: msg = f"Type {type(element)} is unsupported at analyze_element" print_error_log(msg) raise NotImplementedError(msg) + return out def analyze_tensor(self, arg, save_real_data): single_arg = {} diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py index dc43c2c3be..d1e9c9637e 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py @@ -24,7 +24,6 @@ backward_api_info = {} FORWARD_REAL_DATA_PATH = os.path.join('./', 'forward_real_data') BACKWARD_REAL_DATA_PATH = os.path.join('./', 'backward_real_data') rank = os.getpid() -pkl_name = '' def check_overflow_environment(pid): @@ -138,7 +137,7 @@ def overflow_check(name, **kwargs): backward_api_info.update({name: BackwardAPIInfo(name, out_feat)}) OverFlowUtil.inc_overflow_dump_times() dump_file_name = os.path.join(dump_dir, - "{}_{}.pkl".format(module_name, OverFlowUtil.real_overflow_dump_times)) + "Overflow_info_{}_{}.pkl".format(get_time(), OverFlowUtil.real_overflow_dump_times)) dump_overflow(module_name, in_feat, out_feat, dump_file_name) dump.pkl_name = dump_file_name @@ -157,6 +156,7 @@ def overflow_check(name, **kwargs): write_api_info_json(backward_api_info[key]) raise ValueError("[overflow {} times]: dump file is saved in '{}'." .format(OverFlowUtil.real_overflow_dump_times, os.path.realpath(dump_file_name))) + return def overflow_type_judge(in_feat, out_feat, module_name): if module_name.endswith(Const.BACKWARD): diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/visualization.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/visualization.py index 10dde7e089..a46f719666 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/visualization.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/visualization.py @@ -32,7 +32,7 @@ class Visualization: except UnicodeError as e: self.util.log.error("%s %s" % ("UnicodeError", str(e))) self.util.log.warning("Please check the npy file") - raise ParseException(ParseException.PARSE_UNICODE_ERROR) from e + raise ParseException(ParseException.PARSE_UNICODE_ERROR) table = self.util.create_table('', ['Index', 'Data']) flatten_data = np_data.flatten() for i in range(min(16, int(np.ceil(flatten_data.size / 8)))): @@ -66,7 +66,7 @@ class Visualization: except json.JSONDecodeError as e: self.util.log.error("%s %s in line %s" % ("JSONDecodeError", str(e), pkl_line)) self.util.log.warning("Please check the pkl file") - raise ParseException(ParseException.PARSE_JSONDECODE_ERROR) from e + raise ParseException(ParseException.PARSE_JSONDECODE_ERROR) info_prefix = msg[0] if not info_prefix.startswith(api_name): continue diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_hooks.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_hooks.py index 82f3d8dfed..7874d3c2fa 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_hooks.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_hooks.py @@ -17,7 +17,7 @@ class TestUtilsMethods(unittest.TestCase): self.assertTrue(dump_util.dump_init_enable) self.assertEqual(dump_util.dump_switch_scope, []) self.assertEqual(dump_util.dump_api_list, []) - self.assertEqual(dump_util.dump_filter_switch, "OFF") + self.assertEqual(dump_util.dump_filter_switch, switch_on) self.assertEqual(dump_count, 0) def test_set_dump_switch_mode_is_list(self): -- Gitee From 6b3d1d5bc55acf9092470e4e7783cd21201b5f22 Mon Sep 17 00:00:00 2001 From: sunyiming Date: Wed, 1 Nov 2023 06:41:35 +0000 Subject: [PATCH 4/4] Revert "clearcode" This reverts commit 9abe240fe4882955907f19d8052533fde8518f6b. --- .../src/python/ptdbg_ascend/__init__.py | 1 - .../src/python/ptdbg_ascend/common/log.py | 1 - .../src/python/ptdbg_ascend/common/utils.py | 3 -- .../compare/distributed_compare.py | 3 +- .../debugger/precision_debugger.py | 1 - .../src/python/ptdbg_ascend/dump/dump.py | 10 ++-- .../src/python/ptdbg_ascend/dump/utils.py | 4 +- .../ptdbg_ascend/hook_module/hook_module.py | 2 - .../hook_module/wrap_npu_custom.py | 1 - .../ptdbg_ascend/online_dispatch/dispatch.py | 6 +-- .../online_dispatch/dump_compare.py | 4 +- .../ptdbg_ascend/overflow_check/info_dump.py | 7 ++- .../overflow_check/overflow_check.py | 17 +++---- .../parse_tool/lib/parse_exception.py | 4 +- .../ptdbg_ascend/src/python/setup.py | 49 +++++++++---------- 15 files changed, 44 insertions(+), 69 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py index 4c3228c478..920deafde9 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py @@ -30,7 +30,6 @@ from .common.utils import seed_all, torch_without_guard_version, print_info_log from .debugger.precision_debugger import PrecisionDebugger seed_all() - def jit_script(obj, optimize=None, _frames_up=0, _rcb=None, example_input=None): print_info_log("The torch_npu earlier than 2.1 does not support torch.jit.script. " "Therefore, to ensure that the dump data of the GPU and NPU is consistent, " diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py index 32c3423551..a7b419866d 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py @@ -2,7 +2,6 @@ import os import time import sys - def _print_log(level, msg, end='\n'): current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) pid = os.getgid() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py index dfdaa0fc00..30fe40a95b 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -281,7 +281,6 @@ def check_switch_valid(switch): print_error_log("Please set switch with 'ON' or 'OFF'.") raise CompareException(CompareException.INVALID_PARAM_ERROR) - def check_dump_mode_valid(dump_mode): if not isinstance(dump_mode, list): print_warn_log("Please set dump_mode as a list.") @@ -296,14 +295,12 @@ def check_dump_mode_valid(dump_mode): return ["forward", "backward", "input", "output"] return dump_mode - def check_summary_only_valid(summary_only): if not isinstance(summary_only, bool): print_error_log("Params summary_only only support True or False.") raise CompareException(CompareException.INVALID_PARAM_ERROR) return summary_only - def check_compare_param(input_parma, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False): # 添加默认值来让不传参时能通过参数检查 if not (isinstance(input_parma, dict) and isinstance(output_path, str) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py index c58db903dc..d92b0a145b 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -import os -import sys +import os, sys import re from ..common.utils import print_error_log, CompareException, check_compare_param from .acc_compare import compare_core diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py index b2cfdaba44..79066d6ce3 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py @@ -108,7 +108,6 @@ class PrecisionDebugger: PrecisionDebugger.step() PrecisionDebugger.start() - def iter_tracer(func): def func_wrapper(*args, **kwargs): PrecisionDebugger.stop() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py index 5238754cce..dcf2a8b2da 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py @@ -18,11 +18,11 @@ import inspect import json import os -import threading -from pathlib import Path - import numpy as np import torch +import threading + +from pathlib import Path try: import torch_npu @@ -47,7 +47,6 @@ pkl_name = "" rank = os.getpid() multi_output_apis = ["_sort_", "npu_flash_attention"] - class DataInfo(object): def __init__(self, data, save_data, summary_data, dtype, shape): self.data = data @@ -184,7 +183,6 @@ def dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file): if 'output' in DumpUtil.dump_mode: dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file) - def rename_(): global rank global pkl_name @@ -200,7 +198,6 @@ def rename_(): os.rename(dir_name, new_name) pkl_name = os.path.join(new_name, file_name) - def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): dump_file = DumpUtil.get_dump_path() dump_file = modify_dump_path(dump_file, DumpUtil.dump_switch_mode) @@ -359,6 +356,5 @@ def write_to_disk(): change_mode(pkl_name, FileCheckConst.DATA_FILE_AUTHORITY) api_list = [] - def get_pkl_file_path(): return pkl_name diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index 4a8a03620e..35f5ad4a90 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -73,12 +73,10 @@ class DumpUtil(object): def check_list_or_acl_mode(name_prefix): global dump_count - result = False for item in DumpUtil.dump_switch_scope: if name_prefix.startswith(item): dump_count = dump_count + 1 - result = True - return result + return True def check_range_mode(name_prefix): global range_begin_flag diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py index 83f7dcacef..a3cb10bf4f 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py @@ -24,10 +24,8 @@ import torch.utils.hooks as full_hooks g_stop_hook = False - class HOOKModule(nn.Module): module_count = {} - def __init__(self, hook) -> None: super(HOOKModule, self).__init__() self.has_overflow = False diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py index f2e1e8f9d3..a507805f0b 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py @@ -48,7 +48,6 @@ class NpuOPTemplate(HOOKModule): else: return getattr(torch_npu._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) - def wrap_npu_op(op_name, hook): def npu_op_template(*args, **kwargs): diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py index 1b16502fb7..53f104d45e 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py @@ -1,12 +1,10 @@ import os import time -from pathlib import Path -from multiprocessing import Manager, Pool - import yaml import json +from pathlib import Path +from multiprocessing import Manager, Pool import torch - from torch.utils._python_dispatch import TorchDispatchMode try: diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py index 2f0818ffe8..a4740fef88 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py @@ -44,11 +44,11 @@ class TimeStatistics: def __enter__(self): if self.debug: - self.time = datetime.now().astimezone() + self.time = datetime.now() def __exit__(self, exc_type, exc_val, exc_tb): if self.debug: - cost_time = datetime.now().astimezone() - self.time + cost_time = datetime.now() - self.time time_cost = f'Time[{self.tag}]: Dev[{self.device}], Pid[{os.getpid()}], Fun[{self.fun}], ' \ f'Id[{self.index}], time[{cost_time}]' hot_time_cost = "Hotspot " + time_cost diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py index 204833161c..22040a62aa 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py @@ -1,11 +1,11 @@ import inspect import fcntl +import json import os +import torch import threading -import json import numpy as np -import torch from ..common.utils import print_error_log, get_time from ..common.file_check_util import FileOpen @@ -170,8 +170,7 @@ class ForwardAPIInfo(APIInfo): def analyze_api_call_stack(self): stack_str = [] for (_, path, line, func, code, _) in inspect.stack()[3:]: - if not code: - continue + if not code: continue stack_line = " ".join([ "File", ", ".join([path, " ".join(["line", str(line)]), " ".join(["in", func]), " ".join(["\n", code[0].strip()])])]) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py index d1e9c9637e..f389081807 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py @@ -1,7 +1,12 @@ import os -from pathlib import Path - import torch +from pathlib import Path +from ..common.utils import print_warn_log, get_time, print_info_log +from ..dump.dump import forward_init_status, forward_acl_dump +from .utils import OverFlowUtil, dump_overflow +from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist +from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo +from ..dump import dump try: import torch_npu @@ -10,13 +15,6 @@ except ImportError: else: is_gpu = False -from ..common.utils import print_warn_log, get_time, print_info_log -from ..dump.dump import forward_init_status, forward_acl_dump -from .utils import OverFlowUtil, dump_overflow -from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist -from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo -from ..dump import dump - backward_init_status = False api_overflow = [] forward_api_info = {} @@ -77,7 +75,6 @@ def check_data_overflow(x): def check_path(apis, path): return any(api in path for api in apis) - def overflow_check(name, **kwargs): overflow_nums = OverFlowUtil.overflow_nums pid = kwargs.get('pid') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py index 380d84cb2c..a82a5106ff 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py @@ -39,14 +39,12 @@ def catch_exception(func): def inner(*args, **kwargs): log = logging.getLogger() line = args[-1] if len(args) == 2 else "" - result = None try: - result = func(*args, **kwargs) + return func(*args, **kwargs) except OSError: log.error("%s: command not found" % line) except ParseException: log.error("Command execution failed") except SystemExit: log.warning("Please enter the correct command") - return result return inner diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py index 02755e8e1f..fecc868429 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py @@ -15,37 +15,36 @@ # limitations under the License. """ -import os -import stat -from pathlib import Path import setuptools +from pathlib import Path +import stat +import os VERSION = '3.0' - def generate_ptdbg_ascend_version(): - ptdbg_ascend_root = Path(__file__).parent - version_path = ptdbg_ascend_root / "ptdbg_ascend" / "common" / "version.py" - if version_path.exists(): - version_path.unlink() - flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL - modes = stat.S_IWUSR | stat.S_IRUSR - with os.fdopen(os.open(version_path, flags, modes), 'w') as f: - f.write("__version__ = '{version}'\n".format(version = VERSION)) + ptdbg_ascend_root = Path(__file__).parent + version_path = ptdbg_ascend_root / "ptdbg_ascend" / "common" / "version.py" + if version_path.exists(): + version_path.unlink() + flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL + modes = stat.S_IWUSR | stat.S_IRUSR + with os.fdopen(os.open(version_path, flags, modes), 'w') as f: + f.write("__version__ = '{version}'\n".format(version = VERSION)) generate_ptdbg_ascend_version() setuptools.setup(name='ptdbg_ascend', - version=VERSION, - description='This is a pytorch precision comparison tools', - long_description='This is a pytorch precision comparison tools, include overflow detect tool', - packages=setuptools.find_packages(), - install_requires = [ - "wheel", - "numpy", - "pandas >= 1.3.5", - "pyyaml" - ], - include_package_data=True, - ext_modules=[], - zip_safe=False) + version=VERSION, + description='This is a pytorch precision comparison tools', + long_description='This is a pytorch precision comparison tools, include overflow detect tool', + packages=setuptools.find_packages(), + install_requires = [ + "wheel", + "numpy", + "pandas >= 1.3.5", + "pyyaml" + ], + include_package_data=True, + ext_modules=[], + zip_safe=False) -- Gitee