diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py index 754152169e6465ff3b80a445fe7a53ec13cb421a..bd91f33a3ed70ee35cc54fc618d714fc562fd9dd 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py @@ -20,9 +20,9 @@ import pandas as pd from .advisor_result import AdvisorResult from .advisor_const import AdvisorConst -from ..common import utils -from ..common.utils import CompareException, CompareConst, Const +from ..common.utils import CompareException, CompareConst from ..common.utils import print_info_log, print_warn_log, print_error_log +from ..common.file_check_util import FileChecker, FileCheckConst class Advisor: @@ -35,9 +35,6 @@ class Advisor: self.out_path = os.path.realpath(out_path) def _parse_input_file(self): - if not self.input_file.endswith(".csv"): - print_error_log("Advisor only support csv file from ptdbg_ascend result.") - raise CompareException(CompareException.INVALID_FILE_ERROR) try: df = pd.read_csv(self.input_file, on_bad_lines='skip') except OSError as os_err: @@ -54,9 +51,12 @@ class Advisor: df.iloc[:, 0] += 2 return df - def _check_result_file(self): - utils.check_file_or_directory_path(self.input_file) - utils.check_file_size(self.input_file, Const.ONE_GB) + def _check_path_vaild(self): + input_file_checker = FileChecker(self.input_file, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.CSV_SUFFIX) + input_file_checker.common_check() + out_path_checker = FileChecker(self.out_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + out_path_checker.common_check() def gen_advisor_message(self, node_name): if AdvisorConst.FORWARD in node_name: @@ -107,7 +107,7 @@ class Advisor: return result def analysis(self): - self._check_result_file() + self._check_path_vaild() analyze_data = 
self._parse_input_file() print_info_log("Start analyzing the comparison result: %s" % self.input_file) self.analyze_unmatched(analyze_data) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py index 95a6774763c5d70dccd0dc67131c329dd1d15480..a098e0b82573167b86e48bf7f841669402563814 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py @@ -20,6 +20,7 @@ import time from .advisor_const import AdvisorConst from ..common.utils import Const from ..common.utils import print_info_log, print_error_log +from ..common.file_check_util import change_mode, FileCheckConst class AdvisorResult: @@ -41,6 +42,7 @@ class AdvisorResult: output_file.truncate(0) message_list = [message + AdvisorConst.NEW_LINE for message in message_list] output_file.writelines(message_list) + change_mode(result_file, FileCheckConst.DATA_FILE_AUTHORITY) except IOError as io_error: print_error_log("Failed to save %s, the reason is %s." 
% (result_file, io_error)) else: diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py index b8f7f4ed2dfaabf5c20ddce46ee606f96bfaa6f1..2ecdc79bf3e8d225b36954438985f6dce42ac146 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py @@ -17,7 +17,7 @@ import os import re -from .utils import print_warn_log, print_error_log +from .log import print_warn_log, print_error_log class FileCheckConst: @@ -34,14 +34,26 @@ class FileCheckConst: NUMPY_SUFFIX = ".npy" JSON_SUFFIX = ".json" PT_SUFFIX = ".pt" + CSV_SUFFIX = ".csv" + YAML_SUFFIX = ".yaml" MAX_PKL_SIZE = 1 * 1024 * 1024 * 1024 MAX_NUMPY_SIZE = 10 * 1024 * 1024 * 1024 MAX_JSON_SIZE = 1 * 1024 * 1024 * 1024 MAX_PT_SIZE = 10 * 1024 * 1024 * 1024 + MAX_CSV_SIZE = 1 * 1024 * 1024 * 1024 + MAX_YAML_SIZE = 10 * 1024 * 1024 DIR = "dir" FILE = "file" DATA_DIR_AUTHORITY = 0o750 DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + PKL_SUFFIX: MAX_PKL_SIZE, + NUMPY_SUFFIX: MAX_NUMPY_SIZE, + JSON_SUFFIX: MAX_JSON_SIZE, + PT_SUFFIX: MAX_PT_SIZE, + CSV_SUFFIX: MAX_CSV_SIZE, + YAML_SUFFIX: MAX_YAML_SIZE + } class FileCheckException(Exception): @@ -94,6 +106,7 @@ class FileChecker: check_link(self.file_path) check_path_length(self.file_path) check_path_exists(self.file_path) + check_path_type(self.file_path, self.path_type) self.check_path_ability() check_path_owner_consistent(self.file_path) check_path_pattern_vaild(self.file_path) @@ -234,14 +247,10 @@ def check_file_size(file_path, max_size): def check_common_file_size(file_path): if os.path.isfile(file_path): - if file_path.endswith(FileCheckConst.PKL_SUFFIX): - check_file_size(file_path, FileCheckConst.MAX_PKL_SIZE) - if file_path.endswith(FileCheckConst.NUMPY_SUFFIX): - check_file_size(file_path, 
FileCheckConst.MAX_NUMPY_SIZE) - if file_path.endswith(FileCheckConst.JSON_SUFFIX): - check_file_size(file_path, FileCheckConst.MAX_JSON_SIZE) - if file_path.endswith(FileCheckConst.PT_SUFFIX): - check_file_size(file_path, FileCheckConst.MAX_PT_SIZE) + for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): + if file_path.endswith(suffix): + check_file_size(file_path, max_size) + break def check_file_suffix(file_path, file_suffix): @@ -252,7 +261,7 @@ def check_file_suffix(file_path, file_suffix): raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR) -def check_file_type(file_path, file_type): +def check_path_type(file_path, file_type): real_path = os.path.realpath(file_path) if file_type == FileCheckConst.FILE: if not os.path.isfile(real_path): diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py new file mode 100644 index 0000000000000000000000000000000000000000..a7b419866d3b01551db4aa90a29b65c4ae1a7f33 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py @@ -0,0 +1,39 @@ +import os +import time +import sys + +def _print_log(level, msg, end='\n'): + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) + pid = os.getpid() + print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end) + sys.stdout.flush() + + +def print_info_log(info_msg, end='\n'): + """ + Function Description: + print info log. + Parameter: + info_msg: the info message. + """ + _print_log("INFO", info_msg, end=end) + + +def print_error_log(error_msg): + """ + Function Description: + print error log. + Parameter: + error_msg: the error message. + """ + _print_log("ERROR", error_msg) + + +def print_warn_log(warn_msg): + """ + Function Description: + print warn log. + Parameter: + warn_msg: the warning message. 
+ """ + _print_log("WARNING", warn_msg) \ No newline at end of file diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py index ecfc420fdf580e393c31c7d477a05bbde3c61f95..e6787c467e0dc964f8d7184891d4e762288f3da1 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -29,6 +29,8 @@ from pathlib import Path import numpy as np import torch +from .file_check_util import FileOpen, FileChecker, FileCheckConst + try: import torch_npu except ImportError: @@ -332,29 +334,10 @@ def check_file_or_directory_path(path, isdir=False): when invalid data throw exception """ if isdir: - if not os.path.exists(path): - print_error_log('The path {} is not exist.'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - - if not os.path.isdir(path): - print_error_log('The path {} is not a directory.'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - - if not os.access(path, os.W_OK): - print_error_log( - 'The path {} does not have permission to write. Please check the path permission'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) + path_checker = FileChecker(path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) else: - if not os.path.isfile(path): - print_error_log('{} is an invalid file or non-exist.'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - - check_file_valid(path) - - if not os.access(path, os.R_OK): - print_error_log( - 'The path {} does not have permission to read. 
Please check the path permission'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) + path_checker = FileChecker(path, FileCheckConst.FILE, FileCheckConst.READ_ABLE) + path_checker.common_check() def _check_pkl(pkl_file_handle, file_name): @@ -632,7 +615,7 @@ def generate_compare_script(dump_path, pkl_file_path, dump_switch_mode): is_api_stack = "True" if dump_switch_mode == Const.API_STACK else "False" try: - with open(template_path, 'r') as ftemp, \ + with FileOpen(template_path, 'r') as ftemp, \ os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: code_temp = ftemp.read() fout.write(code_temp % (pkl_file_path, dump_path, is_api_stack)) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/version.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/version.py index 8aa4e6974e74ce2b6037a2e81569a71561791c2b..f7d2e869417b85069ceed6e72b4a0c28f153ce64 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/version.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/version.py @@ -1 +1 @@ -__version__ = '4.0' \ No newline at end of file +__version__ = '3.0' diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py index ea581a95aed1fb58311cac0a3df04c242e6cd5c2..1fac930d60ea17a38d55e57c2da800c95324c321 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py @@ -27,7 +27,8 @@ import pandas as pd from ..advisor.advisor import Advisor from ..common.utils import check_compare_param, add_time_as_suffix, \ print_warn_log, print_error_log, CompareException, Const,\ - CompareConst, format_value, check_file_not_exists, check_file_valid + CompareConst, format_value, check_file_not_exists 
+from ..common.file_check_util import FileChecker, FileCheckConst, change_mode def correct_data(result): @@ -88,13 +89,13 @@ def get_max_relative_err(n_value, b_value): np.seterr(divide='ignore', invalid='ignore') if b_value.dtype in CompareConst.FLOAT_TYPE: zero_mask = (b_value == 0) - b_value[zero_mask] += np.finfo(b_value.dtype).eps - n_value[zero_mask] += np.finfo(b_value.dtype).eps + b_value[zero_mask] += np.finfo(b_value.dtype).eps + n_value[zero_mask] += np.finfo(b_value.dtype).eps else: n_value, b_value = n_value.astype(float), b_value.astype(float) zero_mask = (b_value == 0) - b_value[zero_mask] += np.finfo(float).eps - n_value[zero_mask] += np.finfo(float).eps + b_value[zero_mask] += np.finfo(float).eps + n_value[zero_mask] += np.finfo(float).eps relative_err = np.divide((n_value - b_value), b_value) max_relative_err = np.max(np.abs(relative_err)) if np.isnan(max_relative_err): @@ -144,7 +145,7 @@ def check_type_shape_match(npu_struct, bench_struct): shape_match = npu_shape == bench_shape type_match = npu_type == bench_type if not type_match: - if [npu_type, bench_type] in [["torch.float16", "torch.float32"], ["torch.float32", "torch.float16"], + if [npu_type, bench_type] in [["torch.float16", "torch.float32"], ["torch.float32", "torch.float16"], ["torch.float16", "torch.bfloat16"], ["torch.bfloat16", "torch.float16"]]: type_match = True else: @@ -412,8 +413,12 @@ def compare_by_op(op_name, op_name_mapping_dict, input_parma): try: n_path = os.path.join(input_parma.get("npu_dump_data_dir"), npu_bench_name_list[0] + ".npy") b_path = os.path.join(input_parma.get("bench_dump_data_dir"), npu_bench_name_list[1] + ".npy") - check_file_valid(n_path) - check_file_valid(b_path) + n_path_checker = FileChecker(n_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX) + b_path_checker = FileChecker(b_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX) + n_path = n_path_checker.common_check() + b_path = 
b_path_checker.common_check() n_value = np.load(n_path) b_value = np.load(b_path) except IOError as error: @@ -434,11 +439,10 @@ def compare_by_op(op_name, op_name_mapping_dict, input_parma): err_msg = " Dtype of NPU and bench Tensor do not match." else: err_msg = "" - + n_value, b_value = handle_inf_nan(n_value, b_value) if n_value is CompareConst.NAN or b_value is CompareConst.NAN: return "N/A", "N/A", "N/A", "The position of inf or nan in NPU and bench Tensor do not match." - n_value = n_value.reshape(-1).astype(float) b_value = b_value.reshape(-1).astype(float) @@ -509,6 +513,7 @@ def compare_core(input_parma, output_path, npu_pkl, bench_pkl, stack_mode=False, result_df.to_csv(fout, index=False) _do_multi_process(input_parma, file_path) + change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) if auto_analyze: advisor = Advisor(file_path, output_path) advisor.analysis() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py index 599735823e6191e37f72952b4f23341929054098..de1ff96b2633bd4e123e963c00c10538c92d72ba 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py @@ -45,9 +45,7 @@ class PrecisionDebugger: set_dump_switch_config(mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, dump_mode=input_output_mode, summary_only=summary_only) if mode == 'acl': - if not acl_config: - raise ValueError("acl_config must be configured when mode is 'acl'") - DumpUtil.dump_config = acl_config + DumpUtil.set_acl_config(acl_config) if not scope or not isinstance(scope, list) or len(scope) != 1: raise ValueError("scope must be congfigured as a list with one api name") if isinstance(scope[0], str) and 'backward' in scope[0] and not backward_input: @@ -58,9 +56,7 @@ class 
PrecisionDebugger: def configure_overflow_dump(self, mode="api", acl_config=None, overflow_nums=1, filter_switch = Const.OFF): if mode == "acl": DumpUtil.dump_switch_mode = mode - DumpUtil.dump_config = acl_config - if acl_config is None: - raise ValueError("acl_config must be configured when mode is 'acl'") + DumpUtil.set_acl_config(acl_config) init_overflow_nums(overflow_nums) check_switch_valid(filter_switch) OverFlowUtil.overflow_filter_switch = filter_switch diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py index 41e63cb0cdaeef8efe70a483e2df1fd098989cf9..dcf2a8b2da6e3dbe6e3a5ca3d917f498d56cc27b 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py @@ -34,6 +34,7 @@ else: from .utils import DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path from ..dump.utils import check_writable +from ..common.file_check_util import FileOpen, change_mode, FileCheckConst, check_path_pattern_vaild, check_path_length forward_init_status = False backward_init_status = False @@ -134,8 +135,11 @@ def dump_data(dump_file_name, dump_step, prefix, data_info): try: if json_dump_condition(prefix): output_path = os.path.join(DumpUtil.dump_data_dir, f'{prefix}.npy') + check_path_length(output_path) + check_path_pattern_vaild(output_path) if not DumpUtil.summary_only: np.save(output_path, data_info.save_data) + change_mode(output_path, FileCheckConst.DATA_FILE_AUTHORITY) api_list.append([prefix, dump_step, [], data_info.dtype, data_info.shape, data_info.summary_data]) print_info_log(f"ptdbg is dumping rank{rank} api: {prefix}" + " " * 10, end='\r') except Exception as e: @@ -188,7 +192,7 @@ def rename_(): new_name = os.path.join(DumpUtil.dump_root, 
"step{}".format(DumpUtil.iter_num), "rank{}".format(rank)) else: dir_name = os.path.join(DumpUtil.dump_root, "rank{}".format(os.getpid())) - new_name = os.path.join(DumpUtil.dump_root, "rank{}".format(rank)) + new_name = os.path.join(DumpUtil.dump_root, "rank{}".format(rank)) if not os.path.exists(new_name) and os.path.exists(dir_name): _, file_name = os.path.split(pkl_name) os.rename(dir_name, new_name) @@ -203,14 +207,14 @@ def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): global rank dump_dir, dump_filename = os.path.split(dump_file) if DumpUtil.target_iter: - dump_dir = os.path.join(dump_dir, "step{}".format(DumpUtil.iter_num)) + dump_dir = os.path.join(dump_dir, "step{}".format(DumpUtil.iter_num)) if not os.path.exists(dump_dir): - Path(dump_dir).mkdir(mode=0o750, exist_ok=True) + Path(dump_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) dump_file = os.path.join(dump_dir, dump_filename) rank_this = get_tensor_rank(in_feat, out_feat) DumpUtil.dump_root = os.path.dirname(DumpUtil.dump_path) if rank_this is not None and rank != rank_this: - rank = rank_this + rank = rank_this rename_() if not DumpUtil.dump_init_enable: if '.pkl' in dump_filename: @@ -225,6 +229,8 @@ def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): if rank != DumpUtil.target_rank: return dump_file = create_dirs_if_not_exist(rank, dump_file) + check_path_pattern_vaild(dump_file) + check_path_length(dump_file) global pkl_name pkl_name = dump_file if DumpUtil.dump_init_enable: @@ -341,12 +347,13 @@ def acc_cmp_dump(name, **kwargs): def write_to_disk(): global api_list if api_list: - with open(pkl_name, 'a') as f: + with FileOpen(pkl_name, 'a') as f: try: f.write('\n'.join(json.dumps(item) for item in api_list)) f.write('\n') except: raise Exception("write to disk failed") + change_mode(pkl_name, FileCheckConst.DATA_FILE_AUTHORITY) api_list = [] def get_pkl_file_path(): diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py 
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index 0f65156e03f0ca17e1b6fca35061db5133a5dcd7..87904bdb8cd5bdd2241aab6a0f7cdbcba5972f80 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -9,6 +9,7 @@ from ..dump import dump from ..common.utils import print_error_log, CompareException, DumpException, Const, get_time, print_info_log, \ check_mode_valid, get_api_name_from_matcher, check_switch_valid, check_dump_mode_valid, check_summary_only_valid, generate_compare_script, \ check_is_npu, check_file_valid, make_dump_path_if_not_exists +from ..common.file_check_util import FileChecker, FileCheckConst, check_path_length, check_path_pattern_vaild from ..common.version import __version__ @@ -42,8 +43,13 @@ class DumpUtil(object): DumpUtil.dump_init_enable = True @staticmethod - def set_dump_config(dump_config): - DumpUtil.dump_config = dump_config + def set_acl_config(acl_config): + if not acl_config: + raise ValueError("acl_config must be configured when mode is 'acl'") + acl_config_checker = FileChecker(acl_config, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.JSON_SUFFIX) + acl_config = acl_config_checker.common_check() + DumpUtil.dump_config = acl_config @staticmethod def set_dump_switch(switch, mode=None, scope=None, api_list=None, filter_switch=None, dump_mode=None, summary_only=False): @@ -138,6 +144,8 @@ def set_dump_path(fpath=None, dump_tag='ptdbg_dump'): raise CompareException(CompareException.INVALID_PATH_ERROR) real_path = os.path.realpath(fpath) make_dump_path_if_not_exists(real_path) + fpath_checker = FileChecker(real_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + fpath_checker.common_check() DumpUtil.set_dump_path(real_path) DumpUtil.dump_dir_tag = dump_tag @@ -169,7 +177,7 @@ def create_dirs_if_not_exist(rank, dump_file): rank_dir = os.path.join(dump_path, f"rank{rank}") dump_file = 
os.path.join(rank_dir, file_name) if not os.path.isdir(rank_dir): - Path(rank_dir).mkdir(mode=0o750, exist_ok=True) + Path(rank_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) return dump_file @@ -258,6 +266,8 @@ def make_dump_dirs(): dump_file_name, dump_file_name_body = "dump.pkl", "dump" dump_root_dir = load_env_dump_path(DumpUtil.dump_path) tag_dir = os.path.join(dump_root_dir, DumpUtil.dump_dir_tag + f'_v{__version__}') + check_path_length(tag_dir) + check_path_pattern_vaild(tag_dir) Path(tag_dir).mkdir(mode=0o750, parents=True, exist_ok=True) DumpUtil.dump_dir = tag_dir dump_file_path = os.path.join(tag_dir, dump_file_name) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py index 8a827d94d32f70906a134c3d23dd82d48bfca80a..8f67a41a40779a20116a73fbc597d631f13e6b12 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py @@ -112,7 +112,7 @@ def register_hook(model, hook, **kwargs): dump_mode, dump_config_file = init_dump_config(kwargs) if dump_mode == 'acl': DumpUtil.dump_switch_mode = dump_mode - DumpUtil.dump_config = dump_config_file + DumpUtil.set_acl_config(dump_config_file) register_hook_core(hook) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_aten.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_aten.py index 3e8e33f225f2c76c30c61a97f580cb5e7f1ee84d..3b746e9c5f3bb5fd726ad3c41ae49a675b33751a 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_aten.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_aten.py @@ -22,11 +22,12 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard +from 
..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapAtenOps = yaml.safe_load(f).get('aten') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py index c96c0efe317b2340e5b6650d31d20cae669bb6e7..7aa21770cefe569f040e60cbad25604af826b627 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py @@ -22,11 +22,12 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapDistributedOps = yaml.safe_load(f).get('distributed') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py index c6e119a8bf319e12d13bb0e1f6e33ef47e1bf713..1ce938129628aefebfa7a991ce166c47159cda0d 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py @@ -22,6 +22,8 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard, print_info_log +from ..common.file_check_util import FileOpen + def remove_dropout(): if torch.__version__ > "1.8": @@ -62,7 +64,7 @@ def remove_dropout(): cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, 
"support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapFunctionalOps = yaml.safe_load(f).get('functional') for f in dir(torch.nn.functional): diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py index 4e127c87b7b1ddb170b3d89e4c0843521f93c68b..a507805f0b5fb2b5cf05579b0beaf86cf202d76c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py @@ -22,10 +22,11 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard, torch_without_guard_version +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapNpuOps = yaml.safe_load(f).get('torch_npu') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py index f87b400dc9f77c30101cd6286e3fb89cb1282cf9..c5e3321bb8303fb4d92ff4c14b9a0dcb5df640b9 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py @@ -22,10 +22,11 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard, parameter_adapter +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapTensorOps = yaml.safe_load(f).get('tensor') diff --git 
a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py index e69a89d9efdeece2e05485752bb9b01b4504aa0f..903ff92c583db3169cac2b852c5bafebb521fefa 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py @@ -22,10 +22,11 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapTorchOps = yaml.safe_load(f).get('torch') @@ -52,7 +53,7 @@ class TorchOPTemplate(HOOKModule): if item in self.op_name_: return True return False - + def einsum_adapt(self, *args): if len(args) < 2: raise ValueError('einsum(): must specify the equation string and at least one operand, ' @@ -69,7 +70,7 @@ class TorchOPTemplate(HOOKModule): return chr(ord('a') + n - 26) raise ValueError('einsum(): subscript in subscript list is not within the valid range [0, 52]') equation = ','.join(''.join(parse_subscript(s) for s in l) for l in args[1::2]) - + if len(args) % 2 == 1: equation += '->' + ''.join(parse_subscript(s) for s in args[-1]) operands = args[:-1:2] diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py index b01f28fea765e947c64f160ce5f1a1e1d69fb9c5..1d055b090ab8e78cc0ddf93bed0820ea5767a6f4 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py @@ -22,10 +22,11 @@ import yaml from .hook_module import HOOKModule from ..common.utils 
import torch_device_guard +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapVfOps = yaml.safe_load(f).get('_VF') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py index 6b3c98c33e8d8b607ff441ea0086dfab202ea15e..328b25b88851a4548f1f9455923ad5eed4dd4a00 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py @@ -19,6 +19,7 @@ from ..common.version import __version__ from .dump_compare import dispatch_workflow, dispatch_multiprocess, error_call, TimeStatistics, \ DispatchRunParam, save_csv from .utils import get_callstack, data_to_cpu, logger_debug, logger_error, logger_warn, logger_logo, get_sys_info +from ..common.file_check_util import FileOpen class PtdbgDispatch(TorchDispatchMode): @@ -52,7 +53,7 @@ class PtdbgDispatch(TorchDispatchMode): dir_name = f'ptdbg_v{__version__}_{tag}_rank{self.device_id}_{time_now}' self.root_path = os.path.join(os.path.realpath(dump_path), dir_name) self.root_cpu_path = os.path.join(self.root_path, f'cpu') - self.root_npu_path = os.path.join(self.root_path, f'npu') + self.root_npu_path = os.path.join(self.root_path, f'npu') file_name = add_time_as_suffix(f'compare_result_rank{self.device_id}') self.csv_path = os.path.join(self.root_path, file_name) Path(self.root_cpu_path).mkdir(mode=0o750, parents=True, exist_ok=True) @@ -60,7 +61,7 @@ class PtdbgDispatch(TorchDispatchMode): self.aten_ops_blacklist = [] yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "unsupport_torch_ops.yaml") - with open(yaml_path, 'r') as f: + with FileOpen(yaml_path, 'r') as f: 
self.aten_ops_blacklist = yaml.safe_load(f).get('aten') self.process_num = process_num @@ -82,10 +83,10 @@ class PtdbgDispatch(TorchDispatchMode): for aten_api in api_list: if aten_api in aten_api_list: dump_api_list.append(aten_api) - else: + else: logger_warn(f'{aten_api} is not aten api will not dump, please refer to torch.ops.aten') return dump_api_list - + def get_dump_flag(self, aten_api): dump_flag = False auto_dump_flag = False @@ -107,7 +108,7 @@ class PtdbgDispatch(TorchDispatchMode): run_param.func_namespace = "aten" return True return False - + def __exit__(self, exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) @@ -131,12 +132,12 @@ class PtdbgDispatch(TorchDispatchMode): if len(json_line_data) == 0: break msg = json.loads(json_line_data) - self.all_summery[msg[0]] = msg[1] + self.all_summery[msg[0]] = msg[1] fp_handle.close() if self.debug_flag: input_num = 0 - output_num = 0 + output_num = 0 total_num = 0 for list_data in self.all_summery: @@ -149,10 +150,10 @@ class PtdbgDispatch(TorchDispatchMode): total_num = total_num + 1 logger_debug(f'Dispatch exit: Device[{self.device_id}], Pid[{os.getpid()} Input[{input_num}] ' f'Output[{output_num}] Total[{total_num}] API_Total[{self.api_index}]]') - + save_csv(self.all_summery, self.call_stack_list, self.csv_path) - def __torch_dispatch__(self, func, types, args=(), kwargs=None): + def __torch_dispatch__(self, func, types, args=(), kwargs=None): if not is_npu: logger_error("Please confirm you run environment installed torch_npu!") return func(*args, **kwargs) @@ -179,7 +180,7 @@ class PtdbgDispatch(TorchDispatchMode): run_param.aten_api = aten_api run_param.aten_api_overload_name = aten_api_overload_name run_param.single_api_index = self.single_api_index_dict[aten_api] - run_param.api_index = self.api_index + run_param.api_index = self.api_index if self.debug_flag: logger_debug(f'Dispatch Info: Rank[{self.device_id}], Pid[{os.getpid()}], Func[{func.__name__}], ' @@ -192,7 +193,7 @@ 
class PtdbgDispatch(TorchDispatchMode): data_to_cpu(kwargs, 0, cpu_kwargs) cpu_args = cpu_args[0] cpu_kwargs = cpu_kwargs[0] - + with TimeStatistics("NPU RUN", run_param): npu_out = func(*args, **kwargs) npu_out_cpu = [] @@ -204,7 +205,7 @@ class PtdbgDispatch(TorchDispatchMode): run_param.process_flag = False dispatch_workflow(run_param, cpu_args, cpu_kwargs, self.all_summery, func, npu_out_cpu, self.lock) else: - self.lock.acquire() + self.lock.acquire() self.all_summery.append([]) self.lock.release() run_param.process_flag = True diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py index 0dc7555f5a741578306436a07e80a809b10f67d8..a4740fef8843e22d647888dd86bef3b6629256b1 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py @@ -9,6 +9,7 @@ from ..common.utils import Const, CompareConst, add_time_as_suffix from ..compare.acc_compare import cosine_similarity, get_max_abs_err, get_max_relative_err, check_accuracy from .utils import np_save_data, logger_debug, logger_error, logger_user, COLOR_RED, COLOR_GREEN, COLOR_RESET, \ CSV_COLUMN_NAME +from ..common.file_check_util import FileOpen, change_mode, FileCheckConst class DispatchRunParam: @@ -19,7 +20,7 @@ class DispatchRunParam: self.root_npu_path = root_npu_path self.root_cpu_path = root_cpu_path self.process_num = process_num - self.process_flag = None + self.process_flag = None self.func_name = None self.func_namespace = None self.aten_api = None @@ -66,11 +67,11 @@ def get_compare_result(npu_data, cpu_data): # Do not check dtype, there maybe type cast if npu_npy.size == 0 or cpu_npy.size == 0: return "unsupported", 0, 0, "This is empty data, can not compare." 
- + if npu_npy.shape != cpu_npy.shape: return CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, \ "Shape of NPU and bench Tensor do not match. Skipped." - + npu_npy = npu_npy.reshape(-1).astype(float) cpu_npy = cpu_npy.reshape(-1).astype(float) err_msg = "" @@ -78,7 +79,7 @@ def get_compare_result(npu_data, cpu_data): max_relative_err, message = get_max_relative_err(npu_npy, cpu_npy) if npu_npy.shape == 0: return "unsupported", max_abs_err, max_relative_err, "This is type of scalar data, can not compare." - + cos_sim, message = cosine_similarity(npu_npy, cpu_npy) err_msg += message @@ -149,7 +150,7 @@ def save_summery(run_param, npu_data, cpu_data, prefix, summery_list, compute_fl data_dict[CompareConst.ACCURACY] = CompareConst.ACCURACY_CHECK_YES summery_list.append(data_dict) - + if data_dict[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_NO: logger_user(f'rank{run_param.device_id} {prefix} index={run_param.single_api_index}, ' f'overload={run_param.aten_api_overload_name}, shape={data_dict[CompareConst.NPU_SHAPE]} ' @@ -191,7 +192,7 @@ def compare_data(run_param, npu_data, cpu_data, prefix, summery_list, compute_fl return True if type(npu_data) != type(cpu_data): logger_warn(f'{prefix} can not compare npu type={str(type(npu_data))} cpu type={str(type(cpu_data))}') - + return True return save_summery(run_param, npu_data, cpu_data, prefix, summery_list, compute_flag) return True @@ -200,7 +201,7 @@ def compare_data(run_param, npu_data, cpu_data, prefix, summery_list, compute_fl def save_temp_summery(api_index, single_api_summery, path, lock): summery_path = os.path.join(path, f'summery.json') lock.acquire() - with open(summery_path, "a") as f: + with FileOpen(summery_path, "a") as f: json.dump([api_index, single_api_summery], f) f.write('\n') lock.release() @@ -275,8 +276,8 @@ def save_csv(all_summery, call_stack_list, csv_path): for index, list_data in enumerate(all_summery): for data in list_data: csv_row_data = 
{CompareConst.NPU_NAME: data[CompareConst.NPU_NAME], - CompareConst.BENCH_NAME: data[CompareConst.BENCH_NAME], - CompareConst.NPU_DTYPE: data[CompareConst.NPU_DTYPE], + CompareConst.BENCH_NAME: data[CompareConst.BENCH_NAME], + CompareConst.NPU_DTYPE: data[CompareConst.NPU_DTYPE], CompareConst.BENCH_DTYPE: data[CompareConst.BENCH_DTYPE], CompareConst.NPU_SHAPE: data[CompareConst.NPU_SHAPE], CompareConst.BENCH_SHAPE: data[CompareConst.BENCH_SHAPE], @@ -294,5 +295,6 @@ def save_csv(all_summery, call_stack_list, csv_path): CompareConst.ERROR_MESSAGE: data[CompareConst.ERROR_MESSAGE]} row_df = pd.DataFrame.from_dict(csv_row_data, orient='index').T df = pd.concat([df, row_df]) - + df.to_csv(csv_path, index=False) + change_mode(csv_path, FileCheckConst.DATA_FILE_AUTHORITY) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py index c8a2c5d26ba431d7b96714b57e394f6357b301bc..6a46a592fa0b773ee3e6bfda32a24a3e81e25b4f 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py @@ -8,6 +8,7 @@ import threading import numpy as np from ..common.utils import print_error_log +from ..common.file_check_util import FileOpen special_torch_object = ["memory_format"] @@ -205,10 +206,10 @@ def write_api_info_json(api_info): def write_json(file_path, data, indent=None): if not os.path.exists(file_path): - with open(file_path, 'w') as f: + with FileOpen(file_path, 'w') as f: f.write("{\n}") lock.acquire() - with open(file_path, 'a+') as f: + with FileOpen(file_path, 'a+') as f: fcntl.flock(f, fcntl.LOCK_EX) try: f.seek(0, os.SEEK_END) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py index 
71980f3ace07b28ed5152ad56159cd3c5ca7b7ba..f9f31b74c0e058906056c18a8dab3709b8a68a7f 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py @@ -5,7 +5,7 @@ import os import shutil import unittest from ptdbg_ascend.advisor.advisor import Advisor -from ptdbg_ascend.common.utils import CompareException +from ptdbg_ascend.common.file_check_util import FileCheckException class TestAdvisor(unittest.TestCase): @@ -18,11 +18,11 @@ class TestAdvisor(unittest.TestCase): def test_analysis_when_csv_path_is_not_exist(self): advisor = Advisor("resources/compare/test.pkl", self.output_path) - self.assertRaises(CompareException, advisor.analysis) + self.assertRaises(FileCheckException, advisor.analysis) def test_analysis_when_csv_path_is_invalid(self): advisor = Advisor("resources/compare/npu_test_1.pkl", self.output_path) - self.assertRaises(CompareException, advisor.analysis) + self.assertRaises(FileCheckException, advisor.analysis) def test_analysis_when_csv_is_valid(self): advisor = Advisor("resources/compare/compare_result_20230703104808.csv", self.output_path) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py index ae1f3321189bd14898ca10760dfa3670d36b2a36..6adbab471fb6939c1a50b0b53a32fe9ce71a226d 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py @@ -3,6 +3,7 @@ import pytest import ptdbg_ascend.common.utils as utils from ptdbg_ascend.common.utils import CompareException +from ptdbg_ascend.common.file_check_util import FileCheckException class TestUtilsMethods(unittest.TestCase): @@ -19,15 +20,15 @@ class TestUtilsMethods(unittest.TestCase): def test_check_file_or_directory_path_1(self): file = "list" - with pytest.raises(CompareException) as error: + with pytest.raises(FileCheckException) as error: utils.check_file_or_directory_path(file) - 
self.assertEqual(error.value.code, CompareException.INVALID_PATH_ERROR) + self.assertEqual(error.value.code, FileCheckException.INVALID_PATH_ERROR) def test_check_file_or_directory_path_2(self): file = "/list/dir" - with pytest.raises(CompareException) as error: + with pytest.raises(FileCheckException) as error: utils.check_file_or_directory_path(file) - self.assertEqual(error.value.code, CompareException.INVALID_PATH_ERROR) + self.assertEqual(error.value.code, FileCheckException.INVALID_PATH_ERROR) def test_check_file_size_1(self): file = "/list/dir"