diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py index 03e4e545aafb020091a68dd80ed36e3f8ede4004..15453f6c18ee2b337c358eb36851e8f8fedbce23 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py @@ -122,7 +122,7 @@ def _run_ut_parser(parser): help=" The api param tool backward result file: generate from api param tool, " "a json file.", required=False) - parser.add_argument("-c", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", + parser.add_argument("-j", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", default=False, required=False) parser.add_argument("-d", "--device", dest="device_id", type=int, help=" set NPU device id to run ut", default=0, required=False) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py index 920deafde9bdaa71b744aa58aee07a8f4ceb1a46..4c3228c4782dcd2fed2ea89cf4567a68678a2b92 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py @@ -30,6 +30,7 @@ from .common.utils import seed_all, torch_without_guard_version, print_info_log from .debugger.precision_debugger import PrecisionDebugger seed_all() + def jit_script(obj, optimize=None, _frames_up=0, _rcb=None, example_input=None): print_info_log("The torch_npu earlier than 2.1 does not support torch.jit.script. " "Therefore, to ensure that the dump data of the GPU and NPU is consistent, " diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py index 754152169e6465ff3b80a445fe7a53ec13cb421a..bd91f33a3ed70ee35cc54fc618d714fc562fd9dd 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py @@ -20,9 +20,9 @@ import pandas as pd from .advisor_result import AdvisorResult from .advisor_const import AdvisorConst -from ..common import utils -from ..common.utils import CompareException, CompareConst, Const +from ..common.utils import CompareException, CompareConst from ..common.utils import print_info_log, print_warn_log, print_error_log +from ..common.file_check_util import FileChecker, FileCheckConst class Advisor: @@ -35,9 +35,6 @@ class Advisor: self.out_path = os.path.realpath(out_path) def _parse_input_file(self): - if not self.input_file.endswith(".csv"): - print_error_log("Advisor only support csv file from ptdbg_ascend result.") - raise CompareException(CompareException.INVALID_FILE_ERROR) try: df = pd.read_csv(self.input_file, on_bad_lines='skip') except OSError as os_err: @@ -54,9 +51,12 @@ class Advisor: df.iloc[:, 0] += 2 return df - def _check_result_file(self): - utils.check_file_or_directory_path(self.input_file) - utils.check_file_size(self.input_file, Const.ONE_GB) + def _check_path_vaild(self): + input_file_checker = FileChecker(self.input_file, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.CSV_SUFFIX) + input_file_checker.common_check() + out_path_checker = FileChecker(self.out_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + out_path_checker.common_check() def gen_advisor_message(self, 
node_name): if AdvisorConst.FORWARD in node_name: @@ -107,7 +107,7 @@ class Advisor: return result def analysis(self): - self._check_result_file() + self._check_path_vaild() analyze_data = self._parse_input_file() print_info_log("Start analyzing the comparison result: %s" % self.input_file) self.analyze_unmatched(analyze_data) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py index 95a6774763c5d70dccd0dc67131c329dd1d15480..a098e0b82573167b86e48bf7f841669402563814 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py @@ -20,6 +20,7 @@ import time from .advisor_const import AdvisorConst from ..common.utils import Const from ..common.utils import print_info_log, print_error_log +from ..common.file_check_util import change_mode, FileCheckConst class AdvisorResult: @@ -41,6 +42,7 @@ class AdvisorResult: output_file.truncate(0) message_list = [message + AdvisorConst.NEW_LINE for message in message_list] output_file.writelines(message_list) + change_mode(result_file, FileCheckConst.DATA_FILE_AUTHORITY) except IOError as io_error: print_error_log("Failed to save %s, the reason is %s." % (result_file, io_error)) else: diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py index b8f7f4ed2dfaabf5c20ddce46ee606f96bfaa6f1..2ecdc79bf3e8d225b36954438985f6dce42ac146 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py @@ -17,7 +17,7 @@ import os import re -from .utils import print_warn_log, print_error_log +from .log import print_warn_log, print_error_log class FileCheckConst: @@ -34,14 +34,26 @@ class FileCheckConst: NUMPY_SUFFIX = ".npy" JSON_SUFFIX = ".json" PT_SUFFIX = ".pt" + CSV_SUFFIX = ".csv" + YAML_SUFFIX = ".yaml" MAX_PKL_SIZE = 1 * 1024 * 1024 * 1024 MAX_NUMPY_SIZE = 10 * 1024 * 1024 * 1024 MAX_JSON_SIZE = 1 * 1024 * 1024 * 1024 MAX_PT_SIZE = 10 * 1024 * 1024 * 1024 + MAX_CSV_SIZE = 1 * 1024 * 1024 * 1024 + MAX_YAML_SIZE = 10 * 1024 * 1024 DIR = "dir" FILE = "file" DATA_DIR_AUTHORITY = 0o750 DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + PKL_SUFFIX: MAX_PKL_SIZE, + NUMPY_SUFFIX: MAX_NUMPY_SIZE, + JSON_SUFFIX: MAX_JSON_SIZE, + PT_SUFFIX: MAX_PT_SIZE, + CSV_SUFFIX: MAX_CSV_SIZE, + YAML_SUFFIX: MAX_YAML_SIZE + } class FileCheckException(Exception): @@ -94,6 +106,7 @@ class FileChecker: check_link(self.file_path) check_path_length(self.file_path) check_path_exists(self.file_path) + check_path_type(self.file_path, self.path_type) self.check_path_ability() check_path_owner_consistent(self.file_path) check_path_pattern_vaild(self.file_path) @@ -234,14 +247,10 @@ def check_file_size(file_path, max_size): def check_common_file_size(file_path): if os.path.isfile(file_path): - if file_path.endswith(FileCheckConst.PKL_SUFFIX): - check_file_size(file_path, FileCheckConst.MAX_PKL_SIZE) - if file_path.endswith(FileCheckConst.NUMPY_SUFFIX): - check_file_size(file_path, FileCheckConst.MAX_NUMPY_SIZE) - if file_path.endswith(FileCheckConst.JSON_SUFFIX): - check_file_size(file_path, FileCheckConst.MAX_JSON_SIZE) - if file_path.endswith(FileCheckConst.PT_SUFFIX): - check_file_size(file_path, 
FileCheckConst.MAX_PT_SIZE)
+        for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items():
+            if file_path.endswith(suffix):
+                check_file_size(file_path, max_size)
+                break
 
 
 def check_file_suffix(file_path, file_suffix):
@@ -252,7 +261,7 @@
             raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR)
 
 
-def check_file_type(file_path, file_type):
+def check_path_type(file_path, file_type):
     real_path = os.path.realpath(file_path)
     if file_type == FileCheckConst.FILE:
         if not os.path.isfile(real_path):
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py
new file mode 100644
index 0000000000000000000000000000000000000000..32c3423551febda3358fc51c0aacdc6164b71d2e
--- /dev/null
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/log.py
@@ -0,0 +1,40 @@
+import os
+import time
+import sys
+
+
+def _print_log(level, msg, end='\n'):
+    current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
+    pid = os.getpid()
+    print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end)
+    sys.stdout.flush()
+
+
+def print_info_log(info_msg, end='\n'):
+    """
+    Function Description:
+        print info log.
+    Parameter:
+        info_msg: the info message.
+    """
+    _print_log("INFO", info_msg, end=end)
+
+
+def print_error_log(error_msg):
+    """
+    Function Description:
+        print error log.
+    Parameter:
+        error_msg: the error message.
+    """
+    _print_log("ERROR", error_msg)
+
+
+def print_warn_log(warn_msg):
+    """
+    Function Description:
+        print warn log.
+    Parameter:
+        warn_msg: the warning message.
+    """
+    _print_log("WARNING", warn_msg)
\ No newline at end of file
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
index ecfc420fdf580e393c31c7d477a05bbde3c61f95..dfdaa0fc003435aadebadbfdf4d0ac94ec59bffe 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
@@ -29,6 +29,8 @@ from pathlib import Path
 import numpy as np
 import torch
 
+from .file_check_util import FileOpen, FileChecker, FileCheckConst
+
 try:
     import torch_npu
 except ImportError:
@@ -279,6 +281,7 @@ def check_switch_valid(switch):
         print_error_log("Please set switch with 'ON' or 'OFF'.")
         raise CompareException(CompareException.INVALID_PARAM_ERROR)
 
+
 def check_dump_mode_valid(dump_mode):
     if not isinstance(dump_mode, list):
         print_warn_log("Please set dump_mode as a list.")
@@ -293,12 +296,14 @@ def check_dump_mode_valid(dump_mode):
         return ["forward", "backward", "input", "output"]
     return dump_mode
 
+
 def check_summary_only_valid(summary_only):
     if not isinstance(summary_only, bool):
         print_error_log("Params summary_only only support True or False.")
         raise CompareException(CompareException.INVALID_PARAM_ERROR)
     return summary_only
 
+
 def check_compare_param(input_parma, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False):  # default values are provided so the parameter check passes when arguments are omitted
     if not (isinstance(input_parma, dict) and isinstance(output_path, str)
@@ -332,29 +337,10 @@ def check_file_or_directory_path(path, isdir=False):
         when invalid data throw exception
     """
     if isdir:
-        if not os.path.exists(path):
-            print_error_log('The path {} is not exist.'.format(path))
-            raise CompareException(CompareException.INVALID_PATH_ERROR)
-
-        if not os.path.isdir(path):
-
print_error_log('The path {} is not a directory.'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - - if not os.access(path, os.W_OK): - print_error_log( - 'The path {} does not have permission to write. Please check the path permission'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) + path_checker = FileChecker(path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) else: - if not os.path.isfile(path): - print_error_log('{} is an invalid file or non-exist.'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - - check_file_valid(path) - - if not os.access(path, os.R_OK): - print_error_log( - 'The path {} does not have permission to read. Please check the path permission'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) + path_checker = FileChecker(path, FileCheckConst.FILE, FileCheckConst.READ_ABLE) + path_checker.common_check() def _check_pkl(pkl_file_handle, file_name): @@ -632,7 +618,7 @@ def generate_compare_script(dump_path, pkl_file_path, dump_switch_mode): is_api_stack = "True" if dump_switch_mode == Const.API_STACK else "False" try: - with open(template_path, 'r') as ftemp, \ + with FileOpen(template_path, 'r') as ftemp, \ os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: code_temp = ftemp.read() fout.write(code_temp % (pkl_file_path, dump_path, is_api_stack)) @@ -668,3 +654,14 @@ def check_file_valid(file_path): if file_path.endswith(Const.NUMPY_SUFFIX) and file_size > Const.TEN_GB: print_error_log('The file {} size is greater than 10GB.'.format(file_path)) raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def check_path_before_create(path): + if len(os.path.realpath(path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(path)) > \ + Const.FILE_NAME_LENGTH: + print_error_log('The file path length exceeds limit.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not re.match(Const.FILE_PATTERN, os.path.realpath(path)): + print_error_log('The file path {} contains special characters.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/version.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/version.py index 8aa4e6974e74ce2b6037a2e81569a71561791c2b..f7d2e869417b85069ceed6e72b4a0c28f153ce64 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/version.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/version.py @@ -1 +1 @@ -__version__ = '4.0' \ No newline at end of file +__version__ = '3.0' diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py index ea581a95aed1fb58311cac0a3df04c242e6cd5c2..1fac930d60ea17a38d55e57c2da800c95324c321 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py @@ -27,7 +27,8 @@ import pandas as pd from ..advisor.advisor import Advisor from ..common.utils import check_compare_param, add_time_as_suffix, \ print_warn_log, print_error_log, CompareException, Const,\ - CompareConst, format_value, check_file_not_exists, check_file_valid + CompareConst, format_value, check_file_not_exists +from ..common.file_check_util import FileChecker, FileCheckConst, change_mode def 
correct_data(result): @@ -88,13 +89,13 @@ def get_max_relative_err(n_value, b_value): np.seterr(divide='ignore', invalid='ignore') if b_value.dtype in CompareConst.FLOAT_TYPE: zero_mask = (b_value == 0) - b_value[zero_mask] += np.finfo(b_value.dtype).eps - n_value[zero_mask] += np.finfo(b_value.dtype).eps + b_value[zero_mask] += np.finfo(b_value.dtype).eps + n_value[zero_mask] += np.finfo(b_value.dtype).eps else: n_value, b_value = n_value.astype(float), b_value.astype(float) zero_mask = (b_value == 0) - b_value[zero_mask] += np.finfo(float).eps - n_value[zero_mask] += np.finfo(float).eps + b_value[zero_mask] += np.finfo(float).eps + n_value[zero_mask] += np.finfo(float).eps relative_err = np.divide((n_value - b_value), b_value) max_relative_err = np.max(np.abs(relative_err)) if np.isnan(max_relative_err): @@ -144,7 +145,7 @@ def check_type_shape_match(npu_struct, bench_struct): shape_match = npu_shape == bench_shape type_match = npu_type == bench_type if not type_match: - if [npu_type, bench_type] in [["torch.float16", "torch.float32"], ["torch.float32", "torch.float16"], + if [npu_type, bench_type] in [["torch.float16", "torch.float32"], ["torch.float32", "torch.float16"], ["torch.float16", "torch.bfloat16"], ["torch.bfloat16", "torch.float16"]]: type_match = True else: @@ -412,8 +413,12 @@ def compare_by_op(op_name, op_name_mapping_dict, input_parma): try: n_path = os.path.join(input_parma.get("npu_dump_data_dir"), npu_bench_name_list[0] + ".npy") b_path = os.path.join(input_parma.get("bench_dump_data_dir"), npu_bench_name_list[1] + ".npy") - check_file_valid(n_path) - check_file_valid(b_path) + n_path_checker = FileChecker(n_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX) + b_path_checker = FileChecker(b_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX) + n_path = n_path_checker.common_check() + b_path = b_path_checker.common_check() n_value = np.load(n_path) b_value = np.load(b_path) except IOError as error: @@ -434,11 +439,10 @@ def compare_by_op(op_name, op_name_mapping_dict, input_parma): err_msg = " Dtype of NPU and bench Tensor do not match." else: err_msg = "" - + n_value, b_value = handle_inf_nan(n_value, b_value) if n_value is CompareConst.NAN or b_value is CompareConst.NAN: return "N/A", "N/A", "N/A", "The position of inf or nan in NPU and bench Tensor do not match." - n_value = n_value.reshape(-1).astype(float) b_value = b_value.reshape(-1).astype(float) @@ -509,6 +513,7 @@ def compare_core(input_parma, output_path, npu_pkl, bench_pkl, stack_mode=False, result_df.to_csv(fout, index=False) _do_multi_process(input_parma, file_path) + change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) if auto_analyze: advisor = Advisor(file_path, output_path) advisor.analysis() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py index d92b0a145b7bd0251ad2b6ed43351ce3c64ae7e8..c58db903dc9d15c3b552067900ec1ce97c8d5f3c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" -import os, sys +import os +import sys import re from ..common.utils import print_error_log, CompareException, check_compare_param from .acc_compare import compare_core diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py index 599735823e6191e37f72952b4f23341929054098..d885515892cc946750c4d65c5089ac24def325fd 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py @@ -45,9 +45,7 @@ class PrecisionDebugger: set_dump_switch_config(mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, dump_mode=input_output_mode, summary_only=summary_only) if mode == 'acl': - if not acl_config: - raise ValueError("acl_config must be configured when mode is 'acl'") - DumpUtil.dump_config = acl_config + DumpUtil.set_acl_config(acl_config) if not scope or not isinstance(scope, list) or len(scope) != 1: raise ValueError("scope must be congfigured as a list with one api name") if isinstance(scope[0], str) and 'backward' in scope[0] and not backward_input: @@ -58,9 +56,7 @@ class PrecisionDebugger: def configure_overflow_dump(self, mode="api", acl_config=None, overflow_nums=1, filter_switch = Const.OFF): if mode == "acl": DumpUtil.dump_switch_mode = mode - DumpUtil.dump_config = acl_config - if acl_config is None: - raise ValueError("acl_config must be configured when mode is 'acl'") + DumpUtil.set_acl_config(acl_config) init_overflow_nums(overflow_nums) check_switch_valid(filter_switch) OverFlowUtil.overflow_filter_switch = filter_switch @@ -103,6 +99,7 @@ class PrecisionDebugger: PrecisionDebugger.step() PrecisionDebugger.start() + def iter_tracer(func): def func_wrapper(*args, **kwargs): PrecisionDebugger.stop() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py index 41e63cb0cdaeef8efe70a483e2df1fd098989cf9..5238754cce48d36e940c600d2d14d402a1cbbe43 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py @@ -18,12 +18,12 @@ import inspect import json import os -import numpy as np -import torch import threading - from pathlib import Path +import numpy as np +import torch + try: import torch_npu except ImportError: @@ -34,6 +34,7 @@ else: from .utils import DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path from ..dump.utils import check_writable +from ..common.file_check_util import FileOpen, change_mode, FileCheckConst, check_path_pattern_vaild, check_path_length forward_init_status = False backward_init_status = False @@ -46,6 +47,7 @@ pkl_name = "" rank = os.getpid() multi_output_apis = ["_sort_", "npu_flash_attention"] + class DataInfo(object): def __init__(self, data, save_data, summary_data, dtype, shape): self.data = data @@ -134,8 +136,11 @@ def dump_data(dump_file_name, dump_step, prefix, data_info): try: if json_dump_condition(prefix): output_path = os.path.join(DumpUtil.dump_data_dir, f'{prefix}.npy') + check_path_length(output_path) + check_path_pattern_vaild(output_path) if not DumpUtil.summary_only: np.save(output_path, data_info.save_data) + change_mode(output_path, 
FileCheckConst.DATA_FILE_AUTHORITY) api_list.append([prefix, dump_step, [], data_info.dtype, data_info.shape, data_info.summary_data]) print_info_log(f"ptdbg is dumping rank{rank} api: {prefix}" + " " * 10, end='\r') except Exception as e: @@ -179,6 +184,7 @@ def dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file): if 'output' in DumpUtil.dump_mode: dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file) + def rename_(): global rank global pkl_name @@ -188,12 +194,13 @@ def rename_(): new_name = os.path.join(DumpUtil.dump_root, "step{}".format(DumpUtil.iter_num), "rank{}".format(rank)) else: dir_name = os.path.join(DumpUtil.dump_root, "rank{}".format(os.getpid())) - new_name = os.path.join(DumpUtil.dump_root, "rank{}".format(rank)) + new_name = os.path.join(DumpUtil.dump_root, "rank{}".format(rank)) if not os.path.exists(new_name) and os.path.exists(dir_name): _, file_name = os.path.split(pkl_name) os.rename(dir_name, new_name) pkl_name = os.path.join(new_name, file_name) + def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): dump_file = DumpUtil.get_dump_path() dump_file = modify_dump_path(dump_file, DumpUtil.dump_switch_mode) @@ -203,14 +210,14 @@ def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): global rank dump_dir, dump_filename = os.path.split(dump_file) if DumpUtil.target_iter: - dump_dir = os.path.join(dump_dir, "step{}".format(DumpUtil.iter_num)) + dump_dir = os.path.join(dump_dir, "step{}".format(DumpUtil.iter_num)) if not os.path.exists(dump_dir): - Path(dump_dir).mkdir(mode=0o750, exist_ok=True) + Path(dump_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) dump_file = os.path.join(dump_dir, dump_filename) rank_this = get_tensor_rank(in_feat, out_feat) DumpUtil.dump_root = os.path.dirname(DumpUtil.dump_path) if rank_this is not None and rank != rank_this: - rank = rank_this + rank = rank_this rename_() if not DumpUtil.dump_init_enable: if '.pkl' in dump_filename: @@ -225,6 +232,8 @@ def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): if rank != DumpUtil.target_rank: return dump_file = create_dirs_if_not_exist(rank, dump_file) + check_path_pattern_vaild(dump_file) + check_path_length(dump_file) global pkl_name pkl_name = dump_file if DumpUtil.dump_init_enable: @@ -341,13 +350,15 @@ def acc_cmp_dump(name, **kwargs): def write_to_disk(): global api_list if api_list: - with open(pkl_name, 'a') as f: + with FileOpen(pkl_name, 'a') as f: try: f.write('\n'.join(json.dumps(item) for item in api_list)) f.write('\n') except: raise Exception("write to disk failed") + change_mode(pkl_name, FileCheckConst.DATA_FILE_AUTHORITY) api_list = [] + def get_pkl_file_path(): return pkl_name diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index 0f65156e03f0ca17e1b6fca35061db5133a5dcd7..af00025ccae0add5618808475831715e2695a893 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -8,7 +8,8 @@ import torch from ..dump import dump from ..common.utils import print_error_log, CompareException, DumpException, Const, get_time, print_info_log, \ check_mode_valid, get_api_name_from_matcher, check_switch_valid, check_dump_mode_valid, check_summary_only_valid, generate_compare_script, \ - check_is_npu, check_file_valid, make_dump_path_if_not_exists + check_is_npu, check_file_valid, 
make_dump_path_if_not_exists, check_path_before_create +from ..common.file_check_util import FileChecker, FileCheckConst, check_path_length, check_path_pattern_vaild from ..common.version import __version__ @@ -42,8 +43,13 @@ class DumpUtil(object): DumpUtil.dump_init_enable = True @staticmethod - def set_dump_config(dump_config): - DumpUtil.dump_config = dump_config + def set_acl_config(acl_config): + if not acl_config: + raise ValueError("acl_config must be configured when mode is 'acl'") + acl_config_checker = FileChecker(acl_config, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.JSON_SUFFIX) + acl_config = acl_config_checker.common_check() + DumpUtil.dump_config = acl_config @staticmethod def set_dump_switch(switch, mode=None, scope=None, api_list=None, filter_switch=None, dump_mode=None, summary_only=False): @@ -66,10 +72,12 @@ class DumpUtil(object): def check_list_or_acl_mode(name_prefix): global dump_count + result = False for item in DumpUtil.dump_switch_scope: if name_prefix.startswith(item): dump_count = dump_count + 1 - return True + result = True + return result def check_range_mode(name_prefix): global range_begin_flag @@ -138,6 +146,8 @@ def set_dump_path(fpath=None, dump_tag='ptdbg_dump'): raise CompareException(CompareException.INVALID_PATH_ERROR) real_path = os.path.realpath(fpath) make_dump_path_if_not_exists(real_path) + fpath_checker = FileChecker(real_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + fpath_checker.common_check() DumpUtil.set_dump_path(real_path) DumpUtil.dump_dir_tag = dump_tag @@ -169,7 +179,7 @@ def create_dirs_if_not_exist(rank, dump_file): rank_dir = os.path.join(dump_path, f"rank{rank}") dump_file = os.path.join(rank_dir, file_name) if not os.path.isdir(rank_dir): - Path(rank_dir).mkdir(mode=0o750, exist_ok=True) + Path(rank_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) return dump_file @@ -246,6 +256,7 @@ def make_dump_data_dir(dump_file_name): dump_path, file_name = os.path.split(os.path.realpath(dump_file_name)) name_body, name_extension = os.path.splitext(file_name) output_dir = os.path.join(dump_path, f"{name_body}") + check_path_before_create(output_dir) if not os.path.exists(output_dir): Path(output_dir).mkdir(mode=0o750, exist_ok=True) else: @@ -258,6 +269,8 @@ def make_dump_dirs(): dump_file_name, dump_file_name_body = "dump.pkl", "dump" dump_root_dir = load_env_dump_path(DumpUtil.dump_path) tag_dir = os.path.join(dump_root_dir, DumpUtil.dump_dir_tag + f'_v{__version__}') + check_path_length(tag_dir) + check_path_pattern_vaild(tag_dir) Path(tag_dir).mkdir(mode=0o750, parents=True, exist_ok=True) DumpUtil.dump_dir = tag_dir dump_file_path = os.path.join(tag_dir, dump_file_name) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py index a3cb10bf4f7dfff8a9dbf8021de82391ed3ddcea..83f7dcacef0cf06f107763528df84ff291e35ccd 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py @@ -24,8 +24,10 @@ import torch.utils.hooks as full_hooks g_stop_hook = False + class HOOKModule(nn.Module): module_count = {} + def __init__(self, hook) -> None: super(HOOKModule, self).__init__() self.has_overflow = False diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py 
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py index 8a827d94d32f70906a134c3d23dd82d48bfca80a..8f67a41a40779a20116a73fbc597d631f13e6b12 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py @@ -112,7 +112,7 @@ def register_hook(model, hook, **kwargs): dump_mode, dump_config_file = init_dump_config(kwargs) if dump_mode == 'acl': DumpUtil.dump_switch_mode = dump_mode - DumpUtil.dump_config = dump_config_file + DumpUtil.set_acl_config(dump_config_file) register_hook_core(hook) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_aten.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_aten.py index 3e8e33f225f2c76c30c61a97f580cb5e7f1ee84d..3b746e9c5f3bb5fd726ad3c41ae49a675b33751a 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_aten.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_aten.py @@ -22,11 +22,12 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapAtenOps = yaml.safe_load(f).get('aten') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py index c96c0efe317b2340e5b6650d31d20cae669bb6e7..7aa21770cefe569f040e60cbad25604af826b627 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py @@ -22,11 +22,12 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapDistributedOps = yaml.safe_load(f).get('distributed') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py index c6e119a8bf319e12d13bb0e1f6e33ef47e1bf713..1ce938129628aefebfa7a991ce166c47159cda0d 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py @@ -22,6 +22,8 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard, print_info_log +from ..common.file_check_util import FileOpen + def remove_dropout(): if torch.__version__ > "1.8": @@ -62,7 +64,7 @@ def remove_dropout(): cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapFunctionalOps = yaml.safe_load(f).get('functional') for f in dir(torch.nn.functional): diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py 
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py index 4e127c87b7b1ddb170b3d89e4c0843521f93c68b..f2e1e8f9d3a3cb8132dbfc9324e3647531e4c992 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py @@ -22,10 +22,11 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard, torch_without_guard_version +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapNpuOps = yaml.safe_load(f).get('torch_npu') @@ -47,6 +48,7 @@ class NpuOPTemplate(HOOKModule): else: return getattr(torch_npu._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + def wrap_npu_op(op_name, hook): def npu_op_template(*args, **kwargs): diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py index f87b400dc9f77c30101cd6286e3fb89cb1282cf9..c5e3321bb8303fb4d92ff4c14b9a0dcb5df640b9 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py @@ -22,10 +22,11 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard, parameter_adapter +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapTensorOps = yaml.safe_load(f).get('tensor') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py index e69a89d9efdeece2e05485752bb9b01b4504aa0f..903ff92c583db3169cac2b852c5bafebb521fefa 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py @@ -22,10 +22,11 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapTorchOps = yaml.safe_load(f).get('torch') @@ -52,7 +53,7 @@ class TorchOPTemplate(HOOKModule): if item in self.op_name_: return True return False - + def einsum_adapt(self, *args): if len(args) < 2: raise ValueError('einsum(): must specify the equation string and at least one operand, ' @@ -69,7 +70,7 @@ class TorchOPTemplate(HOOKModule): return chr(ord('a') + n - 26) raise ValueError('einsum(): subscript in subscript list is not within the valid range [0, 52]') equation = ','.join(''.join(parse_subscript(s) for s in l) for l in args[1::2]) - + if len(args) % 2 == 1: equation += '->' + ''.join(parse_subscript(s) for s in args[-1]) operands = args[:-1:2] diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py index 
b01f28fea765e947c64f160ce5f1a1e1d69fb9c5..1d055b090ab8e78cc0ddf93bed0820ea5767a6f4 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py @@ -22,10 +22,11 @@ import yaml from .hook_module import HOOKModule from ..common.utils import torch_device_guard +from ..common.file_check_util import FileOpen cur_path = os.path.dirname(os.path.realpath(__file__)) yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: +with FileOpen(yaml_path, 'r') as f: WrapVfOps = yaml.safe_load(f).get('_VF') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py index 6b3c98c33e8d8b607ff441ea0086dfab202ea15e..1b16502fb77d98ff70e8965c9934afdc8e3d63b5 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py @@ -1,10 +1,12 @@ import os import time -import yaml -import json from pathlib import Path from multiprocessing import Manager, Pool + +import yaml +import json import torch + from torch.utils._python_dispatch import TorchDispatchMode try: @@ -14,11 +16,13 @@ except ImportError: else: is_npu = True -from ..common.utils import Const, CompareConst, add_time_as_suffix, check_file_or_directory_path +from ..common.utils import Const, CompareConst, add_time_as_suffix, check_file_or_directory_path, \ + check_path_before_create from ..common.version import __version__ from .dump_compare import dispatch_workflow, dispatch_multiprocess, error_call, TimeStatistics, \ DispatchRunParam, save_csv from .utils import get_callstack, data_to_cpu, logger_debug, logger_error, logger_warn, logger_logo, get_sys_info +from ..common.file_check_util import FileOpen class PtdbgDispatch(TorchDispatchMode): @@ -52,15 +56,17 @@ class PtdbgDispatch(TorchDispatchMode): dir_name = f'ptdbg_v{__version__}_{tag}_rank{self.device_id}_{time_now}' self.root_path = os.path.join(os.path.realpath(dump_path), dir_name) self.root_cpu_path = os.path.join(self.root_path, f'cpu') - self.root_npu_path = os.path.join(self.root_path, f'npu') + self.root_npu_path = os.path.join(self.root_path, f'npu') file_name = add_time_as_suffix(f'compare_result_rank{self.device_id}') self.csv_path = os.path.join(self.root_path, file_name) + check_path_before_create(self.root_cpu_path) + check_path_before_create(self.root_npu_path) Path(self.root_cpu_path).mkdir(mode=0o750, parents=True, exist_ok=True) Path(self.root_npu_path).mkdir(mode=0o750, parents=True, exist_ok=True) self.aten_ops_blacklist = [] yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "unsupport_torch_ops.yaml") - with open(yaml_path, 'r') as f: + with FileOpen(yaml_path, 'r') as f: self.aten_ops_blacklist = yaml.safe_load(f).get('aten') self.process_num = process_num @@ -82,10 +88,10 @@ class PtdbgDispatch(TorchDispatchMode): for aten_api in api_list: if aten_api in aten_api_list: dump_api_list.append(aten_api) - else: + else: logger_warn(f'{aten_api} is not aten api will not dump, please refer to torch.ops.aten') return dump_api_list - + def get_dump_flag(self, aten_api): dump_flag = False auto_dump_flag = False @@ -107,7 +113,7 @@ class PtdbgDispatch(TorchDispatchMode): run_param.func_namespace = "aten" return True return False - + def __exit__(self, exc_type, 
exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) @@ -131,12 +137,12 @@ class PtdbgDispatch(TorchDispatchMode): if len(json_line_data) == 0: break msg = json.loads(json_line_data) - self.all_summery[msg[0]] = msg[1] + self.all_summery[msg[0]] = msg[1] fp_handle.close() if self.debug_flag: input_num = 0 - output_num = 0 + output_num = 0 total_num = 0 for list_data in self.all_summery: @@ -149,16 +155,21 @@ class PtdbgDispatch(TorchDispatchMode): total_num = total_num + 1 logger_debug(f'Dispatch exit: Device[{self.device_id}], Pid[{os.getpid()} Input[{input_num}] ' f'Output[{output_num}] Total[{total_num}] API_Total[{self.api_index}]]') - + save_csv(self.all_summery, self.call_stack_list, self.csv_path) - def __torch_dispatch__(self, func, types, args=(), kwargs=None): + def __torch_dispatch__(self, func, types, args=(), kwargs=None): if not is_npu: logger_error("Please confirm you run environment installed torch_npu!") return func(*args, **kwargs) - aten_api = func.__name__.split(".")[0] - aten_api_overload_name = func.__name__.split(".")[1] + func_name_split_list = func.__name__.split(".") + aten_api = func_name_split_list[0] + try: + aten_api_overload_name = func_name_split_list[1] + except IndexError: + logger_error(f"Please check the func name {func.__name__}!") + return func(*args, **kwargs) if aten_api in self.aten_ops_blacklist: npu_out = func(*args, **kwargs) @@ -179,7 +190,7 @@ class PtdbgDispatch(TorchDispatchMode): run_param.aten_api = aten_api run_param.aten_api_overload_name = aten_api_overload_name run_param.single_api_index = self.single_api_index_dict[aten_api] - run_param.api_index = self.api_index + run_param.api_index = self.api_index if self.debug_flag: logger_debug(f'Dispatch Info: Rank[{self.device_id}], Pid[{os.getpid()}], Func[{func.__name__}], ' @@ -192,7 +203,7 @@ class PtdbgDispatch(TorchDispatchMode): data_to_cpu(kwargs, 0, cpu_kwargs) cpu_args = cpu_args[0] cpu_kwargs = cpu_kwargs[0] - + with TimeStatistics("NPU RUN", run_param): npu_out = func(*args, **kwargs) npu_out_cpu = [] @@ -204,7 +215,7 @@ class PtdbgDispatch(TorchDispatchMode): run_param.process_flag = False dispatch_workflow(run_param, cpu_args, cpu_kwargs, self.all_summery, func, npu_out_cpu, self.lock) else: - self.lock.acquire() + self.lock.acquire() self.all_summery.append([]) self.lock.release() run_param.process_flag = True diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py index 0dc7555f5a741578306436a07e80a809b10f67d8..2f0818ffe84c9815588233472050da9571cafaa6 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py @@ -9,6 +9,7 @@ from ..common.utils import Const, CompareConst, add_time_as_suffix from ..compare.acc_compare import cosine_similarity, get_max_abs_err, get_max_relative_err, check_accuracy from .utils import np_save_data, logger_debug, logger_error, logger_user, COLOR_RED, COLOR_GREEN, COLOR_RESET, \ CSV_COLUMN_NAME +from ..common.file_check_util import FileOpen, change_mode, FileCheckConst class DispatchRunParam: @@ -19,7 +20,7 @@ class DispatchRunParam: self.root_npu_path = root_npu_path self.root_cpu_path = root_cpu_path self.process_num = process_num - self.process_flag = None + self.process_flag = None self.func_name = None self.func_namespace = None self.aten_api = None @@ -43,11 
+44,11 @@ class TimeStatistics: def __enter__(self): if self.debug: - self.time = datetime.now() + self.time = datetime.now().astimezone() def __exit__(self, exc_type, exc_val, exc_tb): if self.debug: - cost_time = datetime.now() - self.time + cost_time = datetime.now().astimezone() - self.time time_cost = f'Time[{self.tag}]: Dev[{self.device}], Pid[{os.getpid()}], Fun[{self.fun}], ' \ f'Id[{self.index}], time[{cost_time}]' hot_time_cost = "Hotspot " + time_cost @@ -66,11 +67,11 @@ def get_compare_result(npu_data, cpu_data): # Do not check dtype, there maybe type cast if npu_npy.size == 0 or cpu_npy.size == 0: return "unsupported", 0, 0, "This is empty data, can not compare." - + if npu_npy.shape != cpu_npy.shape: return CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, \ "Shape of NPU and bench Tensor do not match. Skipped." - + npu_npy = npu_npy.reshape(-1).astype(float) cpu_npy = cpu_npy.reshape(-1).astype(float) err_msg = "" @@ -78,7 +79,7 @@ def get_compare_result(npu_data, cpu_data): max_relative_err, message = get_max_relative_err(npu_npy, cpu_npy) if npu_npy.shape == 0: return "unsupported", max_abs_err, max_relative_err, "This is type of scalar data, can not compare." - + cos_sim, message = cosine_similarity(npu_npy, cpu_npy) err_msg += message @@ -149,7 +150,7 @@ def save_summery(run_param, npu_data, cpu_data, prefix, summery_list, compute_fl data_dict[CompareConst.ACCURACY] = CompareConst.ACCURACY_CHECK_YES summery_list.append(data_dict) - + if data_dict[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_NO: logger_user(f'rank{run_param.device_id} {prefix} index={run_param.single_api_index}, ' f'overload={run_param.aten_api_overload_name}, shape={data_dict[CompareConst.NPU_SHAPE]} ' @@ -191,7 +192,7 @@ def compare_data(run_param, npu_data, cpu_data, prefix, summery_list, compute_fl return True if type(npu_data) != type(cpu_data): logger_warn(f'{prefix} can not compare npu type={str(type(npu_data))} cpu type={str(type(cpu_data))}') - + return True return save_summery(run_param, npu_data, cpu_data, prefix, summery_list, compute_flag) return True @@ -200,7 +201,7 @@ def compare_data(run_param, npu_data, cpu_data, prefix, summery_list, compute_fl def save_temp_summery(api_index, single_api_summery, path, lock): summery_path = os.path.join(path, f'summery.json') lock.acquire() - with open(summery_path, "a") as f: + with FileOpen(summery_path, "a") as f: json.dump([api_index, single_api_summery], f) f.write('\n') lock.release() @@ -275,8 +276,8 @@ def save_csv(all_summery, call_stack_list, csv_path): for index, list_data in enumerate(all_summery): for data in list_data: csv_row_data = {CompareConst.NPU_NAME: data[CompareConst.NPU_NAME], - CompareConst.BENCH_NAME: data[CompareConst.BENCH_NAME], - CompareConst.NPU_DTYPE: data[CompareConst.NPU_DTYPE], + CompareConst.BENCH_NAME: data[CompareConst.BENCH_NAME], + CompareConst.NPU_DTYPE: data[CompareConst.NPU_DTYPE], CompareConst.BENCH_DTYPE: data[CompareConst.BENCH_DTYPE], CompareConst.NPU_SHAPE: data[CompareConst.NPU_SHAPE], CompareConst.BENCH_SHAPE: data[CompareConst.BENCH_SHAPE], @@ -294,5 +295,6 @@ def save_csv(all_summery, call_stack_list, csv_path): CompareConst.ERROR_MESSAGE: data[CompareConst.ERROR_MESSAGE]} row_df = pd.DataFrame.from_dict(csv_row_data, orient='index').T df = pd.concat([df, row_df]) - + df.to_csv(csv_path, index=False) + change_mode(csv_path, FileCheckConst.DATA_FILE_AUTHORITY) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py 
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py index c8a2c5d26ba431d7b96714b57e394f6357b301bc..3606dca7293782af0bcc2d79414f8c96279d2175 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py @@ -1,13 +1,14 @@ import inspect import fcntl -import json import os -import torch import threading +import json import numpy as np +import torch from ..common.utils import print_error_log +from ..common.file_check_util import FileOpen special_torch_object = ["memory_format"] @@ -169,7 +170,8 @@ class ForwardAPIInfo(APIInfo): def analyze_api_call_stack(self): stack_str = [] for (_, path, line, func, code, _) in inspect.stack()[3:]: - if not code: continue + if not code: + continue stack_line = " ".join([ "File", ", ".join([path, " ".join(["line", str(line)]), " ".join(["in", func]), " ".join(["\n", code[0].strip()])])]) @@ -205,10 +207,10 @@ def write_api_info_json(api_info): def write_json(file_path, data, indent=None): if not os.path.exists(file_path): - with open(file_path, 'w') as f: + with FileOpen(file_path, 'w') as f: f.write("{\n}") lock.acquire() - with open(file_path, 'a+') as f: + with FileOpen(file_path, 'a+') as f: fcntl.flock(f, fcntl.LOCK_EX) try: f.seek(0, os.SEEK_END) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py index 561b118e8541bafd4ffb171eb7329c94401636c6..2f18a0ece3d125f3049481ca18ea015f6c6bd9ea 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py @@ -1,12 +1,7 @@ import os -import torch from pathlib import Path -from ..common.utils import print_warn_log, get_time, print_info_log -from ..dump.dump import forward_init_status, forward_acl_dump -from .utils import OverFlowUtil, dump_overflow -from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist -from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo -from ..dump import dump + +import torch try: import torch_npu @@ -15,6 +10,13 @@ except ImportError: else: is_gpu = False +from ..common.utils import print_warn_log, get_time, print_info_log +from ..dump.dump import forward_init_status, forward_acl_dump +from .utils import OverFlowUtil, dump_overflow +from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist +from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo +from ..dump import dump + backward_init_status = False api_overflow = [] forward_api_info = {} @@ -75,6 +77,7 @@ def check_data_overflow(x): def check_path(apis, path): return any(api in path for api in apis) + def overflow_check(name, **kwargs): overflow_nums = OverFlowUtil.overflow_nums pid = kwargs.get('pid') diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py index a82a5106ffd7ef0fd49639cfccb9f2e677dc8a26..380d84cb2c5305dc7b5add4e271f9c6da20f298a 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py @@ -39,12 +39,14 @@ 
def catch_exception(func): def inner(*args, **kwargs): log = logging.getLogger() line = args[-1] if len(args) == 2 else "" + result = None try: - return func(*args, **kwargs) + result = func(*args, **kwargs) except OSError: log.error("%s: command not found" % line) except ParseException: log.error("Command execution failed") except SystemExit: log.warning("Please enter the correct command") + return result return inner diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/utils.py index 02cf0a215a032e5d1a7a0deddadd0e56ea39a100..c58374d044b085e1b328efb5983553ec7dd07077 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/utils.py @@ -104,6 +104,7 @@ class Util: path = self.path_strip(path) if os.path.exists(path): return + self.check_path_name(path) try: os.makedirs(path, mode=0o750) except OSError as e: @@ -221,3 +222,12 @@ class Util: else: self.log.error("The file path %s is invalid" % path) raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR) + + def check_path_name(self, path): + if len(os.path.realpath(path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(path)) > \ + Const.FILE_NAME_LENGTH: + self.log.error('The file path length exceeds limit.') + raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR) + if not re.match(Const.FILE_PATTERN, os.path.realpath(path)): + self.log.error('The file path {} contains special characters.'.format(path)) + raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py index fecc8684296f8e40321d30e0902a7c1a37bfa6c2..02755e8e1fd1d52c0f952e0bfe7fd6db278ca961 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py @@ -15,36 +15,37 @@ # limitations under the License. 
""" -import setuptools -from pathlib import Path -import stat import os +import stat +from pathlib import Path +import setuptools VERSION = '3.0' + def generate_ptdbg_ascend_version(): - ptdbg_ascend_root = Path(__file__).parent - version_path = ptdbg_ascend_root / "ptdbg_ascend" / "common" / "version.py" - if version_path.exists(): - version_path.unlink() - flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL - modes = stat.S_IWUSR | stat.S_IRUSR - with os.fdopen(os.open(version_path, flags, modes), 'w') as f: - f.write("__version__ = '{version}'\n".format(version = VERSION)) + ptdbg_ascend_root = Path(__file__).parent + version_path = ptdbg_ascend_root / "ptdbg_ascend" / "common" / "version.py" + if version_path.exists(): + version_path.unlink() + flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL + modes = stat.S_IWUSR | stat.S_IRUSR + with os.fdopen(os.open(version_path, flags, modes), 'w') as f: + f.write("__version__ = '{version}'\n".format(version = VERSION)) generate_ptdbg_ascend_version() setuptools.setup(name='ptdbg_ascend', - version=VERSION, - description='This is a pytorch precision comparison tools', - long_description='This is a pytorch precision comparison tools, include overflow detect tool', - packages=setuptools.find_packages(), - install_requires = [ - "wheel", - "numpy", - "pandas >= 1.3.5", - "pyyaml" - ], - include_package_data=True, - ext_modules=[], - zip_safe=False) + version=VERSION, + description='This is a pytorch precision comparison tools', + long_description='This is a pytorch precision comparison tools, include overflow detect tool', + packages=setuptools.find_packages(), + install_requires = [ + "wheel", + "numpy", + "pandas >= 1.3.5", + "pyyaml" + ], + include_package_data=True, + ext_modules=[], + zip_safe=False) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py index 71980f3ace07b28ed5152ad56159cd3c5ca7b7ba..f9f31b74c0e058906056c18a8dab3709b8a68a7f 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py @@ -5,7 +5,7 @@ import os import shutil import unittest from ptdbg_ascend.advisor.advisor import Advisor -from ptdbg_ascend.common.utils import CompareException +from ptdbg_ascend.common.file_check_util import FileCheckException class TestAdvisor(unittest.TestCase): @@ -18,11 +18,11 @@ class TestAdvisor(unittest.TestCase): def test_analysis_when_csv_path_is_not_exist(self): advisor = Advisor("resources/compare/test.pkl", self.output_path) - self.assertRaises(CompareException, advisor.analysis) + self.assertRaises(FileCheckException, advisor.analysis) def test_analysis_when_csv_path_is_invalid(self): advisor = Advisor("resources/compare/npu_test_1.pkl", self.output_path) - self.assertRaises(CompareException, advisor.analysis) + self.assertRaises(FileCheckException, advisor.analysis) def test_analysis_when_csv_is_valid(self): advisor = Advisor("resources/compare/compare_result_20230703104808.csv", self.output_path) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py index ae1f3321189bd14898ca10760dfa3670d36b2a36..6adbab471fb6939c1a50b0b53a32fe9ce71a226d 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py @@ -3,6 +3,7 @@ import pytest import ptdbg_ascend.common.utils as utils from ptdbg_ascend.common.utils import CompareException +from 
ptdbg_ascend.common.file_check_util import FileCheckException
 
 
 class TestUtilsMethods(unittest.TestCase):
@@ -19,15 +20,15 @@ class TestUtilsMethods(unittest.TestCase):
 
     def test_check_file_or_directory_path_1(self):
         file = "list"
-        with pytest.raises(CompareException) as error:
+        with pytest.raises(FileCheckException) as error:
             utils.check_file_or_directory_path(file)
-        self.assertEqual(error.value.code, CompareException.INVALID_PATH_ERROR)
+        self.assertEqual(error.value.code, FileCheckException.INVALID_PATH_ERROR)
 
     def test_check_file_or_directory_path_2(self):
         file = "/list/dir"
-        with pytest.raises(CompareException) as error:
+        with pytest.raises(FileCheckException) as error:
             utils.check_file_or_directory_path(file)
-        self.assertEqual(error.value.code, CompareException.INVALID_PATH_ERROR)
+        self.assertEqual(error.value.code, FileCheckException.INVALID_PATH_ERROR)
 
     def test_check_file_size_1(self):
         file = "/list/dir"
diff --git a/profiler/cluster_analyse/README.md b/profiler/cluster_analyse/README.md
index 6352130c03aa215f13ba6fd972e75c5753613374..8981ac45304149f5b19d47665064952555be28c1 100644
--- a/profiler/cluster_analyse/README.md
+++ b/profiler/cluster_analyse/README.md
@@ -92,4 +92,4 @@ Column K: Communication (Not Overlapped and Exclude Receive) is the communication time not overlapped by computation, with receive operators excluded.
 
 - "LOCAL" is an on-chip copy; it is very fast and does not need to be considered.
 - "HCCS" or "PCIE" is an intra-node, chip-to-chip copy; a rate of about 18GB/s or higher is normal.
-- "RDMA" is an inter-node copy; about 12GB/s or higher is normal on 910A, and about 23-25GB/s is normal on 910B.
+- "RDMA" is an inter-node copy; about 12GB/s or higher is normal on 910A.
diff --git a/profiler/cluster_analyse/analysis/communication_analysis.py b/profiler/cluster_analyse/analysis/communication_analysis.py
index 1d03687298659c62823e143cf65d6455c2ca13ed..a7b27913443f0fdb15d59811bd0aedffbbf01a3a 100644
--- a/profiler/cluster_analyse/analysis/communication_analysis.py
+++ b/profiler/cluster_analyse/analysis/communication_analysis.py
@@ -31,7 +31,7 @@ class BaseCommAnalysis:
 
     @staticmethod
     def compute_ratio(dividend: float, divisor: float):
-        if abs(divisor) < 1e-15:
+        if abs(divisor) < Constant.EPS:
             return 0
         else:
             return round(dividend / divisor, 4)
diff --git a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py
index 5f0497811662184c8bd10ef8b6fb96feae94ed7a..f83ba66edc06590e36fd5f6d8345972d203927d8 100644
--- a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py
+++ b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py
@@ -14,9 +14,9 @@
 # limitations under the License.
import os +from collections import defaultdict from common_func.constant import Constant -from collections import defaultdict from common_func.file_manager import FileManager from prof_bean.step_trace_time_bean import StepTraceTimeBean @@ -71,7 +71,7 @@ class StepTraceTimeAnalysis: return step_group_dict = {} for data_list in self.step_data_list: - stage_group = 'None' + stage_group = tuple() for stage in stage_list: if data_list[2] in stage: stage_group = tuple(stage) diff --git a/profiler/cluster_analyse/cluster_analysis.py b/profiler/cluster_analyse/cluster_analysis.py index 3491808524f7d3a4322176b0e6071c4463c5741a..a27820983c0353c4c9e727540f0fbed933d14c3d 100644 --- a/profiler/cluster_analyse/cluster_analysis.py +++ b/profiler/cluster_analyse/cluster_analysis.py @@ -42,7 +42,6 @@ class Interface: CommunicationGroupGenerator(self.collection_path, data_map).generate() except RuntimeError: print("Can not get communication info from ranks") - finally: communication_group = {} communication_ops = [] collective_group_dict = {} @@ -59,5 +58,5 @@ class Interface: if __name__ == "__main__": parser = argparse.ArgumentParser(description="cluster analysis module") parser.add_argument('-d', '--collection_path', type=str, required=True, help="profiling data path") - args = parser.parse_args() - Interface(args).run() + args_parsed = parser.parse_args() + Interface(args_parsed).run() diff --git a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py index 8df922ea19ba9f185a59544d7000dbd07161a8cb..18fd6a23b74830fc3b2bcc69b6c2ad61413f0d0f 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py @@ -14,8 +14,9 @@ # limitations under the License. 
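The preprocessor change below wraps the rank-directory sort in a try/except: the sort key assumes each ascend_pt folder name carries a collection timestamp as the third-from-last '_'-separated field, so a malformed name raises. A small sketch of that keying, with made-up folder names (an assumption for illustration only):

dirs = ["rank0_20230801120000_ascend_pt", "rank0_20230801110000_ascend_pt"]
try:
    # the third-from-last '_' field is the collection timestamp
    dirs.sort(key=lambda x: x.split('_')[-3])
except IndexError as e:
    raise RuntimeError("Found invalid directory name!") from e
# the earliest collection sorts first: dirs[0] is the 110000 run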
from collections import defaultdict -from common_func.file_manager import FileManager import os +from common_func.file_manager import FileManager +from common_func.path_manager import PathManager class PytorchDataPreprocessor: @@ -23,7 +24,7 @@ class PytorchDataPreprocessor: PROFILER_INFO_EXTENSION = '.json' def __init__(self, path: str): - self.path = os.path.realpath(path) + self.path = PathManager.get_realpath(path) def get_data_map(self) -> dict: ascend_pt_dirs = [] @@ -40,14 +41,22 @@ class PytorchDataPreprocessor: rank_id_map[rank_id].append(dir_name) ret_dict = dict() - for (rank_id, dir_list) in rank_id_map.items(): - dir_list.sort(key=lambda x: x.split('_')[-3]) - ret_dict[rank_id] = os.path.join(self.path, dir_list[0]) + try: + for (rank_id, dir_list) in rank_id_map.items(): + dir_list.sort(key=lambda x: x.split('_')[-3]) + ret_dict[rank_id] = os.path.join(self.path, dir_list[0]) + except Exception as e: + raise RuntimeError("Found invalid directory name!") from e return ret_dict def get_rank_id(self, dir_name: str) -> int: files = os.listdir(dir_name) for file_name in files: if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): - return int(file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)]) + rank_id_str = file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)] + try: + rank_id = int(rank_id_str) + except ValueError: + rank_id = -1 + return rank_id return -1 diff --git a/profiler/cluster_analyse/cluster_kernels_analysis/cluster_prof_Info_analysis.py b/profiler/cluster_analyse/cluster_kernels_analysis/cluster_prof_Info_analysis.py index 3b2b05676d8d0f9f94fcc5f5ddd1fb9061e40f5a..49446da43df74be43700038e3ac8d4f177e5dc2e 100644 --- a/profiler/cluster_analyse/cluster_kernels_analysis/cluster_prof_Info_analysis.py +++ b/profiler/cluster_analyse/cluster_kernels_analysis/cluster_prof_Info_analysis.py @@ -14,16 +14,23 @@ # limitations under the License. 
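The get_rank_id change above tolerates a non-numeric rank suffix instead of crashing on int(). A self-contained sketch of the same parse, assuming file names of the form profiler_info_<rank>.json (the head value is an assumption; only the .json extension is shown in the patch):

def parse_rank_id(file_name: str) -> int:
    head, ext = "profiler_info_", ".json"
    rank_id_str = file_name[len(head): -len(ext)]
    try:
        return int(rank_id_str)
    except ValueError:
        return -1  # sentinel: no valid rank id in this file name

assert parse_rank_id("profiler_info_7.json") == 7
assert parse_rank_id("profiler_info_x.json") == -1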
from pathlib import Path - -import pandas as pd +import sys import argparse import re +import os +import stat +import shutil +import warnings import plotly.graph_objects as go from plotly.subplots import make_subplots from plotly.offline import plot -import os +import pandas as pd -import warnings +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from common_func.path_manager import PathManager + +MAX_READ_FILE_BYTES = 64 * 1024 * 1024 class FormDataProcessor: @@ -33,7 +40,9 @@ def get_files_with_prefix_recursive(self, csv_path, match_str): matched_ir_files = list(Path(csv_path).rglob(match_str)) - assert len(matched_ir_files) > 0, f"Didn't find any file in folder {csv_path} that matches {match_str}" + if not matched_ir_files: + msg = f"Didn't find any file in folder {csv_path} that matches {match_str}" + raise RuntimeError(msg) return [str(item) for item in matched_ir_files] def readSummaryData(self, columns_to_keep): @@ -42,6 +51,8 @@ for f in self.files: if "mindstudio_profiler_output" in f: continue + # check that the csv file is readable before loading it + PathManager.check_path_readable(f) # read the CSV file df = pd.read_csv(f) # keep only the required columns @@ -51,20 +62,26 @@ print(f"File {f} does not have the required columns; please verify the profiling data. The following columns may be absent: {columns_to_keep}\n") continue # extract the device ID from the file path + try: + df['device_id'] = self.getDeviceId(f) + except Exception: + print(f"The path or folder name of file \"{f}\" does not meet the requirements. Make sure a [device_] level folder exists; see the readme for details.\n") + continue # add the new "device_id" column - df['device_id'] = self.getDeviceId(f) - df['node_id'] = self.getNodeId(f) - + try: + df['node_id'] = self.getNodeId(f) + except Exception: + print(f"The path or folder name of file \"{f}\" does not meet the requirements. Make sure a [node*] level folder exists; see the readme for details.\n") + continue # append this file's rows to the final dataframe - - all_data = all_data.append(df, ignore_index=True) + all_data = all_data._append(df, ignore_index=True) return all_data def getChipType(self): file = self.files[0] df = pd.read_csv(file) if 'aiv_time(us)' in df.columns: - return "ASCEND_910B" + return "ASCEND_NEW" return "ASCEND_OTHER" def getDeviceId(self, dir_path): @@ -83,13 +100,13 @@ class ViewInfoManager: def __init__(self, chip_type): self.chip_type = chip_type - self.op_summary_columns_dict = [] + self.op_summary_columns_dict = {} self.setOpSummaryColumnsParams() def setOpSummaryColumnsParams(self): # besides the table columns used for grouping, some data is further classified by extra attributes; those attributes are kept in extend_attr_to_group self.op_summary_columns_dict = { - 'ASCEND_910B': { + 'ASCEND_NEW': { 'TimeToCsvAnalyzer': {'columns_to_group': ["Op Name", "Input Shapes", "Input Data Types", "Output Shapes"], 'extend_attr_to_group': ["device_id", "node_id"], @@ -123,14 +140,12 @@ } def getColumnsInfo(self, analyzer_type): - return self.op_summary_columns_dict[self.chip_type][analyzer_type] + return self.op_summary_columns_dict.get(self.chip_type, {}).get(analyzer_type) class OpSummaryAnalyzerBase: def __init__(self, chip_type, analyzer_type, dir_path): self.chip_type = chip_type - self.result_dir = f"{dir_path}/result" - os.makedirs(self.result_dir, exist_ok=True) # create it if the path does not exist view_info = ViewInfoManager(chip_type).getColumnsInfo(analyzer_type) self.columns_to_view = view_info['columns_to_view'] self.calculate_fun = view_info['calculate_fun'] @@ -139,6 +154,13 @@ if 'extend_attr_to_group' in view_info: extend_attr_to_group = view_info['extend_attr_to_group'] self.attrs_to_group.extend(extend_attr_to_group) + # create the result directory + self.result_dir = os.path.join(dir_path, "result") +
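# validate the new path's length, drop any stale result directory, then recreate it with safe permissions
+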
PathManager.check_path_length(self.result_dir) + if os.path.exists(self.result_dir): + shutil.rmtree(self.result_dir, onerror=self.on_rm_error) + PathManager.check_path_writeable(dir_path) + PathManager.make_dir_safety(self.result_dir) def getColumnsToGroup(self): return self.columns_to_group @@ -152,6 +174,13 @@ view_data = summary_data.groupby(self.attrs_to_group).agg(calculate_dict).reset_index() return view_data + def on_rm_error(self, func, path, exc_info): + # path contains the path of the file that couldn't be removed + # let's just assume that it's read-only and unlink it. + os.chmod(path, stat.S_IWRITE) + os.unlink(path) + + class TimeToCsvAnalyzer(OpSummaryAnalyzerBase): def __init__(self, chip_type, dir_path): super().__init__(chip_type, "TimeToCsvAnalyzer", dir_path) @@ -160,9 +189,16 @@ view_data = self.calculateViewData(summary_data) # normalize the column names view_data.columns = [''.join(col) if col[1] == "" else '_'.join(col) for col in view_data.columns] - for column in self.columns_to_view: - view_data[column + '_range'] = view_data[column + '_max'] - view_data[column + '_min'] - view_data.to_csv(self.result_dir + "/cluster_duration_time_analysis.csv", index=False) + try: + for column in self.columns_to_view: + view_data[column + '_range'] = view_data[column + '_max'] - view_data[column + '_min'] + except Exception as e: + raise RuntimeError("Invalid view data!") from e + save_path = os.path.join(self.result_dir, "cluster_duration_time_analysis.csv") + PathManager.check_path_length(save_path) + view_data.to_csv(save_path, index=False) + # set the file to read-only so it cannot be modified + os.chmod(save_path, stat.S_IROTH) return view_data @@ -210,7 +246,11 @@ class StatisticalInfoToHtmlAnalyzer(OpSummaryAnalyzerBase): height=int(500 * row_num), width=int(rank_num * 100 * col_num), title_text="Op Performance Comparison") - plot(fig, filename=self.result_dir + "/" + column + "_Info.html") + save_plot_path = os.path.join(self.result_dir, column + "_Info.html") + PathManager.check_path_length(save_plot_path) + plot(fig, filename=save_plot_path) + # set the file to read-only so it cannot be modified + os.chmod(save_plot_path, stat.S_IROTH) def getCalNum(self, rank_num): # compute how many subplots to draw per row @@ -219,6 +259,7 @@ else: return 1 + class DeliverableGenerator: def __init__(self, args): self.args = args @@ -230,18 +271,28 @@ def run(self): summary_data = self.formProcess.readSummaryData(self.columns_to_keep) + # if summary_data is empty, every csv read failed + if summary_data.empty: + print("No csv data met the requirements; please check your PROFILING data") + return rank_num = self.formProcess.getRankNum() for analyzer in self.analyzers: analyzer.GenerateDeliverable(summary_data, rank_num) def setAnalyzers(self, args): chip_type = self.formProcess.getChipType() + # reject soft links, then convert the path to an absolute one + if os.path.islink(args.dir): + print(f"The file: \"{args.dir}\" is a link.
Please check the path.") + return + prof_path = os.path.realpath(args.dir) + PathManager.input_path_common_check(prof_path) if args.type == "all": - self.analyzers = [TimeToCsvAnalyzer(chip_type, args.dir), StatisticalInfoToHtmlAnalyzer(chip_type, args.top_n, args.dir)] + self.analyzers = [TimeToCsvAnalyzer(chip_type, prof_path), StatisticalInfoToHtmlAnalyzer(chip_type, args.top_n, prof_path)] elif args.type == "html": - self.analyzers = [StatisticalInfoToHtmlAnalyzer(chip_type, args.top_n, args.dir)] + self.analyzers = [StatisticalInfoToHtmlAnalyzer(chip_type, args.top_n, prof_path)] elif args.type == "csv": - self.analyzers = [TimeToCsvAnalyzer(chip_type, args.dir)] + self.analyzers = [TimeToCsvAnalyzer(chip_type, prof_path)] else: warnings.warn("Invalid argument; please use one of: all, html, csv") # issue a warning @@ -255,15 +306,15 @@ def main(): - # parse command-line arguments - parser = argparse.ArgumentParser() - parser.add_argument("--dir", "-d", default=None, help="root dir of PROF_* data") - parser.add_argument("--top_n", "-n", default=10, help="how many operators to show", type=int) - parser.add_argument("--type", "-t", default='html', help="compare ratio or aicore-time", type=str) - args = parser.parse_args() - - deviverable_gen = DeliverableGenerator(args) - deviverable_gen.run() + # parse command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument("--dir", "-d", default=None, help="root dir of PROF_* data") + parser.add_argument("--top_n", "-n", default=10, help="how many operators to show", type=int) + parser.add_argument("--type", "-t", default='html', help="compare ratio or aicore-time", type=str) + args = parser.parse_args() + + deliverable_gen = DeliverableGenerator(args) + deliverable_gen.run() if __name__ == "__main__": main() diff --git a/profiler/cluster_analyse/common_func/constant.py b/profiler/cluster_analyse/common_func/constant.py index 8910099c7f845219fb9ec12e0dd1e916d007b2a1..091007cf56be2dc0b96f0ef4a620ce604e861600 100644 --- a/profiler/cluster_analyse/common_func/constant.py +++ b/profiler/cluster_analyse/common_func/constant.py @@ -66,3 +66,10 @@ class Constant(object): # step time RANK = 'rank' STAGE = 'stage' + + # epsilon + EPS = 1e-15 + + # file suffix + JSON_SUFFIX = ".json" + CSV_SUFFIX = ".csv" diff --git a/profiler/cluster_analyse/common_func/file_manager.py b/profiler/cluster_analyse/common_func/file_manager.py index 063332acc0942bfe60314cb6e646bb89ff3e00fe..8fa988bd2957940363df4fa85583746071fa8104 100644 --- a/profiler/cluster_analyse/common_func/file_manager.py +++ b/profiler/cluster_analyse/common_func/file_manager.py @@ -22,64 +22,64 @@ from common_func.path_manager import PathManager class FileManager: + DATA_FILE_AUTHORITY = 0o640 + DATA_DIR_AUTHORITY = 0o750 @classmethod def read_csv_file(cls, file_path: str, class_bean: any) -> list: PathManager.check_path_readable(file_path) + base_name = os.path.basename(file_path) file_size = os.path.getsize(file_path) if file_size <= 0: return [] if file_size > Constant.MAX_CSV_SIZE: - check_msg = input( - f"The file({file_path}) size exceeds the preset max value, do you continue reading the file?
[y/n]") - if check_msg.lower() != "y": - print(f"[WARNING] The user choose not to read the file: {file_path}") - return [] + raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") result_data = [] try: with open(file_path, newline="") as csv_file: reader = csv.DictReader(csv_file) for row in reader: result_data.append(class_bean(row)) - except Exception: - raise RuntimeError(f"Failed to read the file: {file_path}") + except Exception as e: + raise RuntimeError(f"Failed to read the file: {base_name}") from e return result_data @classmethod def read_json_file(cls, file_path: str) -> dict: PathManager.check_path_readable(file_path) + base_name = os.path.basename(file_path) file_size = os.path.getsize(file_path) if file_size <= 0: return {} if file_size > Constant.MAX_JSON_SIZE: - check_msg = input( - f"The file({file_path}) size exceeds the preset max value, do you continue reading the file? [y/n]") - if check_msg.lower() != "y": - print(f"[WARNING] The user choose not to read the file: {file_path}") - return [] + raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") try: with open(file_path, "r") as json_file: result_data = json.load(json_file) - except Exception: - raise RuntimeError(f"Failed to read the file: {file_path}") + except Exception as e: + raise RuntimeError(f"Failed to read the file: {base_name}") from e return result_data @classmethod def create_csv_file(cls, profiler_path: str, data: list, file_name: str, headers: list = None) -> None: if not data: return - output_path = os.path.join(profiler_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + output_path = os.path.join( + profiler_path, Constant.CLUSTER_ANALYSIS_OUTPUT) output_file = os.path.join(output_path, file_name) - PathManager.create_file_safety(output_file) - PathManager.check_path_writeable(output_file) + base_name = os.path.basename(output_file) + PathManager.check_path_writeable(output_path) try: - with open(output_file, "w", newline="") as file: + with os.fdopen( + os.open(output_file, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY), + 'w', newline="" + ) as file: writer = csv.writer(file) if headers: writer.writerow(headers) writer.writerows(data) - except Exception: - raise RuntimeError(f"Can't create file: {output_file}") + except Exception as e: + raise RuntimeError(f"Can't create file: {base_name}") from e @classmethod def create_json_file(cls, profiler_path: str, data: dict, file_name: str) -> None: @@ -87,16 +87,31 @@ class FileManager: return output_path = os.path.join(profiler_path, Constant.CLUSTER_ANALYSIS_OUTPUT) output_file = os.path.join(output_path, file_name) - PathManager.create_file_safety(output_file) - PathManager.check_path_writeable(output_file) + base_name = os.path.basename(output_file) + PathManager.check_path_writeable(output_path) try: - with open(output_file, "w") as file: + with os.fdopen( + os.open(output_file, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY), 'w' + ) as file: json.dump(data, file) - except Exception: - raise RuntimeError(f"Can't create the file: {output_file}") + except Exception as e: + raise RuntimeError(f"Can't create the file: {base_name}") from e @classmethod def create_output_dir(cls, collection_path: str) -> None: - output_path = os.path.join(collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + output_path = os.path.join( + collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) PathManager.remove_path_safety(output_path) PathManager.make_dir_safety(output_path) + + @classmethod + def check_file_size(cls, file_path): + 
suffix = os.path.splitext(file_path)[1] + base_name = os.path.basename(file_path) + if suffix == Constant.CSV_SUFFIX: + limit_size = Constant.MAX_CSV_SIZE + else: + limit_size = Constant.MAX_JSON_SIZE + file_size = os.path.getsize(file_path) + if file_size > limit_size: + raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") diff --git a/profiler/cluster_analyse/common_func/path_manager.py b/profiler/cluster_analyse/common_func/path_manager.py index 3a332d43a0fe16f0d9ef76cffa010cf95d316b7d..7ef7b4c345c024a0980c6ce2d91839b64c351740 100644 --- a/profiler/cluster_analyse/common_func/path_manager.py +++ b/profiler/cluster_analyse/common_func/path_manager.py @@ -38,9 +38,9 @@ class PathManager: when invalid data throw exception """ cls.input_path_common_check(path) - + base_name = os.path.basename(path) if os.path.isfile(path): - msg = "Invalid input path which is a file path: {path}" + msg = f"Invalid input path which is a file path: {base_name}" raise RuntimeError(msg) @classmethod @@ -55,18 +55,29 @@ class PathManager: when invalid data throw exception """ cls.input_path_common_check(path) - + base_name = os.path.basename(path) if os.path.isdir(path): - msg = "Invalid input path which is a directory path: {path}" + msg = f"Invalid input path which is a directory path: {base_name}" raise RuntimeError(msg) + @classmethod + def check_path_length(cls, path: str): + if len(path) > cls.MAX_PATH_LENGTH: + raise RuntimeError("Length of input path exceeds the limit.") + path_split_list = path.split("/") + for sub_path in path_split_list: + path_list = sub_path.split("\\") + for name in path_list: + if len(name) > cls.MAX_FILE_NAME_LENGTH: + raise RuntimeError("Length of input path exceeds the limit.") + @classmethod def input_path_common_check(cls, path: str): if len(path) > cls.MAX_PATH_LENGTH: raise RuntimeError("Length of input path exceeds the limit.") if os.path.islink(path): - msg = f"Invalid input path which is a soft link: {path}" + msg = "Invalid input path which is a soft link." raise RuntimeError(msg) if platform.system().lower() == cls.WINDOWS: @@ -74,7 +85,7 @@ else: pattern = r'(\.|/|_|-|\s|[~0-9a-zA-Z])+' if not re.fullmatch(pattern, path): - msg = f"Invalid input path: {path}" + msg = "Invalid input path." raise RuntimeError(msg) path_split_list = path.split("/") @@ -94,8 +105,9 @@ Exception Description: when invalid path, prompt the user """ + base_name = os.path.basename(path) if not os.path.exists(path): - msg = f"The path does not exist: {path}" + msg = f"Invalid path: {base_name}" raise RuntimeError(msg) if platform.system().lower() == cls.WINDOWS: return @@ -116,10 +128,11 @@ """ cls.check_path_owner_consistent(path) if os.path.islink(path): - msg = f"Invalid path which is a soft link: {path}" + msg = "Invalid path which is a soft link." raise RuntimeError(msg) + base_name = os.path.basename(path) if not os.access(path, os.W_OK): - msg = f"The path permission check failed: {path}" + msg = f"The path permission check failed: {base_name}" raise RuntimeError(msg) @classmethod @@ -134,15 +147,17 @@ """ cls.check_path_owner_consistent(path) if os.path.islink(path): - msg = f"Invalid path which is a soft link: {path}" + msg = "Invalid path which is a soft link."
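+ # soft links are rejected because the link target can change between validation and use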
raise RuntimeError(msg) + base_name = os.path.basename(path) if not os.access(path, os.R_OK): - msg = f"The path permission check failed: {path}" + msg = f"The path permission check failed: {base_name}" raise RuntimeError(msg) @classmethod def remove_path_safety(cls, path: str): - msg = f"Failed to remove path: {path}" + base_name = os.path.basename(path) + msg = f"Failed to remove path: {base_name}" if os.path.islink(path): raise RuntimeError(msg) if os.path.exists(path): @@ -153,7 +168,8 @@ @classmethod def make_dir_safety(cls, path: str): - msg = f"Failed to make directory: {path}" + base_name = os.path.basename(path) + msg = f"Failed to make directory: {base_name}" if os.path.islink(path): raise RuntimeError(msg) if os.path.exists(path): @@ -165,7 +181,8 @@ @classmethod def create_file_safety(cls, path: str): - msg = f"Failed to create file: {path}" + base_name = os.path.basename(path) + msg = f"Failed to create file: {base_name}" if os.path.islink(path): raise RuntimeError(msg) if os.path.exists(path): @@ -178,6 +195,6 @@ @classmethod def get_realpath(cls, path: str) -> str: if os.path.islink(path): - msg = f"Invalid input path which is a soft link: {path}" + msg = "Invalid input path which is a soft link." raise RuntimeError(msg) return os.path.realpath(path) diff --git a/profiler/cluster_analyse/communication_group/communication_group_generator.py b/profiler/cluster_analyse/communication_group/communication_group_generator.py index a367e624fb9575bc4f29d732a59b33a57ab678bb..6611b6406c7511ca4ebc38f1c3bc31f9750f3e74 100644 --- a/profiler/cluster_analyse/communication_group/communication_group_generator.py +++ b/profiler/cluster_analyse/communication_group/communication_group_generator.py @@ -15,9 +15,9 @@ import os from copy import deepcopy +from collections import defaultdict from common_func.constant import Constant from common_func.file_manager import FileManager -from collections import defaultdict class CommunicationGroupGenerator: @@ -119,14 +119,20 @@ return p2p_ops = ops.get(Constant.P2P, {}) for op_name, link_dict in p2p_ops.items(): - for link in link_dict: - src_rank = int(link.split('-')[0]) - dst_rank = int(link.split('-')[1]) - if src_rank != dst_rank: - rank_set = set([src_rank, dst_rank]) - if rank_set in self.p2p_link: - continue - self.p2p_link.append(rank_set) + self.append_p2p_link(op_name, link_dict) + + def append_p2p_link(self, op_name, link_dict): + for link in link_dict: + if '-' not in link: + print(f"{op_name} has an invalid link key {link}!") + break + src_rank = int(link.split('-')[0]) + dst_rank = int(link.split('-')[1]) + if src_rank != dst_rank: + rank_set = set([src_rank, dst_rank]) + if rank_set in self.p2p_link: + continue + self.p2p_link.append(rank_set) def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict): for comm_op in comm_op_dict: diff --git a/profiler/compare_tools/comparator/op_comparator.py b/profiler/compare_tools/comparator/op_comparator.py index cb4b5bfa899ead63b995bacec5a85fa7d52575a3..8ccd428ef82250266722bed0dfd59a97c6dc39c0 100644 --- a/profiler/compare_tools/comparator/op_comparator.py +++ b/profiler/compare_tools/comparator/op_comparator.py @@ -34,8 +34,8 @@ def _matching_op(cls, base_ops: list, comparison_ops: list, name_func: any) -> list: if not comparison_ops: result_data = [None] * len(base_ops) - for index in range(len(base_ops)): - result_data[index] = [base_ops[index], None] + for index, value in
enumerate(base_ops): + result_data[index] = [value, None] return result_data result_data = [] diff --git a/profiler/compare_tools/generation/communication_comparison_generator.py b/profiler/compare_tools/generation/communication_comparison_generator.py index 7838c45aec59e6b794ec26d12f65336a22ee681c..8e0f260892f7c9493659e50e5a2badfc314c2d4c 100644 --- a/profiler/compare_tools/generation/communication_comparison_generator.py +++ b/profiler/compare_tools/generation/communication_comparison_generator.py @@ -14,26 +14,29 @@ class CommunicationComparisonGenerator: self._args = args self._args_manager = ArgsManager() self._compare_result_data = compare_result_data + self._row_index = 3 def create_sheet(self, workbook: Workbook): ws = workbook.create_sheet("CommunicationCompare", 0) ws.sheet_properties.tabColor = Constant.YELLOW_COLOR - # write headers base_headers = Constant.CMP_COMMUNICATION_HEADER comparison_headers = Constant.CMP_COMMUNICATION_HEADER headers = base_headers + comparison_headers + [Constant.DIFF] + self.set_header(ws, base_headers, comparison_headers, headers) + self.write_lines(ws, base_headers, comparison_headers, headers) + + def set_header(self, ws, base_headers, comparison_headers, headers): + # write headers base_trace_start_column = 0 comparison_trace_start_column = len(base_headers) diff_start_column = len(base_headers) + len(comparison_headers) - - for col_index in range(len(headers)): + for col_index, header_name in enumerate(headers): ws.cell(row=1, column=col_index + 1).border = Constant.BORDER ws.cell(row=1, column=col_index + 1).font = Font(name='Arial') ws.cell(row=1, column=col_index + 1).fill = Constant.HEADERS_FILL ws.cell(row=2, column=col_index + 1).border = Constant.BORDER ws.cell(row=2, column=col_index + 1).font = Font(name='Arial', bold=True) ws.cell(row=2, column=col_index + 1).fill = Constant.HEADERS_FILL - header_name = headers[col_index] if col_index < comparison_trace_start_column: ws.cell(row=1, column=col_index + 1).value = Constant.BASE_PROFILING elif col_index < diff_start_column: @@ -50,92 +53,98 @@ class CommunicationComparisonGenerator: ws.merge_cells(start_row=1, start_column=headers.index(Constant.DIFF) + 1, end_row=2, end_column=headers.index(Constant.DIFF) + 1) + def write_lines(self, ws, base_headers, comparison_headers, headers): # write lines - row_index = 3 + self._row_index = 3 for _, row in self._compare_result_data.iterrows(): - # write summary lines - base_name = Constant.NA if math.isnan(row[Constant.BASE_CALLS]) else row[Constant.OP_KEY] - comparison_name = Constant.NA if math.isnan(row[Constant.COMPARISON_CALLS]) else row[Constant.OP_KEY] - if math.isnan(row[Constant.BASE_SUM]) or math.isnan(row[Constant.COMPARISON_SUM]) or row[ - Constant.BASE_SUM] == 0: - diff = Constant.NA + self.write_summary_lines(ws, row, headers) + self._row_index += 1 + self.write_detail_lines(ws, row, base_headers, comparison_headers, headers) + + def write_summary_lines(self, ws, row, headers): + # write summary lines + base_name = Constant.NA if math.isnan(row[Constant.BASE_CALLS]) else row[Constant.OP_KEY] + comparison_name = Constant.NA if math.isnan(row[Constant.COMPARISON_CALLS]) else row[Constant.OP_KEY] + if math.isnan(row[Constant.BASE_SUM]) or math.isnan(row[Constant.COMPARISON_SUM]) or row[ + Constant.BASE_SUM] == 0: + diff = Constant.NA + else: + diff = (row[Constant.COMPARISON_SUM] - row[Constant.BASE_SUM]) / row[Constant.BASE_SUM] + row_data = [base_name, Constant.NA, row[Constant.BASE_CALLS], row[Constant.BASE_SUM], + 
row[Constant.BASE_AVG], row[Constant.BASE_MAX], row[Constant.BASE_MIN], comparison_name, + Constant.NA, row[Constant.COMPARISON_CALLS], row[Constant.COMPARISON_SUM], + row[Constant.COMPARISON_AVG], row[Constant.COMPARISON_MAX], row[Constant.COMPARISON_MIN], diff] + for index, header_name in enumerate(headers): + if header_name in ( + Constant.CALLS, Constant.TOTAL_DURATION, Constant.AVG_DURATION, Constant.MAX_DURATION, + Constant.MIN_DURATION): + ws.cell(row=self._row_index, column=index + 1).number_format = '0.00' + if header_name == Constant.DIFF: + ws.cell(row=self._row_index, column=index + 1).number_format = '0.00%' + if diff != Constant.NA and diff < 0: + ws.cell(row=self._row_index, column=index + 1).font = Font(name='Arial', + color=Constant.GREEN_COLOR) + elif diff != Constant.NA and diff >= 0: + ws.cell(row=self._row_index, column=index + 1).font = Font(name='Arial', color=Constant.RED_COLOR) else: - diff = (row[Constant.COMPARISON_SUM] - row[Constant.BASE_SUM]) / row[Constant.BASE_SUM] - row_data = [base_name, Constant.NA, row[Constant.BASE_CALLS], row[Constant.BASE_SUM], - row[Constant.BASE_AVG], row[Constant.BASE_MAX], row[Constant.BASE_MIN], comparison_name, - Constant.NA, row[Constant.COMPARISON_CALLS], row[Constant.COMPARISON_SUM], - row[Constant.COMPARISON_AVG], row[Constant.COMPARISON_MAX], row[Constant.COMPARISON_MIN], diff] - for index in range(len(headers)): - if headers[index] in ( + bold = header_name == Constant.COMMUNICAT_OP + ws.cell(row=self._row_index, column=index + 1).font = Font(name='Arial', bold=bold) + value = row_data[index] + if value != Constant.NA: + ws.cell(row=self._row_index, column=index + 1).value = value + ws.cell(row=self._row_index, column=index + 1).border = Constant.BORDER + ws.cell(row=self._row_index, column=index + 1).fill = PatternFill("solid", + fgColor=Constant.SUMMARY_LINE_COLOR) + + def write_detail_lines(self, ws, row, base_headers, comparison_headers, headers): + # write detail lines + base_name = Constant.NA if math.isnan(row[Constant.BASE_CALLS]) else row[Constant.OP_KEY] + comparison_name = Constant.NA if math.isnan(row[Constant.COMPARISON_CALLS]) else row[Constant.OP_KEY] + base_task_list = self._args_manager.base_profiling.communication_task_data.get(base_name, []) + comparison_task_list = self._args_manager.comparison_profiling.communication_task_data.get(comparison_name, []) + if base_task_list: + base_data = [[data.get("name", ""), float(data.get("dur", 0))] for data in base_task_list] + base_df = pd.DataFrame(base_data, columns=[Constant.OP_KEY, Constant.DEVICE_DUR]) + base_data = base_df.groupby(Constant.OP_KEY).agg( + ["count", "sum", "mean", "max", "min"]).reset_index().values.tolist() + else: + base_data = [] + if comparison_task_list: + comparison_data = [[data.get("name", ""), float(data.get("dur", 0))] for data in comparison_task_list] + comparison_df = pd.DataFrame(comparison_data, columns=[Constant.OP_KEY, Constant.DEVICE_DUR]) + comparison_data = comparison_df.groupby(Constant.OP_KEY).agg( + ["count", "sum", "mean", "max", "min"]).reset_index().values.tolist() + else: + comparison_data = [] + for index in range(max(len(base_data), len(comparison_data))): + base_detail_data, comparison_detail_data = [Constant.NA] * len(base_headers), [Constant.NA] * len(comparison_headers) + base_detail_data[0] = "|" + comparison_detail_data[0] = "|" + if index < len(base_data): + total_dur = sum([data[2] for data in base_data]) + percent = 0.0 if abs(total_dur) < Constant.EPS else base_data[index][2] / total_dur + dur_percent = 
"%.2f%%" % (percent * 100) + base_data[index][0] = f"{base_data[index][0]} ({dur_percent})" + base_detail_data[1:] = base_data[index] + if index < len(comparison_data): + total_dur = sum([data[2] for data in comparison_data]) + percent = 0.0 if abs(total_dur) < Constant.EPS else comparison_data[index][2] / total_dur + dur_percent = "%.2f%%" % (percent * 100) + comparison_data[index][0] = f"{comparison_data[index][0]} ({dur_percent})" + comparison_detail_data[1:] = comparison_data[index] + + detail_data = base_detail_data + comparison_detail_data + [Constant.NA] + for colum_index in range(len(headers)): + if headers[colum_index] in ( Constant.CALLS, Constant.TOTAL_DURATION, Constant.AVG_DURATION, Constant.MAX_DURATION, Constant.MIN_DURATION): - ws.cell(row=row_index, column=index + 1).number_format = '0.00' - if headers[index] == Constant.DIFF: - ws.cell(row=row_index, column=index + 1).number_format = '0.00%' - if diff != Constant.NA and diff < 0: - ws.cell(row=row_index, column=index + 1).font = Font(name='Arial', - color=Constant.GREEN_COLOR) - elif diff != Constant.NA and diff >= 0: - ws.cell(row=row_index, column=index + 1).font = Font(name='Arial', color=Constant.RED_COLOR) - else: - bold = headers[index] == Constant.COMMUNICAT_OP - ws.cell(row=row_index, column=index + 1).font = Font(name='Arial', bold=bold) - value = row_data[index] + ws.cell(row=self._row_index, column=colum_index + 1) .number_format = '0.00' + value = detail_data[colum_index] if value != Constant.NA: - ws.cell(row=row_index, column=index + 1).value = value - ws.cell(row=row_index, column=index + 1).border = Constant.BORDER - ws.cell(row=row_index, column=index + 1).fill = PatternFill("solid", - fgColor=Constant.SUMMARY_LINE_COLOR) - row_index += 1 - - # write detail lines - base_task_list = self._args_manager.base_profiling.communication_task_data.get(base_name, []) - comparison_task_list = self._args_manager.comparison_profiling.communication_task_data.get(comparison_name, - []) - if base_task_list: - base_data = [[data.get("name", ""), float(data.get("dur", 0))] for data in base_task_list] - base_df = pd.DataFrame(base_data, columns=[Constant.OP_KEY, Constant.DEVICE_DUR]) - base_data = base_df.groupby(Constant.OP_KEY).agg( - ["count", "sum", "mean", "max", "min"]).reset_index().values.tolist() - else: - base_data = [] - if comparison_task_list: - comparison_data = [[data.get("name", ""), float(data.get("dur", 0))] for data in comparison_task_list] - comparison_df = pd.DataFrame(comparison_data, columns=[Constant.OP_KEY, Constant.DEVICE_DUR]) - comparison_data = comparison_df.groupby(Constant.OP_KEY).agg( - ["count", "sum", "mean", "max", "min"]).reset_index().values.tolist() - else: - comparison_data = [] - - for index in range(max(len(base_data), len(comparison_data))): - base_detail_data, comparison_detail_data = [Constant.NA] * len(base_headers), \ - [Constant.NA] * len(comparison_headers) - base_detail_data[0] = "|" - comparison_detail_data[0] = "|" - if index < len(base_data): - total_dur = sum([data[2] for data in base_data]) - dur_percent = "%.2f%%" % (base_data[index][2] / total_dur * 100) - base_data[index][0] = f"{base_data[index][0]} ({dur_percent})" - base_detail_data[1:] = base_data[index] - if index < len(comparison_data): - total_dur = sum([data[2] for data in comparison_data]) - dur_percent = "%.2f%%" % (comparison_data[index][2] / total_dur * 100) - comparison_data[index][0] = f"{comparison_data[index][0]} ({dur_percent})" - comparison_detail_data[1:] = comparison_data[index] - - detail_data = 
base_detail_data + comparison_detail_data + [Constant.NA] - for colum_index in range(len(headers)): - if headers[colum_index] in ( - Constant.CALLS, Constant.TOTAL_DURATION, Constant.AVG_DURATION, Constant.MAX_DURATION, - Constant.MIN_DURATION): - ws.cell(row=row_index, column=colum_index + 1) .number_format = '0.00' - value = detail_data[colum_index] - if value != Constant.NA: - ws.cell(row=row_index, column=colum_index + 1).value = value - bold = headers[colum_index] == Constant.OP_NAME - ws.cell(row=row_index, column=colum_index + 1).font = Font(name='Arial', bold=bold) - ws.cell(row=row_index, column=colum_index + 1).border = Constant.BORDER - if headers[colum_index] == Constant.COMMUNICAT_OP: - ws.cell(row=row_index, column=colum_index + 1).alignment = Alignment(horizontal="center", - vertical="center") - row_index += 1 + ws.cell(row=self._row_index, column=colum_index + 1).value = value + bold = headers[colum_index] == Constant.OP_NAME + ws.cell(row=self._row_index, column=colum_index + 1).font = Font(name='Arial', bold=bold) + ws.cell(row=self._row_index, column=colum_index + 1).border = Constant.BORDER + if headers[colum_index] == Constant.COMMUNICAT_OP: + ws.cell(row=self._row_index, column=colum_index + 1).alignment = Alignment(horizontal="center", vertical="center") + self._row_index += 1 diff --git a/profiler/compare_tools/generation/op_comparison_generator.py b/profiler/compare_tools/generation/op_comparison_generator.py index 1e48d7d0b4443bb64ff2706e3bb14526f9a875a0..5d1d658f724f0ef6a05eaccab514f826befe88b9 100644 --- a/profiler/compare_tools/generation/op_comparison_generator.py +++ b/profiler/compare_tools/generation/op_comparison_generator.py @@ -1,4 +1,5 @@ import copy +from collections import namedtuple from openpyxl.styles import Font, PatternFill, Alignment from openpyxl.workbook import Workbook @@ -15,6 +16,7 @@ class OpComparisonGenerator: self._compare_type = compare_type self._base_headers = [] self._comparison_headers = [] + self._row_index = 3 self.update_headers() def update_headers(self): @@ -32,10 +34,23 @@ class OpComparisonGenerator: def create_sheet(self, workbook: Workbook): ws = workbook.create_sheet(self._compare_type, 0) ws.sheet_properties.tabColor = Constant.YELLOW_COLOR - # write headers headers = self._base_headers + self._comparison_headers + [Constant.DIFF, Constant.OP_NAME_FILTER, Constant.DIFF_FILTER] + self.writer_headers(ws, headers) + # write lines + self._row_index = 3 + for data in self._compare_result_data: + base_event_list = TreeBuilder.get_total_compare_event(data[0], self._compare_type) if data[0] else [] + comparison_event_list = TreeBuilder.get_total_compare_event(data[1], self._compare_type) if data[1] else [] + diff = self.write_summary_lines(ws, headers, data, base_event_list, comparison_event_list) + self._row_index += 1 + EventListWrapper = namedtuple('EventListWrapper', ['base_event_list', 'comparison_event_list']) + event_list = EventListWrapper(base_event_list, comparison_event_list) + self.write_detail_lines(ws, headers, data, diff, event_list) + + def writer_headers(self, ws, headers): + # write headers base_trace_start_column = 0 comparison_trace_start_column = len(self._base_headers) diff_start_column = len(self._base_headers) + len(self._comparison_headers) @@ -70,90 +85,100 @@ class OpComparisonGenerator: ws.merge_cells(start_row=1, start_column=headers.index(Constant.DIFF_FILTER) + 1, end_row=2, end_column=headers.index(Constant.DIFF_FILTER) + 1) - # write lines - row_index = 3 - for data in self._compare_result_data: 
- # write summary lines - base_event_list = TreeBuilder.get_total_compare_event(data[0], self._compare_type) if data[0] else [] - comparison_event_list = TreeBuilder.get_total_compare_event(data[1], self._compare_type) if data[1] else [] - base_summary_data, comparison_summary_data = [Constant.NA] * len(self._base_headers), \ - [Constant.NA] * len(self._comparison_headers) - if data[0]: - base_summary_data[0] = data[0].name - base_summary_data[1] = data[0].input_shape - base_summary_data[2] = data[0].input_type - base_summary_data[3] = sum( - [x.compare_index for x in base_event_list]) if base_event_list else Constant.NA - if data[1]: - comparison_summary_data[0] = data[1].name - comparison_summary_data[1] = data[1].input_shape - comparison_summary_data[2] = data[1].input_type - comparison_summary_data[3] = sum( - [x.compare_index for x in comparison_event_list]) if comparison_event_list else Constant.NA - if base_event_list and comparison_event_list and base_summary_data[3]: - diff = (comparison_summary_data[3] - base_summary_data[3]) / base_summary_data[3] - else: - diff = Constant.NA - op_name = data[0].name if data[0] else data[1].name + def write_summary_lines(self, ws, headers, data, base_event_list, comparison_event_list): + def ws_write_diff(ws, index, value): + ws.cell(row=self._row_index, column=index + 1).number_format = '0.00%' + if value != Constant.NA and value < 0: + ws.cell(row=self._row_index, column=index + 1).font = Font(name='Arial', color=Constant.GREEN_COLOR) + elif value != Constant.NA and value >= 0: + ws.cell(row=self._row_index, column=index + 1).font = Font(name='Arial', color=Constant.RED_COLOR) + + def ws_write_diff_filter(ws, index, diff_value): + if diff_value != Constant.NA and diff_value < 0: + ws.cell(row=self._row_index, column=index + 1).fill = PatternFill("solid", + fgColor=Constant.GREEN_COLOR) + elif diff_value != Constant.NA and diff_value >= 0: + ws.cell(row=self._row_index, column=index + 1).fill = PatternFill("solid", fgColor=Constant.RED_COLOR) + # write summary lines + base_summary_data, comparison_summary_data = [Constant.NA] * len(self._base_headers), \ + [Constant.NA] * len(self._comparison_headers) + if data[0]: + base_summary_data[0] = data[0].name + base_summary_data[1] = data[0].input_shape + base_summary_data[2] = data[0].input_type + base_summary_data[3] = sum( + [x.compare_index for x in base_event_list]) if base_event_list else Constant.NA + if data[1]: + comparison_summary_data[0] = data[1].name + comparison_summary_data[1] = data[1].input_shape + comparison_summary_data[2] = data[1].input_type + comparison_summary_data[3] = sum( + [x.compare_index for x in comparison_event_list]) if comparison_event_list else Constant.NA + if base_event_list and comparison_event_list and base_summary_data[3]: + diff = (comparison_summary_data[3] - base_summary_data[3]) / base_summary_data[3] + else: + diff = Constant.NA + op_name = data[0].name if data[0] else data[1].name + + summary_data = base_summary_data + comparison_summary_data + [diff, op_name, Constant.NA] + if len(summary_data) < len(headers): + raise RuntimeError("Failed to write summary lines!") + for index, header_name in enumerate(headers): + value = summary_data[index] + if header_name == Constant.DIFF: + ws_write_diff(ws, index, value) + if header_name == Constant.DIFF_FILTER: + diff_value = summary_data[headers.index(Constant.DIFF)] + ws_write_diff_filter(ws, index, diff_value) + elif header_name != Constant.OP_NAME_FILTER: + ws.cell(row=self._row_index, column=index + 1).fill =
PatternFill("solid", + fgColor=Constant.SUMMARY_LINE_COLOR) - summary_data = base_summary_data + comparison_summary_data + [diff, op_name, Constant.NA] - for index in range(len(headers)): - value = summary_data[index] - if headers[index] == Constant.DIFF: - ws.cell(row=row_index, column=index + 1).number_format = '0.00%' - if value != Constant.NA and value < 0: - ws.cell(row=row_index, column=index + 1).font = Font(name='Arial', color=Constant.GREEN_COLOR) - elif value != Constant.NA and value >= 0: - ws.cell(row=row_index, column=index + 1).font = Font(name='Arial', color=Constant.RED_COLOR) - if headers[index] == Constant.DIFF_FILTER: - diff_value = summary_data[headers.index(Constant.DIFF)] - if diff_value != Constant.NA and diff_value < 0: - ws.cell(row=row_index, column=index + 1).fill = PatternFill("solid", - fgColor=Constant.GREEN_COLOR) - elif diff_value != Constant.NA and diff_value >= 0: - ws.cell(row=row_index, column=index + 1).fill = PatternFill("solid", fgColor=Constant.RED_COLOR) - elif headers[index] != Constant.OP_NAME_FILTER: - ws.cell(row=row_index, column=index + 1).fill = PatternFill("solid", - fgColor=Constant.SUMMARY_LINE_COLOR) + if value != Constant.NA: + ws.cell(row=self._row_index, column=index + 1).value = value + bold = header_name == Constant.OP_NAME + if header_name != Constant.DIFF: + ws.cell(row=self._row_index, column=index + 1).font = Font(name='Arial', bold=bold) + ws.cell(row=self._row_index, column=index + 1).border = Constant.BORDER + return diff - if value != Constant.NA: - ws.cell(row=row_index, column=index + 1).value = value - bold = headers[index] == Constant.OP_NAME - if headers[index] != Constant.DIFF: - ws.cell(row=row_index, column=index + 1).font = Font(name='Arial', bold=bold) - ws.cell(row=row_index, column=index + 1).border = Constant.BORDER - row_index += 1 + def write_detail_lines(self, ws, headers, data, diff, event_list): + def ws_write_helper(ws, colum_index, value, diff, headers): + if value != Constant.NA: + ws.cell(row=self._row_index, column=colum_index + 1).value = value + bold = headers[colum_index] == Constant.OP_NAME + ws.cell(row=self._row_index, column=colum_index + 1).font = Font(name='Arial', bold=bold) + ws.cell(row=self._row_index, column=colum_index + 1).border = Constant.BORDER + if headers[colum_index] == Constant.DIFF_FILTER: + if diff != Constant.NA and diff < 0: + ws.cell(row=self._row_index, column=colum_index + 1).fill = PatternFill("solid", + fgColor=Constant.GREEN_COLOR) + elif diff != Constant.NA and diff >= 0: + ws.cell(row=self._row_index, column=colum_index + 1).fill = PatternFill("solid", + fgColor=Constant.RED_COLOR) + if headers[colum_index] == Constant.OP_NAME: + ws.cell(row=self._row_index, column=colum_index + 1).alignment = Alignment(horizontal="center", + vertical="center") - # write detail lines - base_event_num, comparison_event_num = len(base_event_list), len(comparison_event_list) - for index in range(max(base_event_num, comparison_event_num)): - base_detail_data, comparison_detail_data = [Constant.NA] * len(self._base_headers), \ - [Constant.NA] * len(self._comparison_headers) - base_detail_data[0] = "|" - comparison_detail_data[0] = "|" - if index < base_event_num: - base_event = base_event_list[index] - base_detail_data[1:] = base_event.get_record() - if index < comparison_event_num: - comparison_event = comparison_event_list[index] - comparison_detail_data[1:] = comparison_event.get_record() + base_event_list = event_list.base_event_list + comparison_event_list = 
event_list.comparison_event_list + # write detail lines + op_name = data[0].name if data[0] else data[1].name + base_event_num, comparison_event_num = len(base_event_list), len(comparison_event_list) + for index in range(max(base_event_num, comparison_event_num)): + base_detail_data, comparison_detail_data = [Constant.NA] * len(self._base_headers), \ + [Constant.NA] * len(self._comparison_headers) + base_detail_data[0] = "|" + comparison_detail_data[0] = "|" + if index < base_event_num: + base_event = base_event_list[index] + base_detail_data[1:] = base_event.get_record() + if index < comparison_event_num: + comparison_event = comparison_event_list[index] + comparison_detail_data[1:] = comparison_event.get_record() - detail_data = base_detail_data + comparison_detail_data + [Constant.NA, op_name, Constant.NA] - for colum_index in range(len(headers)): - value = detail_data[colum_index] - if value != Constant.NA: - ws.cell(row=row_index, column=colum_index + 1).value = value - bold = headers[colum_index] == Constant.OP_NAME - ws.cell(row=row_index, column=colum_index + 1).font = Font(name='Arial', bold=bold) - ws.cell(row=row_index, column=colum_index + 1).border = Constant.BORDER - if headers[colum_index] == Constant.DIFF_FILTER: - if diff != Constant.NA and diff < 0: - ws.cell(row=row_index, column=colum_index + 1).fill = PatternFill("solid", - fgColor=Constant.GREEN_COLOR) - elif diff != Constant.NA and diff >= 0: - ws.cell(row=row_index, column=colum_index + 1).fill = PatternFill("solid", - fgColor=Constant.RED_COLOR) - if headers[colum_index] == Constant.OP_NAME: - ws.cell(row=row_index, column=colum_index + 1).alignment = Alignment(horizontal="center", - vertical="center") - row_index += 1 + detail_data = base_detail_data + comparison_detail_data + [Constant.NA, op_name, Constant.NA] + for colum_index in range(len(headers)): + value = detail_data[colum_index] + ws_write_helper(ws, colum_index, value, diff, headers) + self._row_index += 1 diff --git a/profiler/compare_tools/performance_compare.py b/profiler/compare_tools/performance_compare.py index 2d98fce4be613ea5eedfd4ebb57cf19dc10d837d..5bcfada0bb5fcc14d927f7387c9954160aed7b24 100644 --- a/profiler/compare_tools/performance_compare.py +++ b/profiler/compare_tools/performance_compare.py @@ -6,10 +6,12 @@ import sys import time sys.path.append( - os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "cluster_analyse", "common_func")) + os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "cluster_analyse")) + from generation.comparison_generator import ComparisonGenerator from utils.args_manager import ArgsManager from profiling_analysis.profiling_parse import prof_main +from common_func.path_manager import PathManager def performance_compare(args): @@ -46,7 +48,7 @@ def main(): dir_path = args.output_path if args.output_path else "./" file_name = "performance_comparison_result_{}.xlsx".format( time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) - result_file_path = os.path.realpath(os.path.join(dir_path, file_name)) + result_file_path = PathManager.get_realpath(os.path.join(dir_path, file_name)) ComparisonGenerator(args).create_excel(result_file_path) print(f"[INFO] The comparison result file has been generated: {result_file_path}") diff --git a/profiler/compare_tools/profiling_analysis/gpu_parser.py b/profiler/compare_tools/profiling_analysis/gpu_parser.py index b7999257894ea336cf2cf857a05a17a67cc17419..4443562bd4edae71d30c0314ea22756a8d20b534 100644 --- 
a/profiler/compare_tools/profiling_analysis/gpu_parser.py +++ b/profiler/compare_tools/profiling_analysis/gpu_parser.py @@ -13,11 +13,33 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from collections import Counter, defaultdict import pandas as pd import profiling_analysis.parser_helper as parser_helper from utils.file_reader import FileReader +from utils.constant import Constant + + +class OpTimeWarper: + def __init__( + self, + cube_time: float = 0.0, + fa_time_fwd: float = 0.0, + fa_time_bwd: float = 0.0, + all_op_time: float = 0.0, + compute_stream_dur: float = 0.0, + cube_num: int = 0, + vec_num: int = 0 + ): + self.cube_time = cube_time + self.fa_time_fwd = fa_time_fwd + self.fa_time_bwd = fa_time_bwd + self.all_op_time = all_op_time + self.compute_stream_dur = compute_stream_dur + self.cube_num = cube_num + self.vec_num = vec_num class GpuProfilingParser: @@ -37,17 +59,14 @@ class GpuProfilingParser: return True return False - def parse_events(self): + def update_op_list(self, op_list, marks): cube_time = 0.0 all_op_time = 0.0 fa_time_bwd = 0.0 fa_time_fwd = 0.0 cube_num = 0 vec_num = 0 - op_list = [] compute_stream_dur = 0.0 - marks = defaultdict(int) # mark for compute communication_not_overlapped time - for event in self.trace_events: if not isinstance(event, dict): continue @@ -80,8 +99,30 @@ class GpuProfilingParser: vec_num += 1 all_op_time += float(dur) op_list.append([ts, name, cat, dur]) - op_dataframe = pd.DataFrame(op_list, columns=['time start', 'name', 'cat', 'dur']) - op_dataframe.to_csv('gpu_perf.csv', index=False) + time_wrapper = OpTimeWarper( + cube_time=cube_time, + fa_time_fwd=fa_time_fwd, + fa_time_bwd=fa_time_bwd, + all_op_time=all_op_time, + compute_stream_dur=compute_stream_dur, + cube_num=cube_num, + vec_num=vec_num + ) + return time_wrapper + + def parse_events(self): + op_list = [] + marks = defaultdict(int) # mark for compute communication_not_overlapped time + + time_wrapper = self.update_op_list(op_list, marks) + cube_time = time_wrapper.cube_time + fa_time_fwd = time_wrapper.fa_time_fwd + fa_time_bwd = time_wrapper.fa_time_bwd + all_op_time = time_wrapper.all_op_time + compute_stream_dur = time_wrapper.compute_stream_dur + cube_num = time_wrapper.cube_num + vec_num = time_wrapper.vec_num + self.profiling_info.compute_time = len([_ for _, value in marks.items() if value < 0]) / 10 ** 6 self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6 self.profiling_info.flash_attention_time_bwd = fa_time_bwd / 10 ** 6 @@ -94,7 +135,10 @@ class GpuProfilingParser: self.profiling_info.scheduling_time = self.profiling_info.e2e_time - all_op_time / 10 ** 6 - \ self.profiling_info.communication_not_overlapped - self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time + if self.profiling_info.e2e_time < Constant.EPS: + self.profiling_info.scheduling_ratio = 0.0 + else: + self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time self.parse_memory_reserved() def parse_e2e_time(self): diff --git a/profiler/compare_tools/profiling_analysis/npu_parser.py b/profiler/compare_tools/profiling_analysis/npu_parser.py index 73eeb1fd00bb5b748fec4d49a9d03944ae399dca..2c71b0dc4a5e10f5f40f70da618dd017cfc461f7 100644 --- a/profiler/compare_tools/profiling_analysis/npu_parser.py +++ b/profiler/compare_tools/profiling_analysis/npu_parser.py @@ -14,10 +14,32 @@ # 
limitations under the License. import sys -import pandas as pd from collections import defaultdict +import pandas as pd import profiling_analysis.parser_helper as parser_helper from utils.file_reader import FileReader +from common_func.path_manager import PathManager +from common_func.file_manager import FileManager + + +class NpuInfoWrapper: + def __init__( + self, + compute_time: int, + communication_time: int, + is_cluster: bool, + event_wait_sqe: dict, + ai_core_dict: dict, + event_wait_sqe_res: dict, + ai_core_res: dict, + ): + self.compute_time = compute_time + self.communication_time = communication_time + self.is_cluster = is_cluster + self.event_wait_sqe = event_wait_sqe + self.ai_core_dict = ai_core_dict + self.event_wait_sqe_res = event_wait_sqe_res + self.ai_core_res = ai_core_res class NpuProfilingParser: @@ -65,7 +87,19 @@ class NpuProfilingParser: communication_time += dur min_ts = ts if ts < min_ts else min_ts max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts + npu_info_wrapper = NpuInfoWrapper( + compute_time, communication_time, is_cluster, + event_wait_sqe, ai_core_dict, event_wait_sqe_res, ai_core_res) + self.update_npu_info(max_ts - min_ts, npu_info_wrapper) + def update_npu_info(self, ts_dur, npu_info_wrapper): + compute_time = npu_info_wrapper.compute_time + communication_time = npu_info_wrapper.communication_time + is_cluster = npu_info_wrapper.is_cluster + event_wait_sqe = npu_info_wrapper.event_wait_sqe + ai_core_dict = npu_info_wrapper.ai_core_dict + event_wait_sqe_res = npu_info_wrapper.event_wait_sqe_res + ai_core_res = npu_info_wrapper.ai_core_res # a stream where AI_CORE and EVENT_WAIT_SQE coexist is the compute stream compute_stream = [] parallel_stream = [] @@ -87,7 +121,7 @@ class NpuProfilingParser: self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) self.profiling_info.compute_time = compute_time / 10 ** 6 if is_cluster else \ ai_core_res[compute_stream[0]] / 10 ** 6 - self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6 if is_cluster else \ + self.profiling_info.e2e_time = ts_dur / 10 ** 6 if is_cluster else \ (self.max_stream_ts - self.min_stream_ts) / 10 ** 6 self.profiling_info.communication_not_overlapped = communication_time / 10 ** 6 \ if is_cluster else (event_wait_sqe_res[compute_stream[0]] - self.parallel_time) / 10 ** 6 @@ -116,6 +150,8 @@ if not self.npu_summary_file: print('[WARNING] Npu kernel details csv file is not available.') return + PathManager.check_path_readable(self.npu_summary_file) + FileManager.check_file_size(self.npu_summary_file) info = pd.read_csv(self.npu_summary_file, index_col=None) cube_time = 0.0 vec_time = 0.0 @@ -155,6 +191,8 @@ print('[INFO] Npu op memory csv file is not available.') return try: + PathManager.check_path_readable(self.npu_mem_file) + FileManager.check_file_size(self.npu_mem_file) info = pd.read_csv(self.npu_mem_file, usecols=['Total Reserved(MB)'], index_col=None) except ValueError: print('[ERROR] Load memory info failed.') diff --git a/profiler/compare_tools/utils/args_manager.py b/profiler/compare_tools/utils/args_manager.py index 48bcdd76997b4b25ad80dc4dcc7b438b0b7a8c56..632333b7c1168a274facdab5dfe3892639a543f4 100644 --- a/profiler/compare_tools/utils/args_manager.py +++ b/profiler/compare_tools/utils/args_manager.py @@ -1,6 +1,6 @@ import os.path -from path_manager import PathManager +from common_func.path_manager import PathManager from utils.constant import Constant from utils.file_reader import FileReader from utils.profiling_parser
import GPUProfilingParser, NPUProfilingParser diff --git a/profiler/compare_tools/utils/constant.py b/profiler/compare_tools/utils/constant.py index 360c2ab44ae8f56c1708bb2c8213c357445ffcb4..828dc758014a397e8329fc0658141272c160322d 100644 --- a/profiler/compare_tools/utils/constant.py +++ b/profiler/compare_tools/utils/constant.py @@ -15,6 +15,9 @@ class Constant(object): RED_COLOR = "00FF0000" SUMMARY_LINE_COLOR = "F0F8FF" + # epsilon + EPS = 1e-15 + # authority FILE_AUTHORITY = 0o640 DIR_AUTHORITY = 0o750 diff --git a/profiler/compare_tools/utils/file_reader.py b/profiler/compare_tools/utils/file_reader.py index 37853f41f4231217f8d8ece3b1dbc10fd7934b4e..ef0287b35f862ca5bd807de498cc8684256d7c43 100644 --- a/profiler/compare_tools/utils/file_reader.py +++ b/profiler/compare_tools/utils/file_reader.py @@ -2,7 +2,7 @@ import csv import json import os -from path_manager import PathManager +from common_func.path_manager import PathManager from utils.constant import Constant @@ -19,16 +19,16 @@ class FileReader: return [] if file_size > Constant.MAX_FILE_SIZE: check_msg = input( - f"The file({file_path}) size exceeds the preset max value, do you continue reading the file? [y/n]") + f"The file({file_path}) size exceeds the preset max value. Continue reading the file? [y/n]") if check_msg.lower() != "y": print(f"[WARNING] The user chose not to read the file: {file_path}") return [] try: with open(file_path, "rt") as file: json_data = json.loads(file.read()) - except Exception: + except Exception as e: msg = f"Can't read file: {file_path}" - raise RuntimeError(msg) + raise RuntimeError(msg) from e return json_data @classmethod @@ -41,7 +41,7 @@ return [] if file_size > Constant.MAX_FILE_SIZE: check_msg = input( - f"The file({file_path}) size exceeds the preset max value, do you continue reading the file? [y/n]") + f"The file({file_path}) size exceeds the preset max value. Continue reading the file?
[y/n]") if check_msg.lower() != "y": print(f"[WARNING] The user choose not to read the file: {file_path}") return [] @@ -51,9 +51,9 @@ class FileReader: reader = csv.DictReader(csv_file) for row in reader: result_data.append(row) - except Exception: + except Exception as e: msg = f"Failed to read the file: {file_path}" - raise RuntimeError(msg) + raise RuntimeError(msg) from e return result_data @classmethod diff --git a/profiler/compare_tools/utils/profiling_parser.py b/profiler/compare_tools/utils/profiling_parser.py index b49217b844bba89d97287fd6df91d4feac668255..a94887ecc2f6a2b6069d031f0cfada2537f8cf46 100644 --- a/profiler/compare_tools/utils/profiling_parser.py +++ b/profiler/compare_tools/utils/profiling_parser.py @@ -1,4 +1,4 @@ -from abc import ABCMeta, abstractmethod +from abc import abstractmethod from math import ceil from utils.compare_event import KernelEvent @@ -7,31 +7,16 @@ from utils.file_reader import FileReader from utils.trace_event_data import TraceEventData -class ProfilingParser(metaclass=ABCMeta): - @abstractmethod - def get_torch_op_data(self): - raise NotImplementedError - - @abstractmethod - def get_kernel_dict(self): - raise NotImplementedError - - @abstractmethod - def get_memory_list(self): - raise NotImplementedError - - -class GPUProfilingParser(ProfilingParser): +class ProfilingParser: def __init__(self, args: any, path_dict: dict): self._args = args self._profiling_path = path_dict.get(Constant.PROFILING_PATH) - self._json_path = path_dict.get(Constant.PROFILING_PATH) self._torch_op_data = None self._kernel_dict = None self._memory_list = None self._communication_data = None self._communication_task_data = None - + @property def file_path(self) -> str: return self._profiling_path @@ -70,6 +55,24 @@ class GPUProfilingParser(ProfilingParser): self.get_communication_data() return self._communication_task_data + @abstractmethod + def get_torch_op_data(self): + raise NotImplementedError + + @abstractmethod + def get_kernel_dict(self): + raise NotImplementedError + + @abstractmethod + def get_memory_list(self): + raise NotImplementedError + + +class GPUProfilingParser(ProfilingParser): + def __init__(self, args: any, path_dict: dict): + super().__init__(args, path_dict) + self._json_path = path_dict.get(Constant.PROFILING_PATH) + def get_torch_op_data(self): torch_op_list = [] json_data = FileReader.read_trace_file(self._json_path) @@ -144,53 +147,9 @@ class GPUProfilingParser(ProfilingParser): class NPUProfilingParser(ProfilingParser): def __init__(self, args: any, path_dict: str): - self._args = args - self._profiling_path = path_dict.get(Constant.PROFILING_PATH) + super().__init__(args, path_dict) self._json_path = path_dict.get(Constant.TRACE_PATH) self._memory_data_path = path_dict.get(Constant.MEMORY_DATA_PATH) - self._torch_op_data = None - self._kernel_dict = None - self._memory_list = None - self._communication_data = None - self._communication_task_data = None - - @property - def file_path(self) -> str: - return self._profiling_path - - @property - def json_path(self) -> str: - return self._json_path - - @property - def torch_op_data(self) -> list: - if self._torch_op_data is None: - self.get_torch_op_data() - return self._torch_op_data - - @property - def kernel_dict(self) -> dict: - if self._kernel_dict is None: - self.get_kernel_dict() - return self._kernel_dict - - @property - def memory_list(self) -> list: - if self._memory_list is None: - self.get_memory_list() - return self._memory_list - - @property - def communication_data(self) -> dict: - if 
@@ -246,7 +205,7 @@ class NPUProfilingParser(ProfilingParser):
             match_dequeue_data = self._match_cann_memory_data(dequeue_data, ts_time)
             if match_dequeue_data is not None:
                 correlation_id = match_dequeue_data.get("args", {}).get("correlation_id", "")
-                ts = enqueue_dict[correlation_id].get("ts", 0)
+                ts = enqueue_dict.get(correlation_id, {}).get("ts", 0)
                 self._memory_list.append({Constant.SIZE: float(data.get(Constant.SIZE, 0)), Constant.TS: ts,
                                           Constant.NAME: data.get(Constant.NAME, ""),
                                           Constant.ALLOCATION_TIME: float(data.get(Constant.ALLOCATION_TIME, 0)),
@@ -272,55 +231,70 @@ class NPUProfilingParser(ProfilingParser):
         return dequeue_data[left] if end_time > ts_time else None
 
     def get_communication_data(self):
+        def get_pid(json_data):
+            pid = None
+            for data in json_data:
+                trace_event = TraceEventData(data)
+                if not trace_event.is_process_meta():
+                    continue
+                if trace_event.is_hccl_process():
+                    pid = trace_event.pid
+                    break
+            return pid
+
+        def get_tid_list(pid, tid_list, json_data):
+            for data in json_data:
+                trace_event = TraceEventData(data)
+                if not trace_event.is_thread_meta():
+                    continue
+                if trace_event.pid != pid:
+                    continue
+                if trace_event.is_communication_op_thread():
+                    tid_list.append(trace_event.tid)
+
+        def get_comm_data(pid, tid_list, json_data):
+            for data in json_data:
+                trace_event = TraceEventData(data)
+                if not trace_event.is_x_mode():
+                    continue
+                if trace_event.pid != pid:
+                    continue
+                if trace_event.tid in tid_list:
+                    self._communication_data.append(data)
+
+        def get_comm_task_data(pid, tid_list, json_data):
+            for data in json_data:
+                trace_event = TraceEventData(data)
+                if not trace_event.is_x_mode():
+                    continue
+                if trace_event.pid != pid:
+                    continue
+                if trace_event.tid in tid_list:
+                    continue
+                ts = trace_event.start_time
+                for communication_op in self._communication_data:
+                    comm_op_event = TraceEventData(communication_op)
+                    if ts < comm_op_event.start_time or ts > comm_op_event.end_time:
+                        continue
+                    name_list = communication_op.get("name", "").split("_")
+                    if len(name_list) >= 2:
+                        self._communication_task_data.setdefault(name_list[1].lower(), []).append(data)
+                    break
+
         self._communication_data, self._communication_task_data = [], {}
         json_data = FileReader.read_trace_file(self._json_path)
-        pid = None
-        for data in json_data:
-            trace_event = TraceEventData(data)
-            if not trace_event.is_process_meta():
-                continue
-            if trace_event.is_hccl_process():
-                pid = trace_event.pid
-                break
+
+        pid = get_pid(json_data)
         if pid is None:
             return
-        tid_list = []
-        for data in json_data:
-            trace_event = TraceEventData(data)
-            if not trace_event.is_thread_meta():
-                continue
-            if trace_event.pid != pid:
-                continue
-            if trace_event.is_communication_op_thread():
-                tid_list.append(trace_event.tid)
+
+        tid_list = []
+        get_tid_list(pid, tid_list, json_data)
         if not tid_list:
            return
-        for data in json_data:
-            trace_event = TraceEventData(data)
-            if not trace_event.is_x_mode():
-                continue
-            if trace_event.pid != pid:
-                continue
-            if trace_event.tid in tid_list:
-                self._communication_data.append(data)
+
+        get_comm_data(pid, tid_list, json_data)
        if not self._communication_data:
            return
-        for data in json_data:
-            trace_event = TraceEventData(data)
-            if not trace_event.is_x_mode():
-                continue
-            if trace_event.pid != pid:
-                continue
-            if trace_event.tid in tid_list:
-                continue
-            ts = trace_event.start_time
-            for communication_op in self._communication_data:
-                comm_op_event = TraceEventData(communication_op)
-                if ts < comm_op_event.start_time or ts > comm_op_event.end_time:
-                    continue
-                name_list = communication_op.get("name", "").split("_")
-                if len(name_list) >= 2:
-                    self._communication_task_data.setdefault(name_list[1].lower(), []).append(data)
-                break
+
+        get_comm_task_data(pid, tid_list, json_data)
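The one-line memory fix above swaps a direct `enqueue_dict[correlation_id]` lookup for a chained `.get()`, so a dequeue record whose correlation id never produced an enqueue entry degrades to `ts = 0` instead of raising `KeyError` and aborting the whole parse. A toy demo of the behavior:

```python
# A correlation_id missing from enqueue_dict now falls back to ts = 0.
enqueue_dict = {42: {"ts": 1717.5}}

for correlation_id in (42, 99):  # 99 is an unmatched id
    ts = enqueue_dict.get(correlation_id, {}).get("ts", 0)
    print(correlation_id, ts)
# 42 1717.5
# 99 0
```
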
diff --git a/profiler/compare_tools/utils/tree_builder.py b/profiler/compare_tools/utils/tree_builder.py
index 6d765eb0d70b2c58e14c52ff66c09598d4ef2061..b08aa6b9703e7b3cfce8db413ea6330659300cd5 100644
--- a/profiler/compare_tools/utils/tree_builder.py
+++ b/profiler/compare_tools/utils/tree_builder.py
@@ -1,4 +1,5 @@
 from queue import Queue
+from typing import Optional, Dict, List
 
 from utils.constant import Constant
 from utils.torch_op_node import TorchOpNode
@@ -21,7 +22,25 @@ class TreeBuilder:
         return root_node
 
     @classmethod
-    def update_tree_node(cls, root_node: TorchOpNode, flow_kernel_dict: dict = {}, memory_allocated_list: list = []):
+    def update_tree_node(
+            cls,
+            root_node: TorchOpNode,
+            flow_kernel_dict: Optional[Dict] = None,
+            memory_allocated_list: Optional[List] = None,
+    ):
+        def set_kernel_helper(node_queue, ts, kernel_num, kernel_list):
+            while not node_queue.empty():
+                tree_node = node_queue.get()
+                tree_node.add_kernel_num(kernel_num)
+                matched_child_node = tree_node.match_child_node(ts)
+                if matched_child_node:
+                    node_queue.put(matched_child_node)
+                else:
+                    tree_node.set_kernel_list(kernel_list)
+
+        flow_kernel_dict = flow_kernel_dict if flow_kernel_dict else {}
+        memory_allocated_list = memory_allocated_list if memory_allocated_list else []
+
         if flow_kernel_dict:
             for ts, kernel_list in flow_kernel_dict.items():
                 matched_child_node = root_node.match_child_node(ts)
@@ -30,14 +49,8 @@ class TreeBuilder:
                 kernel_num = len(kernel_list)
                 node_queue = Queue()
                 node_queue.put(matched_child_node)
-                while not node_queue.empty():
-                    tree_node = node_queue.get()
-                    tree_node.add_kernel_num(kernel_num)
-                    matched_child_node = tree_node.match_child_node(ts)
-                    if matched_child_node:
-                        node_queue.put(matched_child_node)
-                    else:
-                        tree_node.set_kernel_list(kernel_list)
+                set_kernel_helper(node_queue, ts, kernel_num, kernel_list)
+
         for memory_allocated in memory_allocated_list:
             ts = memory_allocated.get(Constant.TS)
             matched_child_node = root_node.match_child_node(ts)
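The `Optional[...] = None` rewrite in `update_tree_node` removes a classic Python pitfall: `dict = {}` and `list = []` defaults are evaluated once, at function definition time, and the same objects are then shared by every call that omits the argument. A minimal demonstration of the bug and the fix pattern used in the patch (function names here are illustrative):

```python
def buggy(item, acc=[]):          # one list object reused across all calls
    acc.append(item)
    return acc


def fixed(item, acc=None):        # fresh list per call unless one is passed in
    acc = acc if acc else []
    acc.append(item)
    return acc


print(buggy(1), buggy(2))   # [1, 2] [1, 2] -- state leaks between calls
print(fixed(1), fixed(2))   # [1] [2]
```
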