diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py
index e8b5814e11edc9903e9237f4060e0b2a1da2cf67..a17288c87fffc763eb193e46ab170354bb35b6af 100644
--- a/debug/accuracy_tools/msprobe/core/common/const.py
+++ b/debug/accuracy_tools/msprobe/core/common/const.py
@@ -149,6 +149,10 @@ class Const:
     MODULE_PREFIX = ["Module", "Cell"]
     FORWARD_NAME_SUFFIX = ".forward"
 
+    DUMP_JSON_FILE = "dump_json_file"
+    DEBUG_JSON_FILE = "debug_json_file"
+    STACK_JSON_FILE = "stack_json_file"
+
     # struct json param
     ORIGIN_DATA = "origin_data"
     SCOPE = "scope"
@@ -236,6 +240,8 @@ class Const:
     MEAN = 'Mean'
     NORM = 'Norm'
     DATA_NAME = 'data_name'
+    STATE = 'state'
+    API_ORIGIN_NAME = 'api_origin_name'
     TENSOR_STAT_INDEX = 'tensor_stat_index'
 
     SUMMARY_METRICS_LIST = [MAX, MIN, MEAN, NORM]
@@ -391,6 +397,7 @@ class CompareConst:
    """
    Class for compare module const
    """
    SPACE = " "
+    NAME = "Name"
    # compare result column name
    NPU_NAME = "NPU Name"
    BENCH_NAME = "Bench Name"
@@ -434,10 +441,11 @@ class CompareConst:
    OUTPUT_STRUCT = "output_struct"
    PARAMS_STRUCT = "params_struct"
    PARAMS_GRAD_STRUCT = "params_grad_struct"
+    DEBUG_STRUCT = "debug_struct"
    SUMMARY = "summary"
    COMPARE_RESULT = "compare_result"
    COMPARE_MESSAGE = "compare_message"
-    MAX_EXCEL_LENGTH = 1048576
+    MAX_EXCEL_LENGTH = 1048500
    YES = "Yes"
    NO = "No"
    STATISTICS_INDICATOR_NUM = 4
@@ -540,7 +548,8 @@ class CompareConst:
        Const.KWARGS: INPUT_STRUCT,
        Const.OUTPUT: OUTPUT_STRUCT,
        Const.PARAMS: PARAMS_STRUCT,
-        Const.PARAMS_GRAD: PARAMS_GRAD_STRUCT
+        Const.PARAMS_GRAD: PARAMS_GRAD_STRUCT,
+        Const.DEBUG: DEBUG_STRUCT
    }
 
    # compare standard
@@ -643,9 +652,9 @@ class CompareConst:
    OP_NAME_X = 'op_name_x'
 
    MATCH_RESULT_COLUMNS = [
-        OP_NAME_X, 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'data_name_x',
+        OP_NAME_X, 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'state_x', 'api_origin_name_x', 'data_name_x',
        CMP_KEY, CMP_SHAPE,
-        'op_name_y', 'dtype_y', 'shape_y', 'summary_y', 'stack_info_y', 'data_name_y',
+        'op_name_y', 'dtype_y', 'shape_y', 'summary_y', 'stack_info_y', 'state_y', 'api_origin_name_y', 'data_name_y'
    ]
 
    INTERNAL_API_MAPPING_FILE = 'ms_to_pt_api.yaml'
diff --git a/debug/accuracy_tools/msprobe/core/common/utils.py b/debug/accuracy_tools/msprobe/core/common/utils.py
index 0ed64c34c4e4c48b13af4f1509ab995cdf533af0..f4ab06070a01364fcebb132e567f2c6392891f83 100644
--- a/debug/accuracy_tools/msprobe/core/common/utils.py
+++ b/debug/accuracy_tools/msprobe/core/common/utils.py
@@ -27,10 +27,15 @@ from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_pa
 from msprobe.core.common.const import Const, CompareConst
 from msprobe.core.common.log import logger
 from msprobe.core.common.exceptions import MsprobeException
+from msprobe.core.common.decorator import recursion_depth_decorator
 
 device = collections.namedtuple('device', ['type', 'index'])
 prefixes = ['api_stack', 'list', 'range', 'acl']
+file_suffix_to_file_type = {
+    "dump.json": Const.DUMP_JSON_FILE,
+    "debug.json": Const.DEBUG_JSON_FILE,
+}
 
 
 class MsprobeBaseException(Exception):
@@ -151,9 +156,10 @@ def check_compare_param(input_param, output_path, dump_mode, stack_mode):
         _check_json(stack_json, input_param.get("stack_json_path"))
 
 
-def check_configuration_param(stack_mode=False, auto_analyze=True, fuzzy_match=False, is_print_compare_log=True):
-    arg_list = [stack_mode, auto_analyze, fuzzy_match, is_print_compare_log]
-    arg_names = ['stack_mode', 'auto_analyze', 'fuzzy_match', 'is_print_compare_log']
+def 
check_configuration_param(stack_mode=False, auto_analyze=True, fuzzy_match=False, highlight=False, + is_print_compare_log=True): + arg_list = [stack_mode, auto_analyze, fuzzy_match, highlight, is_print_compare_log] + arg_names = ['stack_mode', 'auto_analyze', 'fuzzy_match', 'highlight', 'is_print_compare_log'] for arg, name in zip(arg_list, arg_names): if not isinstance(arg, bool): logger.error(f"Invalid input parameter, {name} which should be only bool type.") @@ -219,17 +225,33 @@ def format_value(value): return float('{:.12f}'.format(value)) -def md5_find(data): - for key_op in data: - for api_info in data[key_op]: - if isinstance(data[key_op][api_info], list): - for data_detail in data[key_op][api_info]: - if data_detail and 'md5' in data_detail: - return True - if isinstance(data[key_op][api_info], bool): - continue - elif data[key_op][api_info] and 'md5' in data[key_op][api_info]: +@recursion_depth_decorator('msprobe.core.common.utils.md5_find', max_depth=Const.DUMP_MAX_DEPTH) +def md5_find(data, json_type=Const.DUMP_JSON_FILE): + if json_type == Const.DUMP_JSON_FILE: + for key_op in data: + for api_info in data[key_op]: + if isinstance(data[key_op][api_info], list): + for data_detail in data[key_op][api_info]: + if data_detail and Const.MD5 in data_detail: + return True + if isinstance(data[key_op][api_info], bool): + continue + elif data[key_op][api_info] and Const.MD5 in data[key_op][api_info]: + return True + elif json_type == Const.DEBUG_JSON_FILE: + if isinstance(data, dict): + if Const.MD5 in data: return True + else: + for _, data_info in data.items(): + if md5_find(data_info, Const.DEBUG_JSON_FILE): + return True + elif isinstance(data, list): + for data_info in data: + if md5_find(data_info, Const.DEBUG_JSON_FILE): + return True + else: + return False return False @@ -267,15 +289,28 @@ def get_stack_construct_by_dump_json_path(dump_json_path): def set_dump_path(input_param): npu_path = input_param.get("npu_json_path", None) bench_path = input_param.get("bench_json_path", None) - npu_path_valid = npu_path is not None and npu_path.endswith("dump.json") - bench_path_valid = bench_path is not None and bench_path.endswith("dump.json") - if not npu_path_valid or not bench_path_valid: + dump_json_path_valid = npu_path is not None and npu_path.endswith("dump.json") and \ + bench_path is not None and bench_path.endswith("dump.json") + debug_json_path_valid = npu_path is not None and npu_path.endswith("debug.json") and \ + bench_path is not None and bench_path.endswith("debug.json") + if not dump_json_path_valid and not debug_json_path_valid: logger.error(f"Please check the json path is valid and ensure that neither npu_path nor bench_path is None.") raise CompareException(CompareException.INVALID_PATH_ERROR) input_param[CompareConst.NPU_DUMP_DATA_DIR] = os.path.join(os.path.dirname(npu_path), Const.DUMP_TENSOR_DATA) input_param[CompareConst.BENCH_DUMP_DATA_DIR] = os.path.join(os.path.dirname(bench_path), Const.DUMP_TENSOR_DATA) +def get_file_type(file_path): + if not isinstance(file_path, str): + logger.error("get_file_type failed, check the type of file_path.") + raise CompareException(CompareException.INVALID_PATH_ERROR) + file_type = file_suffix_to_file_type.get(file_path.split(Const.SCOPE_SEPARATOR)[-1]) + if file_type is None: + logger.error("get_file_type failed, file_path is neither dump.json nor debug.json.") + raise CompareException(CompareException.INVALID_PATH_ERROR) + return file_type + + def check_dump_json_key(json_data, device_type): task = json_data.get('task', None) 
if not task: @@ -296,6 +331,7 @@ def get_dump_mode(input_param): bench_path = input_param.get("bench_json_path", None) npu_json_data = load_json(npu_path) bench_json_data = load_json(bench_path) + json_type = get_file_type(file_path=npu_path) npu_task, npu_api_data = check_dump_json_key(npu_json_data, 'npu') bench_task, bench_api_data = check_dump_json_key(bench_json_data, 'bench') @@ -311,8 +347,8 @@ def get_dump_mode(input_param): return Const.STRUCTURE if npu_task == Const.STATISTICS: - npu_md5_compare = md5_find(npu_api_data) - bench_md5_compare = md5_find(bench_api_data) + npu_md5_compare = md5_find(npu_api_data, json_type) + bench_md5_compare = md5_find(bench_api_data, json_type) if npu_md5_compare == bench_md5_compare: return Const.MD5 if npu_md5_compare else Const.SUMMARY else: @@ -429,10 +465,10 @@ def get_real_step_or_rank(step_or_rank_input, obj): def check_init_step(step): if not is_int(step): raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, - f"{step} must be an integer") + f"{step} must be an integer") if not step >= 0: raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR, - f"{step} must be greater than or equal to 0") + f"{step} must be greater than or equal to 0") def check_token_range(token_range): diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 59c4b42ee2ef9d3fcd0b654f0472e0c49e7d6113..af9a518ff4f0d38fb336defcf84e3d023bfb2daa 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -25,10 +25,11 @@ from tqdm import tqdm from msprobe.core.advisor.advisor import Advisor from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import load_json, remove_path, create_directory, save_json +from msprobe.core.common.file_utils import load_json, remove_path, create_directory, save_excel, save_json from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException, add_time_with_xlsx, check_op_str_pattern_valid, \ - set_dump_path, get_dump_mode, check_compare_param, check_configuration_param, load_stack_json, add_time_with_json + set_dump_path, get_dump_mode, check_compare_param, check_configuration_param, load_stack_json, get_file_type, \ + add_time_with_json from msprobe.core.compare.check import check_dump_json_str, check_stack_json_str, cross_dtype_mapping from msprobe.core.compare.utils import merge_tensor, print_compare_ends_info, read_op, \ reorder_op_x_list, set_stack_json_path, check_api_info_len @@ -44,11 +45,13 @@ class ComparisonConfig: stack_mode: bool auto_analyze: bool fuzzy_match: bool + highlight: bool data_mapping: dict suffix: str cell_mapping: dict api_mapping: dict layer_mapping: dict + compared_file_type: str first_diff_analyze: bool @@ -58,14 +61,18 @@ class Comparator: self.mode_config = mode_config self.mapping_config = mapping_config self.cross_frame = is_cross_framework - self.mapping_dict = MappingDict(mapping_config) - def process_output_file(self, output_path, suffix): + def process_output_file(self, output_path, suffix, compared_file_type): + file_name_prefix_mapping = { + Const.DUMP_JSON_FILE: "compare_result", + Const.DEBUG_JSON_FILE: "debug_compare_result" + } + file_name_prefix = file_name_prefix_mapping.get(compared_file_type, "compare_result") if self.mode_config.first_diff_analyze: file_name = add_time_with_json("compare_result" + 
suffix) else: - file_name = add_time_with_xlsx("compare_result" + suffix) + file_name = add_time_with_xlsx(file_name_prefix + suffix) file_path = os.path.join(os.path.realpath(output_path), file_name) if os.path.exists(file_path): logger.warning(f"{file_path} will be deleted.") @@ -95,7 +102,7 @@ class Comparator: suffix = kwargs.get('suffix', '') # process output file - file_path = self.process_output_file(output_path, suffix) + file_path = self.process_output_file(output_path, suffix, self.mode_config.compared_file_type) # initialize the compare result table and compare general data(name, dtype, shape, statistics/md5, etc.) npu_json = input_param.get("npu_json_path") @@ -107,7 +114,7 @@ class Comparator: return if self.mode_config.first_diff_analyze: - first_diff_analyze = FirstDiffAnalyze() + first_diff_analyze = FirstDiffAnalyze(self.mode_config) check_result = first_diff_analyze.check(result_df) save_json(file_path, check_result, indent=4) logger.info(f"Saving json file to disk: {file_path}") @@ -118,11 +125,20 @@ class Comparator: compare_real_data = CompareRealData(self.file_reader, self.mode_config, self.cross_frame) result_df = compare_real_data.do_multi_process(input_param, result_df) - # highlight suspicious API - highlight_dict = {"red_rows": set(), "yellow_rows": set(), "red_lines": [], "yellow_lines": []} - highlight = HighLight(self.mode_config) - highlight.find_compare_result_error_rows(result_df, highlight_dict) - highlight.highlight_rows_xlsx(result_df, highlight_dict, file_path) + # save result excel file + logger.info(f'Saving result excel file in progress. The file path is: {file_path}.') + if self.mode_config.highlight and len(result_df) <= CompareConst.MAX_EXCEL_LENGTH: + # highlight if not too long + highlight_dict = {"red_rows": set(), "yellow_rows": set(), "red_lines": [], "yellow_lines": []} + highlight = HighLight(self.mode_config) + if self.mode_config.compared_file_type == Const.DUMP_JSON_FILE: + highlight.find_compare_result_error_rows(result_df, highlight_dict) + result_df.drop(columns=['state', 'api_origin_name'], inplace=True) # 删除中间数据,两列不落盘 + highlight.highlight_rows_xlsx(result_df, highlight_dict, file_path) + else: + # fallback to simple save without highlight + result_df.drop(columns=['state', 'api_origin_name'], inplace=True) # 删除中间数据,两列不落盘 + save_excel(file_path, result_df) # output compare analysis suggestions if self.mode_config.auto_analyze: @@ -153,6 +169,8 @@ class Comparator: match_result.loc[~match.gen_dtype_condition(match_result), bench_columns] = CompareConst.N_A # organize compare result table by renaming columns + if self.mode_config.dump_mode == Const.ALL and self.mode_config.first_diff_analyze: + self.mode_config.dump_mode = Const.SUMMARY create_table = CreateTable(self.mode_config) result_df, header = create_table.make_result_df(match_result) @@ -183,10 +201,12 @@ class ParseData: Const.DTYPE: [], Const.SHAPE: [], Const.SUMMARY: [], - Const.STACK_INFO: [] + Const.STACK_INFO: [], + Const.STATE: [], + Const.API_ORIGIN_NAME: [] } if self.mode_config.dump_mode == Const.ALL: - result['data_name'] = [] + result[Const.DATA_NAME] = [] elif self.mode_config.dump_mode == Const.MD5: result[Const.MD5] = [] @@ -207,20 +227,22 @@ class ParseData: op_name_list = merge_list.get(CompareConst.OP_NAME) summary_list = merge_list.get(Const.SUMMARY) - data_name_list = merge_list.get('data_name') - op_name_reorder, summary_reorder, data_name_reorder = reorder_op_x_list(op_name_list, - summary_list, - data_name_list) + data_name_list = 
merge_list.get(Const.DATA_NAME) + state_list = merge_list.get(Const.STATE) + op_name_reorder, summary_reorder, data_name_reorder, state_reorder = reorder_op_x_list(op_name_list, + summary_list, + data_name_list, + state_list) # 遍历单个API的所有item - for index, op_name in enumerate(op_name_reorder): + for index, (op_name, state) in enumerate(zip(op_name_reorder, state_reorder)): result[CompareConst.OP_NAME].append(op_name) - if (CompareConst.INPUT_PATTERN in op_name) or (CompareConst.KWARGS_PATTERN in op_name): + if state == Const.INPUT or state == Const.KWARGS: info_list = merge_list[CompareConst.INPUT_STRUCT] - elif CompareConst.OUTPUT_PATTERN in op_name: + elif state == Const.OUTPUT: info_list = merge_list[CompareConst.OUTPUT_STRUCT] - elif CompareConst.PARAMS_PATTERN in op_name: + elif state == Const.PARAMS: info_list = merge_list[CompareConst.PARAMS_STRUCT] - elif CompareConst.PARAMS_GRAD_PATTERN in op_name: + elif state == Const.PARAMS_GRAD: info_list = merge_list[CompareConst.PARAMS_GRAD_STRUCT] else: info_list = merge_list[CompareConst.DEBUG_STRUCT] @@ -245,14 +267,18 @@ class ParseData: if self.mode_config.dump_mode == Const.ALL: check_api_info_len(op_name, data_name_reorder, 1) - result['data_name'].append(data_name_reorder.pop(0)) + result[Const.DATA_NAME].append(data_name_reorder.pop(0)) + + result[Const.STATE].append(state) + result[Const.API_ORIGIN_NAME].append(data_name) progress_bar.update(1) progress_bar.close() return pd.DataFrame(result) def gen_merge_list(self, json_data, op_name, stack_json_data): op_data = json_data['data'][op_name] - check_dump_json_str(op_data, op_name) + if self.mode_config.compared_file_type == Const.DUMP_JSON_FILE: + check_dump_json_str(op_data, op_name) op_parsed_list = read_op(op_data, op_name) if self.mode_config.stack_mode: @@ -416,8 +442,8 @@ class Match: @staticmethod def put_unmatched_in_table(match_result, npu_op_item): npu_columns = npu_op_item.index.tolist()[:-2] - new_columns = [name[:-1] + 'y' for name in npu_columns] - na_series = pd.Series([CompareConst.N_A] * len(new_columns), index=new_columns) + bench_columns = [name + '_y' for name in npu_columns] + na_series = pd.Series([CompareConst.N_A] * len(bench_columns), index=bench_columns) new_result_item = pd.concat([npu_op_item, na_series]).to_frame().T new_result_item.columns = CompareConst.MATCH_RESULT_COLUMNS match_result = pd.concat([match_result, new_result_item]) @@ -477,7 +503,6 @@ class Match: categories=op_name_order, ordered=True) match_result = match_result.sort_values(CompareConst.OP_NAME_X).reset_index(drop=True) match_result[CompareConst.OP_NAME_X] = match_result[CompareConst.OP_NAME_X].astype('object') - elif not self.mode_config.fuzzy_match: match_result = pd.merge(npu_df, bench_df, on=[CompareConst.CMP_KEY, CompareConst.CMP_SHAPE], how='outer') @@ -614,7 +639,9 @@ class CreateTable: 'md5_x': CompareConst.NPU_MD5, 'md5_y': CompareConst.BENCH_MD5, 'data_name_x': CompareConst.DATA_NAME, - 'stack_info_x': CompareConst.STACK}, inplace=True) + 'stack_info_x': CompareConst.STACK, + 'state_x': Const.STATE, + 'api_origin_name_x': Const.API_ORIGIN_NAME}, inplace=True) # process summary data npu_summary = [CompareConst.NPU_MAX, CompareConst.NPU_MIN, CompareConst.NPU_MEAN, CompareConst.NPU_NORM] @@ -627,6 +654,7 @@ class CreateTable: result[npu_summary] = result['summary_x'].apply(self.set_summary).tolist() result[bench_summary] = result['summary_y'].apply(self.set_summary).tolist() + header.extend([Const.STATE, Const.API_ORIGIN_NAME]) result_df = pd.DataFrame(columns=header) for h in 
header: if h in result.columns: @@ -677,7 +705,7 @@ class CalcStatsDiff: # 相对误差转成百分比字符串 cond_ref_err = cond_not_nan_diff & ~condition_pt_zero result_df.loc[cond_ref_err, rel_err_name] = ( - result_df.loc[cond_ref_err, diff_name] / bench_val[cond_ref_err] * 100) + result_df.loc[cond_ref_err, diff_name] / bench_val[cond_ref_err].astype(float) * 100) result_df.loc[cond_ref_err, rel_err_name] = (result_df.loc[cond_ref_err, rel_err_name].abs().astype(str) + '%') magnitude = self.get_number(result_df[diff_name]).abs() / (pd.Series( @@ -690,7 +718,11 @@ class CalcStatsDiff: result_df[condition_no_bench] = result_df[condition_no_bench].fillna(CompareConst.N_A) result_df.loc[condition_no_bench, CompareConst.ERROR_MESSAGE] = CompareConst.NO_BENCH - if self.mode_config.first_diff_analyze or self.mode_config.dump_mode == Const.SUMMARY: + if self.mode_config.dump_mode == Const.MD5: + condition_md5_equal = result_df[CompareConst.NPU_MD5] == result_df[CompareConst.BENCH_MD5] + result_df.loc[condition_md5_equal, CompareConst.RESULT] = CompareConst.PASS + result_df.loc[~condition_md5_equal & ~condition_no_bench, CompareConst.RESULT] = CompareConst.DIFF + elif self.mode_config.first_diff_analyze or self.mode_config.dump_mode == Const.SUMMARY: warning_list = [ self.calc_summary_diff(result_df, condition_no_bench, stats_index) for stats_index in ['max', 'min', 'mean', 'l2norm'] @@ -699,10 +731,6 @@ class CalcStatsDiff: result_df.loc[~condition_no_bench, [CompareConst.RESULT, CompareConst.ERROR_MESSAGE]] = '' result_df.loc[warning_flag, CompareConst.RESULT] = CompareConst.WARNING result_df.loc[warning_flag, CompareConst.ERROR_MESSAGE] = 'Need double check api accuracy.' - elif self.mode_config.dump_mode == Const.MD5: - condition_md5_equal = result_df[CompareConst.NPU_MD5] == result_df[CompareConst.BENCH_MD5] - result_df.loc[condition_md5_equal, CompareConst.RESULT] = CompareConst.PASS - result_df.loc[~condition_md5_equal & ~condition_no_bench, CompareConst.RESULT] = CompareConst.DIFF else: fill_cols = [CompareConst.COSINE, CompareConst.EUC_DIST, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, @@ -722,16 +750,19 @@ def setup_comparison(input_param, output_path, **kwargs) -> ComparisonConfig: stack_mode=False, auto_analyze=kwargs.get('auto_analyze', True), fuzzy_match=kwargs.get('fuzzy_match', False), + highlight=kwargs.get('highlight', False), data_mapping=kwargs.get('data_mapping', {}), suffix=kwargs.get('suffix', ''), cell_mapping=kwargs.get('cell_mapping', {}), api_mapping=kwargs.get('api_mapping', {}), layer_mapping=kwargs.get('layer_mapping', {}), - first_diff_analyze=kwargs.get('first_diff_analyze', False) + first_diff_analyze=kwargs.get('first_diff_analyze', False), + compared_file_type='', ) set_dump_path(input_param) config.dump_mode = get_dump_mode(input_param) + config.compared_file_type = get_file_type(input_param.get("npu_json_path", None)) # set stack_mode and set "stack_json_path" in input_param if 'stack_json_path' in input_param: @@ -739,7 +770,7 @@ def setup_comparison(input_param, output_path, **kwargs) -> ComparisonConfig: else: config.stack_mode = set_stack_json_path(input_param) - check_configuration_param(config.stack_mode, config.auto_analyze, config.fuzzy_match, + check_configuration_param(config.stack_mode, config.auto_analyze, config.fuzzy_match, config.highlight, input_param.get('is_print_compare_log', True)) create_directory(output_path) check_compare_param(input_param, output_path, config.dump_mode, config.stack_mode) diff --git 
a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py index 7df7315043cb57b057871a7d12f5aa63cf927c74..08af3aab60421c935466e0286e96cacddf1ee037 100644 --- a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py +++ b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py @@ -14,7 +14,7 @@ # limitations under the License. import json -from msprobe.core.common.file_utils import check_file_type, load_json +from msprobe.core.common.file_utils import check_file_type, load_json, check_file_or_directory_path from msprobe.core.common.const import FileCheckConst, Const from msprobe.core.common.utils import CompareException from msprobe.core.common.log import logger @@ -22,30 +22,38 @@ from msprobe.core.common.log import logger def compare_cli(args): input_param = load_json(args.input_path) + if not isinstance(input_param, dict): + logger.error("input_param should be dict, please check!") + raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) npu_path = input_param.get("npu_path", None) bench_path = input_param.get("bench_path", None) if not npu_path: - logger.error(f"Missing npu_path in configuration file {args.input_path}, please check!") + logger.error(f"Missing npu_path in input configuration file, please check!") raise CompareException(CompareException.INVALID_PATH_ERROR) if not bench_path: - logger.error(f"Missing bench_path in configuration file {args.input_path}, please check!") + logger.error(f"Missing bench_path in input configuration file, please check!") raise CompareException(CompareException.INVALID_PATH_ERROR) frame_name = args.framework auto_analyze = not args.compare_only + if frame_name == Const.PT_FRAMEWORK: from msprobe.pytorch.compare.pt_compare import compare from msprobe.pytorch.compare.distributed_compare import compare_distributed else: from msprobe.mindspore.compare.ms_compare import ms_compare from msprobe.mindspore.compare.distributed_compare import ms_compare_distributed, ms_graph_compare + from msprobe.mindspore.compare.common_dir_compare import common_dir_compare common_kwargs = { "auto_analyze": auto_analyze, "fuzzy_match": args.fuzzy_match, + "highlight": args.highlight, "data_mapping": args.data_mapping, } if check_file_type(npu_path) == FileCheckConst.FILE and check_file_type(bench_path) == FileCheckConst.FILE: + check_file_or_directory_path(npu_path) + check_file_or_directory_path(bench_path) input_param["npu_json_path"] = input_param.pop("npu_path") input_param["bench_json_path"] = input_param.pop("bench_path") if "stack_path" not in input_param: @@ -67,6 +75,8 @@ def compare_cli(args): } ms_compare(input_param, args.output_path, **kwargs) elif check_file_type(npu_path) == FileCheckConst.DIR and check_file_type(bench_path) == FileCheckConst.DIR: + check_file_or_directory_path(npu_path, isdir=True) + check_file_or_directory_path(bench_path, isdir=True) kwargs = { **common_kwargs, "stack_mode": args.stack_mode, @@ -78,6 +88,10 @@ def compare_cli(args): if input_param.get("rank_id") is not None: ms_graph_compare(input_param, args.output_path) return + common = input_param.get("common", False) + if isinstance(common, bool) and common: + common_dir_compare(input_param, args.output_path) + return if frame_name == Const.PT_FRAMEWORK: compare_distributed(npu_path, bench_path, args.output_path, **kwargs) else: diff --git a/debug/accuracy_tools/msprobe/core/compare/config.py b/debug/accuracy_tools/msprobe/core/compare/config.py index 
53fe857453d31c79776c7b1c5f55ee85b83ca426..71a512ea976c1a027e7cd21b3d0fdc64c2828542 100644 --- a/debug/accuracy_tools/msprobe/core/compare/config.py +++ b/debug/accuracy_tools/msprobe/core/compare/config.py @@ -20,13 +20,14 @@ from msprobe.core.common.file_utils import load_yaml class ModeConfig: - def __init__(self, stack_mode=False, auto_analyze=True, fuzzy_match=False, dump_mode=Const.SUMMARY, - first_diff_analyze=False): - self.stack_mode = stack_mode - self.auto_analyze = auto_analyze - self.fuzzy_match = fuzzy_match - self.dump_mode = dump_mode - self.first_diff_analyze = first_diff_analyze + def __init__(self, **kwargs): + self.stack_mode = kwargs.get('stack_mode', False) + self.auto_analyze = kwargs.get('auto_analyze', True) + self.fuzzy_match = kwargs.get('fuzzy_match', False) + self.highlight = kwargs.get('highlight', False) + self.dump_mode = kwargs.get('dump_mode', Const.SUMMARY) + self.first_diff_analyze = kwargs.get('first_diff_analyze', False) + self.compared_file_type = kwargs.get('compared_file_type', Const.DUMP_JSON_FILE) class MappingConfig: diff --git a/debug/accuracy_tools/msprobe/core/compare/diff_analyze/first_diff_analyze.py b/debug/accuracy_tools/msprobe/core/compare/diff_analyze/first_diff_analyze.py index ef2f7c5487e0e6fb9d19878b1e333e8ef077cbeb..f1192d895190dfce472bf82b6d213a2fa081d210 100644 --- a/debug/accuracy_tools/msprobe/core/compare/diff_analyze/first_diff_analyze.py +++ b/debug/accuracy_tools/msprobe/core/compare/diff_analyze/first_diff_analyze.py @@ -15,9 +15,12 @@ import os -from msprobe.core.common.utils import safe_get_value, logger, CompareException +from msprobe.core.common.const import Const, CompareConst +from msprobe.core.common.utils import logger, CompareException from msprobe.core.common.file_utils import load_yaml -from msprobe.core.compare.utils import api_batches_update, get_name_and_state +from msprobe.core.compare.config import ModeConfig +from msprobe.core.compare.utils import gen_api_batches + cur_dir = os.path.dirname(os.path.realpath(__file__)) diff_threshold_yaml_path = os.path.join(cur_dir, 'diff_analyze_threshold.yaml') @@ -26,6 +29,9 @@ cmp_metrics = thresholds.get('compare_metrics') class FirstDiffAnalyze: + def __init__(self, mode_config: ModeConfig): + self.mode_config = mode_config + @staticmethod def single_metric_diff_check(cmp_metric, metric_value): threshold = thresholds.get(cmp_metric, None) @@ -64,16 +70,21 @@ class FirstDiffAnalyze: } single_check_result['op_items'].append(op_item) - for cmp_metric in cmp_metrics: - metric_value = line[column_indices[cmp_metric]] - if self.single_metric_diff_check(cmp_metric, metric_value): + # set is_same + if self.mode_config.dump_mode == Const.MD5: + if line[column_indices[CompareConst.RESULT]] == CompareConst.DIFF: single_check_result['is_same'] = False - break + else: + for cmp_metric in cmp_metrics: + metric_value = line[column_indices[cmp_metric]] + if self.single_metric_diff_check(cmp_metric, metric_value): + single_check_result['is_same'] = False + break return single_check_result def check(self, result_df): """ - 比对后循环遍历api检查norm差异 + 比对后循环遍历api检查差异 example: { 'Functional.conv2d.0.forward': { @@ -94,15 +105,11 @@ class FirstDiffAnalyze: result = result_df.values header = result_df.columns.tolist() - api_batches = [] - for i, res_i in enumerate(result): - api_full_name = safe_get_value(res_i, 0, "res_i") - api_name, state = get_name_and_state(api_full_name) - api_batches_update(api_batches, api_name, state, i) + api_batches = gen_api_batches(result) check_result = {} for 
api_batch in api_batches: result_slice = result[api_batch.start: api_batch.params_grad_end_index] - check_result[api_batch.api_name[: -1]] = self.single_api_check(result_slice, header) + check_result[api_batch.api_name] = self.single_api_check(result_slice, header) return check_result diff --git a/debug/accuracy_tools/msprobe/core/compare/highlight.py b/debug/accuracy_tools/msprobe/core/compare/highlight.py index 560ebcdc7a5e5265ff4e869f54487b5c0b2ed83f..25eea7a5b43196819a974009cc7783f6275ea190 100644 --- a/debug/accuracy_tools/msprobe/core/compare/highlight.py +++ b/debug/accuracy_tools/msprobe/core/compare/highlight.py @@ -16,7 +16,6 @@ import abc import math import multiprocessing -import re from collections import namedtuple import numpy as np @@ -28,8 +27,8 @@ from tqdm import tqdm from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.file_utils import save_workbook from msprobe.core.common.log import logger -from msprobe.core.common.utils import get_header_index, safe_get_value -from msprobe.core.compare.utils import table_value_is_valid, get_name_and_state, CompareException, api_batches_update +from msprobe.core.common.utils import get_header_index +from msprobe.core.compare.utils import table_value_is_valid, gen_api_batches from msprobe.core.compare.config import ModeConfig @@ -218,11 +217,7 @@ class HighLight: def find_compare_result_error_rows(self, result_df, highlight_dict): """将dataframe根据API分组,并找到有误差的算子用于高亮""" result = result_df.values - api_batches = [] - for i, res_i in enumerate(result): - api_full_name = safe_get_value(res_i, 0, "res_i") - api_name, state = get_name_and_state(api_full_name) - api_batches_update(api_batches, api_name, state, i) + api_batches = gen_api_batches(result) with tqdm(total=len(api_batches), desc="API/Module Analyse Progress", unit="item", ncols=100) as progress_bar: for api_batch in api_batches: self.find_error_rows(result[api_batch.start: api_batch.params_grad_end_index], api_batch, @@ -294,28 +289,19 @@ class HighLight: self.update_highlight_err_msg(result_df, highlight_dict) # add highlight err_msg - wb = openpyxl.Workbook() - ws = wb.active - - # write header - logger.info('Initializing Excel file.') - self.handle_multi_process_malicious_value_check(self.df_malicious_value_check, result_df) + wb = openpyxl.Workbook() + ws = wb.active result_df_convert = result_df.applymap(self.compare_result_df_convert) - for row in dataframe_to_rows(result_df_convert, index=False, header=True): ws.append(row) # 对可疑数据标色 logger.info('Coloring Excel in progress.') + red_fill = PatternFill(start_color=CompareConst.RED, end_color=CompareConst.RED, fill_type="solid") + yellow_fill = PatternFill(start_color=CompareConst.YELLOW, end_color=CompareConst.YELLOW, fill_type="solid") col_len = len(result_df.columns) - red_fill = PatternFill( - start_color=CompareConst.RED, end_color=CompareConst.RED, fill_type="solid" - ) - yellow_fill = PatternFill( - start_color=CompareConst.YELLOW, end_color=CompareConst.YELLOW, fill_type="solid", - ) for i in highlight_dict.get("red_rows", []): for j in range(1, col_len + 1): ws.cell(row=i + 2, column=j).fill = red_fill # 2因为ws.cell中的row或column需要>=1,数据从第2行开始 @@ -323,7 +309,6 @@ class HighLight: for j in range(1, col_len + 1): ws.cell(row=i + 2, column=j).fill = yellow_fill - logger.info('Saving Excel file to disk: %s' % file_path) save_workbook(wb, file_path) def handle_multi_process_malicious_value_check(self, func, result_df): diff --git 
a/debug/accuracy_tools/msprobe/core/compare/layer_mapping/layer_mapping.py b/debug/accuracy_tools/msprobe/core/compare/layer_mapping/layer_mapping.py index 4845adb0482b1a6cca988e876a1315e56589e87a..91927f963a9170bd3ee218ff04f6302f01d9ee7c 100644 --- a/debug/accuracy_tools/msprobe/core/compare/layer_mapping/layer_mapping.py +++ b/debug/accuracy_tools/msprobe/core/compare/layer_mapping/layer_mapping.py @@ -208,7 +208,8 @@ def generate_data_mapping(npu_json_path, bench_json_path, api_mapping, output_pa def read_full_op_names(data, op_name): op_parsed_list = read_op(data.get(op_name, {}), op_name) full_op_names = [op_parsed.get('full_op_name') for op_parsed in op_parsed_list] - return full_op_names + states = [op_parsed.get(Const.STATE) for op_parsed in op_parsed_list] + return full_op_names, states def generate_op_data_mapping(npu_op_name, npu_full_op_names, bench_op_name, bench_full_op_names): suffix_to_full_op_name = {} @@ -228,10 +229,10 @@ def generate_data_mapping(npu_json_path, bench_json_path, api_mapping, output_pa for npu_op_name, bench_op_name in api_mapping.items(): if not npu_op_name: continue - npu_full_op_names = read_full_op_names(npu_data, npu_op_name) - bench_full_op_names = read_full_op_names(bench_data, bench_op_name) - npu_full_op_names_reorder = reorder_op_name_list(npu_full_op_names) - bench_full_op_names_reorder = reorder_op_name_list(bench_full_op_names) + npu_full_op_names, npu_states = read_full_op_names(npu_data, npu_op_name) + bench_full_op_names, bench_states = read_full_op_names(bench_data, bench_op_name) + npu_full_op_names_reorder, _ = reorder_op_name_list(npu_full_op_names, npu_states) + bench_full_op_names_reorder, _ = reorder_op_name_list(bench_full_op_names, bench_states) mapping = generate_op_data_mapping(npu_op_name, npu_full_op_names_reorder, bench_op_name, bench_full_op_names_reorder) data_mapping.update(mapping) diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py index 8bba8c9aa0a34d0701308ec3c77ad26a5c13411b..19c66e83e98c683db970397c73305c0b8ea530e2 100644 --- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py @@ -94,8 +94,8 @@ class CompareRealData: @staticmethod def read_dump_data(result_df): try: - npu_dump_name_list = result_df.iloc[0:, 0].tolist() - dump_tensor_pair_list = result_df.iloc[0:, -1].tolist() + npu_dump_name_list = result_df.loc[0:, CompareConst.NPU_NAME].tolist() + dump_tensor_pair_list = result_df.loc[0:, CompareConst.DATA_NAME].tolist() op_name_mapping_dict = {} for index, npu_dump_name in enumerate(npu_dump_name_list): dump_tensor_pair = dump_tensor_pair_list[index] @@ -104,9 +104,9 @@ class CompareRealData: except ValueError as e: logger.error('result dataframe is not found.') raise CompareException(CompareException.INVALID_DATA_ERROR) from e - except IndexError as e: + except KeyError as e: logger.error('result dataframe elements can not be access.') - raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e + raise CompareException(CompareException.INVALID_KEY_ERROR) from e @staticmethod def _save_cmp_result(offset, result: ComparisonResult, result_df, lock): diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index 4da1adfa0908b66679b11f9f3ffadaaf237c1177..c0b0783da9ba6e83a51c5eb2e6224cef00b0a8b1 100644 --- 
a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -26,28 +26,39 @@ from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value from msprobe.core.common.file_utils import check_file_or_directory_path +json_file_mapping = { + Const.DUMP_JSON_FILE: "dump.json", + Const.DEBUG_JSON_FILE: "debug.json", + Const.STACK_JSON_FILE: "stack.json" +} -def extract_json(dirname, stack_json=False): + +def extract_json(dirname, json_file_type): json_path = '' for filename in os.listdir(dirname): - target_file_name = 'stack.json' if stack_json else 'dump.json' + target_file_name = json_file_mapping.get(json_file_type) + if target_file_name is None: + logger.error(f'extract_json failed, invalid json_file_type: {json_file_type}.') + raise CompareException(CompareException.INVALID_KEY_ERROR) if filename == target_file_name: json_path = os.path.join(dirname, filename) break # Provide robustness on invalid directory inputs if not json_path: - if stack_json: + if json_file_type == Const.STACK_JSON_FILE: logger.warning(f'stack.json is not found in dump dir {dirname}.') - else: + elif json_file_type == Const.DUMP_JSON_FILE: logger.error(f'dump.json is not found in dump dir {dirname}.') - raise CompareException(CompareException.NO_DUMP_FILE_ERROR) + elif json_file_type == Const.DEBUG_JSON_FILE: + logger.warning(f'debug.json is not found in dump dir {dirname}.') + return json_path def set_stack_json_path(input_param): npu_data_dir = os.path.dirname(input_param.get("npu_json_path")) - stack_path = extract_json(npu_data_dir, stack_json=True) + stack_path = extract_json(npu_data_dir, json_file_type=Const.STACK_JSON_FILE) input_param["stack_json_path"] = stack_path if stack_path else None return bool(stack_path) @@ -83,17 +94,25 @@ def check_and_return_dir_contents(dump_dir, prefix): def read_op(op_data, op_name): - if Const.PARAMS_GRAD in op_name.split(Const.SEP): - op_parsed_list = op_item_parse(op_data, op_name) + if not isinstance(op_name, str): + logger.error(f"api name error: {op_name} is not a string, please check.") + raise CompareException(CompareException.INVALID_API_NAME_ERROR) + split_name = op_name.split(Const.SEP) + if split_name[-1] == Const.DEBUG: + op_parsed_list = op_item_parse(op_data, op_name, Const.DEBUG) + elif split_name[-1] == Const.PARAMS_GRAD: + op_parsed_list = op_item_parse(op_data, op_name, Const.PARAMS_GRAD) else: op_parsed_list = [] for name in CompareConst.IO_NAME_MAPPING: if name in op_data: - op_parsed_list.extend(op_item_parse(op_data[name], op_name + CompareConst.IO_NAME_MAPPING[name])) + op_parsed_list.extend(op_item_parse(op_data[name], op_name + CompareConst.IO_NAME_MAPPING[name], name)) return op_parsed_list -def op_item_parse(op_data, op_name: str, depth: int = 0) -> list: +def op_item_parse(op_data, op_name: str, state: str, depth: int = 0) -> list: + if state == Const.INPUT_ARGS or state == Const.INPUT_KWARGS: + state = Const.INPUT default_item = { 'full_op_name': op_name, 'type': None, @@ -105,7 +124,8 @@ def op_item_parse(op_data, op_name: str, depth: int = 0) -> list: 'shape': None, 'md5': None, 'value': None, - 'data_name': '-1' + 'data_name': '-1', + 'state': state } if depth > Const.MAX_DEPTH: @@ -121,14 +141,14 @@ def op_item_parse(op_data, op_name: str, depth: int = 0) -> list: if isinstance(op_data, list): for i, data in enumerate(op_data): if Const.PARAMS_GRAD not in 
op_name.split(Const.SEP): - item_list.extend(op_item_parse(data, op_name + Const.SEP + str(i), depth + 1)) + item_list.extend(op_item_parse(data, op_name + Const.SEP + str(i), state, depth + 1)) else: - item_list.extend(op_item_parse(data, op_name, depth + 1)) + item_list.extend(op_item_parse(data, op_name, state, depth + 1)) elif isinstance(op_data, dict): if is_leaf_data(op_data): - return [gen_op_item(op_data, op_name)] + return [gen_op_item(op_data, op_name, state)] for sub_name, sub_data in op_data.items(): - item_list.extend(op_item_parse(sub_data, op_name + Const.SEP + str(sub_name), depth + 1)) + item_list.extend(op_item_parse(sub_data, op_name + Const.SEP + str(sub_name), state, depth + 1)) return item_list @@ -136,18 +156,21 @@ def is_leaf_data(op_data): return 'type' in op_data and isinstance(op_data['type'], str) -def gen_op_item(op_data, op_name): +def gen_op_item(op_data, op_name, state): op_item = {} - op_item.update(op_data) - data_name = op_data.get('data_name') if op_data.get('data_name') else '-1' # 如果是""也返回-1 - op_item['data_name'] = data_name + op_item.update({key: str(value) if isinstance(value, bool) else value for key, value in op_data.items()}) + data_name = op_data.get(Const.DATA_NAME) if op_data.get(Const.DATA_NAME) else '-1' # 如果是""也返回-1 + op_item[Const.DATA_NAME] = data_name op_item['full_op_name'] = data_name.rsplit(Const.SEP, 1)[0] if data_name != '-1' else op_name + op_item[Const.STATE] = state - params = ['Max', 'Min', 'Mean', 'Norm'] + # 补齐统计量字段 + params = [Const.MAX, Const.MIN, Const.MEAN, Const.NORM] for i in params: if i not in op_item: op_item[i] = None + # special cases if not op_item.get('dtype'): if op_item.get('type') == 'torch.Size': op_item['dtype'] = op_data.get('type') @@ -160,11 +183,18 @@ def gen_op_item(op_data, op_name): op_item['shape'] = '[]' for i in params: op_item[i] = op_data.get('value') + elif op_name.split(Const.SEP)[-1] in ['src', 'dst', 'group_src', 'group_dst']: + op_item['dtype'] = op_data.get('type') + op_item['shape'] = '[]' + for i in params: + op_item[i] = str(op_data.get('value')) + op_item['md5'] = str(op_data.get('value')) elif op_item.get('type') == 'torch.ProcessGroup': op_item['dtype'] = op_data.get('type') op_item['shape'] = '[]' for i in params: op_item[i] = str(op_data.get('group_ranks')) + op_item['md5'] = str(op_data.get('group_ranks')) else: op_item['dtype'] = str(type(op_data.get('value'))) op_item['shape'] = '[]' @@ -191,8 +221,10 @@ def merge_tensor(tensor_list, dump_mode): CompareConst.OUTPUT_STRUCT, CompareConst.PARAMS_STRUCT, CompareConst.PARAMS_GRAD_STRUCT, + CompareConst.DEBUG_STRUCT, Const.SUMMARY, - Const.STACK_INFO + Const.STACK_INFO, + Const.STATE ] op_dict = {key: [] for key in keys} @@ -202,12 +234,13 @@ def merge_tensor(tensor_list, dump_mode): for tensor in tensor_list: # A dict(len=2) with 'full_op_name' and 'full_info' is added to the tensor only if self.stack_mode is True if len(tensor) == 2: - op_dict[Const.STACK_INFO].append(tensor['full_info']) + op_dict[Const.STACK_INFO].append(tensor.get('full_info')) break - op_dict[CompareConst.OP_NAME].append(tensor['full_op_name']) + op_dict[CompareConst.OP_NAME].append(tensor.get('full_op_name')) + state = tensor.get(Const.STATE) + op_dict[Const.STATE].append(state) - _, state = get_name_and_state(tensor['full_op_name']) struct_key = CompareConst.STATE_TO_STRUCT_MAPPING.get(state) if not struct_key: continue @@ -308,78 +341,54 @@ def api_batches_update(api_batches, api_name, state, index): api_batches.append(ApiBatch(api_name, index)) -def 
get_name_and_state(name): - """ - Get api/module name and state - example: - name = 'conv2d.forward.1.input.0' - return: ('conv2d.forward.1.', 'input') - - name = 'Functional.pad.0.backward.output.0' - return: ('Functional.pad.0.backward.', 'output') - - state type: input, output, kwargs, parameters, parameters_grad - """ - if not isinstance(name, str): - logger.error(f'Invalid name: {name}, type should be string, please check.') - raise CompareException(CompareException.INVALID_API_NAME_ERROR) - - if Const.PARAMS_GRAD in name.split(Const.SEP): - return name.split(Const.PARAMS_GRAD)[0], Const.PARAMS_GRAD - - split = re.split(Const.REGEX_FORWARD_BACKWARD, name) - if len(split) < 3: - logger.error(f'Invalid name string: {name}, can not be split by forward/backward, please check.') - raise CompareException(CompareException.INVALID_API_NAME_ERROR) - api = f'{split[0]}.{split[1]}.' - state_str = split[2] - match = re.match(r'^(\d+\.)?(input|output|kwargs|parameters)\..+$', state_str) - if not match: - raise CompareException(f'Invalid name string: {name}') - if match.group(1): - api = f'{api}{match.group(1)}' - state = match.group(2) - return api, state - - -def reorder_op_name_list(op_name_list): +def reorder_op_name_list(op_name_list, state_list): if not op_name_list: - return op_name_list + return op_name_list, state_list parameters = [] output = [] parameters_grad = [] others = [] - for x in op_name_list: - state = get_name_and_state(x)[1] + parameters_s = [] + output_s = [] + parameters_grad_s = [] + others_s = [] + for op_name, state in zip(op_name_list, state_list): if state == Const.PARAMS: - parameters.append(x) + parameters.append(op_name) + parameters_s.append(state) elif state == Const.OUTPUT: - output.append(x) + output.append(op_name) + output_s.append(state) elif state == Const.PARAMS_GRAD: - parameters_grad.append(x) + parameters_grad.append(op_name) + parameters_grad_s.append(state) else: - others.append(x) + others.append(op_name) + others_s.append(state) # 合并others, parameters, 和output,确保parameters排在output前面 op_name_reorder = others + parameters + output + parameters_grad - return op_name_reorder + state_reorder = others_s + parameters_s + output_s + parameters_grad_s + return op_name_reorder, state_reorder -def reorder_op_x_list(op_name_list, summary_list, data_name_list): - """对op_name, summary, data_name重新排序,把parameters放到input后output前,data_name由于统计量比对时,为None,单独处理""" +def reorder_op_x_list(op_name_list, summary_list, data_name_list, state_list): + """ + 对op_name, summary, data_name, state重新排序,把parameters放到input后output前,data_name由于统计量比对时,为None,单独处理 + """ if not op_name_list or not summary_list: - return op_name_list, summary_list, data_name_list + return op_name_list, summary_list, data_name_list, state_list index_map = {name: index for index, name in enumerate(op_name_list)} - op_name_reorder = reorder_op_name_list(op_name_list) + op_name_reorder, state_order = reorder_op_name_list(op_name_list, state_list) summary_reorder = [summary_list[index_map.get(name)] for name in op_name_reorder] if data_name_list: data_name_reorder = [data_name_list[index_map.get(name)] for name in op_name_reorder] else: data_name_reorder = data_name_list - return op_name_reorder, summary_reorder, data_name_reorder + return op_name_reorder, summary_reorder, data_name_reorder, state_order def process_summary_data(summary_data): @@ -586,6 +595,15 @@ def make_result_table(result, dump_mode, stack_mode): return result_df +def gen_api_batches(result: np.ndarray): + api_batches = [] + for i, res_i in 
enumerate(result): + api_name = safe_get_value(res_i, -1, "res_i") # 内部定义倒数第一个元素必是api_origin_name + state = safe_get_value(res_i, -2, "res_i") # 内部定义倒数第二个元素必是state + api_batches_update(api_batches, api_name, state, i) + return api_batches + + def _compare_parser(parser): parser.add_argument("-i", "--input_path", dest="input_path", type=str, help=" The compare input path, a dict json.", required=True) @@ -598,6 +616,8 @@ def _compare_parser(parser): help=" Whether to give advisor.", required=False) parser.add_argument("-f", "--fuzzy_match", dest="fuzzy_match", action="store_true", help=" Whether to perform a fuzzy match on the api name.", required=False) + parser.add_argument("-hl", "--highlight", dest="highlight", action="store_true", + help=" Whether to set result highlighting.", required=False) parser.add_argument("-cm", "--cell_mapping", dest="cell_mapping", type=str, nargs='?', const=True, help=" The cell mapping file path.", required=False) parser.add_argument("-am", "--api_mapping", dest="api_mapping", type=str, nargs='?', const=True, @@ -606,3 +626,43 @@ def _compare_parser(parser): help=" The data mapping file path.", required=False) parser.add_argument("-lm", "--layer_mapping", dest="layer_mapping", type=str, nargs='?', const=True, help=" The layer mapping file path.", required=False) + + +def compare_distributed_inner(npu_dump_dir, bench_dump_dir, output_path, compare_func, **kwargs): + if not isinstance(kwargs.get('first_diff_analyze', False), bool): + logger.error('kwargs: first_diff_analyze should be bool, please check!') + raise CompareException(CompareException.INVALID_PARAM_ERROR) + if kwargs.get('suffix'): + logger.error("Argument 'suffix' is not supported for compare_distributed.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + is_print_compare_log = kwargs.get('is_print_compare_log', True) + # get the ranks and match by order + npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank')) + bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank')) + if len(npu_ranks) != len(bench_ranks): + logger.error('The number of ranks in the two runs are different. ' + 'Unable to match the ranks. Please use another folder to compare ' + 'or use compare() api and manually match the ranks.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + for nr, br in zip(npu_ranks, bench_ranks): + npu_data_dir = os.path.join(npu_dump_dir, nr) + bench_data_dir = os.path.join(bench_dump_dir, br) + for file_type in [Const.DUMP_JSON_FILE, Const.DEBUG_JSON_FILE]: + npu_path = extract_json(npu_data_dir, file_type) + bench_path = extract_json(bench_data_dir, file_type) + if npu_path == "" or bench_path == "": + logger.debug(f'Did not find paired {file_type} in {nr} and {br},' + ' skip comparing.') + continue + dump_result_param = { + 'npu_json_path': npu_path, + 'bench_json_path': bench_path, + 'is_print_compare_log': is_print_compare_log + } + try: + compare_func(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}', **kwargs) + except CompareException as e: + if e.code == CompareException.INVALID_DATA_ERROR: + logger.error(f"Invalid or missing 'data' in dump.json. Skipping {nr} comparison.") + if e.code == CompareException.INVALID_TASK_ERROR: + logger.error(f"Invalid or missing 'task' in dump.json. 
Skipping {nr} comparison.") diff --git a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md index 504e21b4b9190ea7766fee73bbbadba9a071760b..b4a0eb6ae4e603518e516390f1aecfe5283850dc 100644 --- a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md @@ -51,15 +51,16 @@ msprobe -f pytorch compare -i ./compare.json -o ./output -s 完整参数说明: -| 参数名 | 说明 | 是否必选 | -|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | -| -f 或 --framework | 指定训练框架。pytorch。 | 是 | -| -i 或 --input_path | 指定[比对文件](#214-比对文件),str 类型。 | 是 | -| -o 或 --output_path | 配置比对结果文件存盘目录,str 类型,默认在当前目录创建output目录。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.xlsx`。
提示:output目录下与结果件同名文件将被删除覆盖。 | 否 | -| -s 或 --stack_mode | 比对结果展示调用栈信息(NPU_Stack_Info)的开关,bool 类型。单卡场景开启时,根据[比对文件](#214-比对文件)的参数说明配置stack_path;多卡场景开启时,自动识别npu_dump目录下stack.json文件,如存在生成详细调用栈信息,否则不生成,此参数不生效。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 | +| 参数名 | 说明 | 是否必选 | +|---------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | +| -f 或 --framework | 指定训练框架。pytorch。 | 是 | +| -i 或 --input_path | 指定[比对文件](#51-比对文件),str 类型。 | 是 | +| -o 或 --output_path | 配置比对结果文件存盘目录,str 类型,默认在当前目录创建output目录。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.xlsx`。
提示:output目录下与结果件同名文件将被删除覆盖。 | 否 | +| -s 或 --stack_mode | 比对结果展示调用栈信息(NPU_Stack_Info)的开关,bool 类型。单卡场景开启时,根据[比对文件](#51-比对文件)的参数说明配置stack_path;多卡场景开启时,自动识别npu_dump目录下stack.json文件,如存在生成详细调用栈信息,否则不生成,此参数不生效。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 | | -c 或 --compare_only | 仅比对开关,bool 类型。该参数默认未配置,会启用自动精度分析,工具自动针对比对结果进行分析,识别到第一个精度可能不达标节点(在比对结果文件中的 Accuracy Reached or Not 列显示为 No),并给出问题可能产生的原因(打屏展示并生成 `advisor_{timestamp}.txt` 文件)。通过配置该参数取消自动精度分析,仅输出比对结果表格。 | 否 | -| -f 或 --fuzzy_match | 模糊匹配,bool 类型。开启后,对于网络中同一层级且命名仅调用次数不同的 API,可匹配并进行比对。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 | -| -dm或--data_mapping | 自定义映射关系比对。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件](#215-自定义映射文件)。仅[API和模块无法自动匹配场景](#213-api和模块无法自动匹配场景)需要配置。仅支持逐卡比对,即使用[比对文件](#214-比对文件)的单卡场景示例。 | 否 | +| -f 或 --fuzzy_match | 模糊匹配,bool 类型。开启后,对于网络中同一层级且命名仅调用次数不同的 API,可匹配并进行比对。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 | +| -hl 或 --highlight | 高亮颜色标记。开启后,比对结果件中通过红色或黄色标记精度可疑API或模块。通过直接配置该参数开启,默认未配置,表示关闭。 开启高亮颜色标记后,比对性能降低,如果比对结果行数超出excel单页限制,程序强制关闭高亮颜色标记。 | 否 | +| -dm或--data_mapping | 自定义映射关系比对。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件](#52-自定义映射文件)。仅[API和模块无法自动匹配场景](#213-api和模块无法自动匹配场景)需要配置。仅支持逐卡比对,即使用[比对文件](#51-比对文件)的单卡场景示例。 | 否 | #### 2.1.2 整网比对场景 @@ -69,7 +70,7 @@ msprobe -f pytorch compare -i ./compare.json -o ./output -s 1. 参见 [PyTorch 场景下的数据采集](./05.data_dump_PyTorch.md)章节完成 CPU 或 GPU 与 NPU 的精度数据 dump。 -2. 创建[比对文件](#214-比对文件)。 +2. 创建[比对文件](#51-比对文件)。 3. 运行命令: @@ -87,7 +88,7 @@ msprobe -f pytorch compare -i ./compare.json -o ./output -s 2. 参见[PyTorch 场景下的数据采集](./05.data_dump_PyTorch.md)章节完成 CPU 或 GPU 与 NPU 的精度数据 dump。 -3. 创建[比对文件](#214-比对文件)(单卡场景示例)。 +3. 创建[比对文件](#51-比对文件)(单卡场景示例)。 4. 运行命令: @@ -95,174 +96,30 @@ msprobe -f pytorch compare -i ./compare.json -o ./output -s msprobe -f pytorch compare -i ./compare.json -o ./output -s -dm data_mapping.yaml ``` - data_mapping.yaml文件配置请参见[自定义映射文件](#215-自定义映射文件)。 + data_mapping.yaml文件配置请参见[自定义映射文件](#52-自定义映射文件)。 该场景不支持-f模糊匹配。 5. 
查看比对结果,请参见 [3 精度比对结果分析](#3-精度比对结果分析)。 -#### 2.1.4 比对文件 - 以在当前目录创建 ./compare.json 为例。 - - - 单卡场景示例: - - ```json - { - "npu_path": "./npu_dump/dump.json", - "bench_path": "./bench_dump/dump.json", - "stack_path": "./npu_dump/stack.json", - "is_print_compare_log": true - } - ``` - - - 多卡场景示例: - - ```json - { - "npu_path": "./npu_dump/step0", # 需填写到step层级(rank的上一层级) - "bench_path": "./bench_dump/step0", # 需填写到step层级(rank的上一层级) - "is_print_compare_log": true - } - ``` - -**参数说明**: - -| 参数名 | 说明 | 是否必选 | -| -------------------- |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------| -| npu_path | 配置 NPU 环境下的 dump.json 文件(单卡场景)或 dump 目录(多卡场景),str 类型。 | 是 | -| bench_path | 配置 CPU、GPU 或 NPU 环境下的 dump.json 文件(单卡场景)或 dump 目录(多卡场景),str 类型。 | 是 | -| stack_path | 配置 NPU dump 目录下的 stack.json 文件,str 类型。如果没有配置stack_path,命令行-s参数不生效,程序自动识别是否存在stack.json文件,如存在,则比对结果中呈现NPU_Stack_Info,如不存在,则不呈现。如果配置了stack_path,比对结果中是否呈现NPU_Stack_Info则通过命令行参数-s来控制。 | 否 | -| is_print_compare_log | 配置是否开启单个算子的日志打屏。可取值 true 或 false,默认为 true。关闭后则只输出常规日志,bool 类型。 | 否 | - -#### 2.1.5 自定义映射文件 - -文件名格式:*.yaml,*为文件名,可自定义。 - -文件内容格式: - -```yaml -# API -{api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号}: {api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号} -# 模块 -{Module}.{module_name}.{前向反向}.{index}.{input/output}.{参数序号}: {Module}.{module_name}.{前向反向}.{index}.{input/output}.{参数序号} -``` - -冒号左侧和右侧分别为PyTorch框架不同版本或不同芯片环境的API的名称和module模块名称。 - -API和模块名称请从《[PyTorch 场景的精度数据采集](05.data_dump_PyTorch.md)》中的dump.json文件获取。 - -文件内容示例: +#### 2.1.4 单点数据比对场景 -```yaml -# API -NPU.npu_fusion_attention.4.forward.input.0: NPU.npu_fusion_attention.4.forward.input.0 -# 模块 -Module.module.language_model.embedding.word_embedding.VocabParallelEmbedding.forward.0.input.0: Module.module.language_model.embedding.word_embedding.VocabParallelEmbedding.forward.0.input.0 -``` +单点数据比对场景是指:CPU 或 GPU 与 NPU环境的网络中单点保存的数据比对。 -当dump.json文件中存在“data_name”字段时,API和模块名称为data_name字段去掉文件后缀,如下图红框处所示: +支持单卡和多卡,可同时比对多卡的单点数据。多机场景需要每个设备单独执行比对操作。 -![pt_dump](./img/pt_dump.png) +1. 参见 [单点保存工具](./28.debugger_save_instruction.md)章节完成 CPU 或 GPU 与 NPU 的单点数据采集。 -当dump.json文件中不存在“data_name”字段时,名称的拼写规则如下: +2. 创建[比对文件(单点数据)](#53-比对文件单点数据)。 -input_args、input_kwargs和output使用统一的命名规则,当值是list类型时,名称后面添加'.{index}',当值类型是dict类型时,名称后面加'.{key}',当值类型是具体Tensor或null或int或float或bool或空list/dict等时,命名结束。 +3. 
运行命令: -以下面api的dump文件为例: -```yaml - "Functional.max_pool2d.0.forward": { - "input_args": [ - { - "type": "torch.Tensor", - "dytpe": "torch_float32", - "shape": [ - 1, - 64, - 14, - 14 - ], - "Max": xxx, - "Min": xxx, - "Mean": xxx, - "Norm": xxx, - "requires_grad": true - }, - { - "type": "int", - "value": 3 - }, - { - "type": "int", - "value": 2 - }, - { - "type": "int", - "value": 1 - }, - { - "type": "int", - "value": 1 - } - ], - "input_kwargs": { - "ceil_mode": { - "type": "bool", - "value": false - }, - "return_indices": { - "type": "bool", - "value": false - }, - }, - "output": [ - { - "type": "torch.Tensor", - "dtype": "torch.float32", - "shape": [ - 1, - 64, - 7, - 7 - ], - "Max": xxx, - "Min": xxx, - "Mean": xxx, - "Norm": xxx, - "requires_grad": true - } - ] - } -``` + ```shell + msprobe -f pytorch compare -i ./compare.json -o ./output + ``` -初始名称为Functional.max_pool2d.0.forward,input_args是list,长度为5,第0项后面是Tensor,命名结束;第1-4项后面均是int,命名结束;按照顺序命名为 -``` -Functional.max_pool2d.0.forward.input.0 -Functional.max_pool2d.0.forward.input.1 -Functional.max_pool2d.0.forward.input.2 -Functional.max_pool2d.0.forward.input.3 -Functional.max_pool2d.0.forward.input.4 -``` -input_kwargs是dict,key是ceil_mode、return_indices,值均是bool,命名结束;命名为 -``` -Functional.max_pool2d.0.forward.input.ceil_mode -Functional.max_pool2d.0.forward.input.return_indices -``` -output是list,长度为1,第0项后面是Tensor,命名结束;按照顺序命名为 -``` -Functional.max_pool2d.0.forward.output.0 -``` -综上,生成的的op_name为 -``` -Functional.max_pool2d.0.forward.input.0 -Functional.max_pool2d.0.forward.input.1 -Functional.max_pool2d.0.forward.input.2 -Functional.max_pool2d.0.forward.input.3 -Functional.max_pool2d.0.forward.input.4 -Functional.max_pool2d.0.forward.input.ceil_mode -Functional.max_pool2d.0.forward.input.return_indices -Functional.max_pool2d.0.forward.output.0 -``` +4. 查看比对结果,请参见 [3 精度比对结果分析](#3-精度比对结果分析)。 ### 2.2 比对函数方式 @@ -278,13 +135,14 @@ compare(input_param, output_path, stack_mode=False, auto_analyze=True, fuzzy_mat **参数说明**: -| 参数名 | 说明 | 是否必选 | -| ------------ |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | +| 参数名 | 说明 | 是否必选 | +|--------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | | input_param | 配置 dump 数据文件及目录,dict 类型。配置参数包括:
"npu_json_path":指定 NPU dump 目录下的 dump.json 文件。
**配置示例**:"npu_json_path": "./npu_dump/dump.json"。
"bench_json_path":指定 CPU、GPU 或 NPU dump 目录下的 dump.json 文件。
**配置示例**:"bench_json_path": "./bench_dump/dump.json"。
"stack_json_path":指定 NPU dump 目录下的 stack.json 文件。
**配置示例**:"stack_json_path": "./npu_dump/stack.json"。
"is_print_compare_log":配置是否开启单个算子的日志打屏。
**配置示例**:True 或 False。 | 是 | -| output_path | 配置比对结果文件存盘目录,str 类型。
**配置示例**:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.xlsx`。
提示:output目录下与结果件同名文件将被删除覆盖。 | 是 | -| stack_mode | 配置 stack_mode 的开关,bool 类型。仅当配置 stack_json_path 时需要,开启时比对结果呈现NPU_Stack_Info,关闭时不呈现。当不配置stack_json_path 时,自动识别是否存在stack.json,存在时呈现NPU_Stack_Info,否则不呈现。
**配置示例**:stack_mode=True,默认为 False。 | 否 | -| auto_analyze | 自动精度分析,bool 类型。开启后工具自动针对比对结果进行分析,识别到第一个精度可能不达标节点(在比对结果文件中的 Accuracy Reached or Not 列显示为 No),并给出问题可能产生的原因(打屏展示并生成 advisor_{timestamp}.txt 文件)。
**配置示例**:auto_analyze=False,默认为 True。 | 否 | -| fuzzy_match | 模糊匹配,bool 类型。开启后,对于网络中同一层级且命名仅调用次数不同的 API,可匹配并进行比对。
**配置示例**:fuzzy_match=True,默认为 False。 | 否 | +| output_path | 配置比对结果文件存盘目录,str 类型。
**配置示例**:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.xlsx`。
提示:output目录下与结果件同名文件将被删除覆盖。 | 是 | +| stack_mode | 配置 stack_mode 的开关,bool 类型。仅当配置 stack_json_path 时需要,开启时比对结果呈现NPU_Stack_Info,关闭时不呈现。当不配置stack_json_path 时,自动识别是否存在stack.json,存在时呈现NPU_Stack_Info,否则不呈现。
**配置示例**:stack_mode=True,默认为 False。 | 否 | +| auto_analyze | 自动精度分析,bool 类型。开启后工具自动针对比对结果进行分析,识别到第一个精度可能不达标节点(在比对结果文件中的 Accuracy Reached or Not 列显示为 No),并给出问题可能产生的原因(打屏展示并生成 advisor_{timestamp}.txt 文件)。
**配置示例**:auto_analyze=False,默认为 True。 | 否 | +| fuzzy_match | 模糊匹配,bool 类型。开启后,对于网络中同一层级且命名仅调用次数不同的 API,可匹配并进行比对。
**配置示例**:fuzzy_match=True,默认为 False。 | 否 | +| highlight | 高亮颜色标记,bool 类型。开启后,比对结果件中通过红色或黄色标记精度可疑的 API 或模块。开启高亮颜色标记后,比对性能会降低;如果比对结果行数超出 Excel 单页限制,程序将强制关闭高亮颜色标记。<br>
**配置示例**:highlight=True,默认为 False。 | 否 | **函数示例**: @@ -365,6 +223,7 @@ PyTorch 精度比对是以 CPU 或 GPU 的计算结果为标杆,通过计算 ### 3.2 颜色标记——真实数据模式、统计数据模式 +通过在命令行中配置-hl或--highlight开启,或者在比对函数中配置参数highlight=True开启,用于标记精度可疑API或模块。开启后,比对性能会有降低,建议比对较大dump.json文件时不配置此参数。 在比对结果中的Err_message列呈现比对结果颜色标记的原因,具体含义如下: 红色标记情况: @@ -437,7 +296,7 @@ MD5 模式: 4. MaxRelativeErr:当最大相对误差越接近 0 表示其计算的误差越小。 - 当 dump 数据中存在 0 或 Nan 时,比对结果中最大相对误差则出现 inf 或 Nan 的情况,属于正常现象。 + 当 dump 数据中存在 0 或 nan 时,比对结果中最大相对误差则出现 inf 或 nan 的情况,属于正常现象。 5. One Thousandth Err Ratio(相对误差小于千分之一的元素比例)、Five Thousandths Err Ratio(相对误差小于千分之五的元素比例)精度指标:是指 NPU 的 Tensor 中的元素逐个与对应的标杆数据对比,相对误差小于千分之一、千分之五的比例占总元素个数的比例。该数据仅作为精度下降趋势的参考,并不参与计算精度是否通过的判定。 @@ -516,4 +375,189 @@ compare_index: 6. Distributed.broadcast:输入为要广播的数据,输出为广播后的数据。 7. Distributed.isend:点对点通信,输入为要发送的数据,输出为发送的数据。 8. Distributed.irecv:点对点通信,输入为原数据,输出为接收的新数据。 -9. Distributed.all_to_all_single:输出数据为所有卡上的数据切分后合并的结果。 \ No newline at end of file +9. Distributed.all_to_all_single:输出数据为所有卡上的数据切分后合并的结果。 + +## 5 附录 + +### 5.1 比对文件 + + 以在当前目录创建 ./compare.json 为例。 + + - 单卡场景示例: + + ```json + { + "npu_path": "./npu_dump/dump.json", + "bench_path": "./bench_dump/dump.json", + "stack_path": "./npu_dump/stack.json", + "is_print_compare_log": true + } + ``` + + - 多卡场景示例: + + ```json + { + "npu_path": "./npu_dump/step0", # 需填写到step层级(rank的上一层级) + "bench_path": "./bench_dump/step0", # 需填写到step层级(rank的上一层级) + "is_print_compare_log": true + } + ``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------| +| npu_path | 配置NPU环境下的dump.json文件(单卡场景)或dump目录(多卡场景)。数据类型:str。 | 是 | +| bench_path | 配置CPU、GPU或NPU环境下的dump.json文件(单卡场景)或dump目录(多卡场景)。数据类型:str。 | 是 | +| stack_path | 配置NPU dump目录下的stack.json文件。数据类型:str。如果没有配置stack_path,命令行-s参数不生效,程序自动识别是否存在stack.json文件,如存在,则比对结果中呈现NPU_Stack_Info,如不存在,则不呈现。如果配置了stack_path,比对结果中是否呈现NPU_Stack_Info则通过命令行参数-s来控制。 | 否 | +| is_print_compare_log | 配置是否开启单个算子的日志打屏。可取值true或false,默认为true。关闭后则只输出常规日志。数据类型:bool。 | 否 | + +### 5.2 自定义映射文件 + +文件名格式:*.yaml,*为文件名,可自定义。 + +文件内容格式: + +```yaml +# API +{api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号}: {api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号} +# 模块 +{Module}.{module_name}.{前向反向}.{index}.{input/output}.{参数序号}: {Module}.{module_name}.{前向反向}.{index}.{input/output}.{参数序号} +``` + +冒号左侧和右侧分别为PyTorch框架不同版本或不同芯片环境的API的名称和module模块名称。 + +API和模块名称请从《[PyTorch 场景的精度数据采集](05.data_dump_PyTorch.md)》中的dump.json文件获取。 + +文件内容示例: + +```yaml +# API +NPU.npu_fusion_attention.4.forward.input.0: NPU.npu_fusion_attention.4.forward.input.0 +# 模块 +Module.module.language_model.embedding.word_embedding.VocabParallelEmbedding.forward.0.input.0: Module.module.language_model.embedding.word_embedding.VocabParallelEmbedding.forward.0.input.0 +``` + +当dump.json文件中存在“data_name”字段时,API和模块名称为data_name字段去掉文件后缀,如下图红框处所示: + +![pt_dump](./img/pt_dump.png) + +当dump.json文件中不存在“data_name”字段时,名称的拼写规则如下: + +input_args、input_kwargs和output使用统一的命名规则,当值是list类型时,名称后面添加'.{index}',当值类型是dict类型时,名称后面加'.{key}',当值类型是具体Tensor或null或int或float或bool或空list/dict等时,命名结束。 + +以下面api的dump文件为例: +```yaml + "Functional.max_pool2d.0.forward": { + "input_args": [ + { + "type": "torch.Tensor", + "dytpe": "torch_float32", + "shape": [ + 1, + 64, + 14, + 14 + ], + "Max": xxx, + "Min": xxx, + "Mean": xxx, + "Norm": xxx, + "requires_grad": true + }, + { + "type": "int", + "value": 3 + }, + { + "type": "int", + "value": 2 + 
}, + { + "type": "int", + "value": 1 + }, + { + "type": "int", + "value": 1 + } + ], + "input_kwargs": { + "ceil_mode": { + "type": "bool", + "value": false + }, + "return_indices": { + "type": "bool", + "value": false + }, + }, + "output": [ + { + "type": "torch.Tensor", + "dtype": "torch.float32", + "shape": [ + 1, + 64, + 7, + 7 + ], + "Max": xxx, + "Min": xxx, + "Mean": xxx, + "Norm": xxx, + "requires_grad": true + } + ] + } +``` + +初始名称为Functional.max_pool2d.0.forward,input_args是list,长度为5,第0项后面是Tensor,命名结束;第1-4项后面均是int,命名结束;按照顺序命名为 +``` +Functional.max_pool2d.0.forward.input.0 +Functional.max_pool2d.0.forward.input.1 +Functional.max_pool2d.0.forward.input.2 +Functional.max_pool2d.0.forward.input.3 +Functional.max_pool2d.0.forward.input.4 +``` +input_kwargs是dict,key是ceil_mode、return_indices,值均是bool,命名结束;命名为 +``` +Functional.max_pool2d.0.forward.input.ceil_mode +Functional.max_pool2d.0.forward.input.return_indices +``` +output是list,长度为1,第0项后面是Tensor,命名结束;按照顺序命名为 +``` +Functional.max_pool2d.0.forward.output.0 +``` +综上,生成的的op_name为 +``` +Functional.max_pool2d.0.forward.input.0 +Functional.max_pool2d.0.forward.input.1 +Functional.max_pool2d.0.forward.input.2 +Functional.max_pool2d.0.forward.input.3 +Functional.max_pool2d.0.forward.input.4 +Functional.max_pool2d.0.forward.input.ceil_mode +Functional.max_pool2d.0.forward.input.return_indices +Functional.max_pool2d.0.forward.output.0 +``` + +### 5.3 比对文件(单点数据) + + - 单卡场景示例: + + ```json + { + "npu_path": "./npu_dump/debug.json", + "bench_path": "./bench_dump/debug.json" + } + ``` + + - 多卡场景示例(step0目录下包含debug.json文件): + + ```json + { + "npu_path": "./npu_dump/step0", + "bench_path": "./bench_dump/step0" + } + ``` \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md b/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md index 55a148058a761f03b50b20ba635789e37241629f..dcd53eab47cd0a4ca0c42e15e5ea5e27361e9cb5 100644 --- a/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md @@ -35,14 +35,15 @@ msprobe -f mindspore compare -i ./compare.json -o ./output -s **完整参数说明** -| 参数名 | 说明 | 是否必选 | -| -------------------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | -| -f 或 --framework | 指定训练框架。mindspore。 | 是 | +| 参数名 | 说明 | 是否必选 | +|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | +| -f 或 --framework | 指定训练框架。mindspore。 | 是 | | -i或--input_path | 指定比对文件。比对文件内容及示例请参见[比对文件](#41-比对文件)或[比对文件(kernel)](#42-比对文件kernel)(比对文件(kernel)仅[不同版本下的全量kernel比对](#23-不同版本下的全量kernel比对)场景支持)。 | 是 | | -o或--output_path | 配置比对结果文件存盘目录,默认会在当前目录创建output目录。文件名称基于时间戳自动生成,格式为:
`compare_result_{timestamp}.xlsx`
`compare_result_{rank_id}_{step_id}_{timestamp}.xlsx`(仅[不同版本下的全量kernel比对](#23-不同版本下的全量kernel比对)场景支持)。
提示:output目录下与结果件同名文件将被删除覆盖。 | 否 | | -s或--stack_mode | 比对结果展示调用栈信息(NPU_Stack_Info)的开关,bool 类型。单卡场景开启时,需要使用[比对文件](#41-比对文件)的单卡场景配置stack_path指定stack.json文件,才能生成详细调用栈信息,否则在比对时会报错;暂不支持多卡场景。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 | | -c或--compare_only | 仅比对开关,bool 类型。该参数默认未配置,会启用自动精度分析,工具自动针对比对结果进行分析,识别到第一个精度可能不达标节点(在比对结果文件中的 Accuracy Reached or Not 列显示为 No),并给出问题可能产生的原因(打屏展示并生成 `advisor_{timestamp}.txt` 文件)。通过配置该参数取消自动精度分析,仅输出比对结果表格。 | 否 | | -f或--fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 | +| -hl或--highlight | 高亮颜色标记。开启后,比对结果件中通过红色或黄色标记精度可疑API或模块。通过直接配置该参数开启,默认未配置,表示关闭。 开启高亮颜色标记后,比对性能降低,如果比对结果行数超出excel单页限制,程序强制关闭高亮颜色标记。 | 否 | | -am或--api_mapping | 跨框架比对。配置该参数时表示开启跨框架API比对功能,可以指定自定义映射文件*.yaml,不指定映射文件时按照msprobe定义的默认映射关系进行比对。自定义映射文件的格式请参见[自定义映射文件(api_mapping)](#43-自定义映射文件api_mapping)。仅[跨框架的API比对](#25-跨框架的api比对)场景需要配置。 | 否 | | -cm或--cell_mapping | 跨框架比对。配置该参数时表示开启跨框架cell模块比对功能,可以指定自定义映射文件*.yaml,不指定映射文件时按照msprobe定义的默认映射关系进行比对。自定义映射文件的格式请参见[自定义映射文件(cell_mapping)](#44-自定义映射文件cell_mapping)。仅[跨框架的cell模块比对](#26-跨框架的cell模块比对)场景需要配置。 | 否 | | -dm或--data_mapping | 同框架或跨框架比对。通过映射文件指定两个具体参数的对应关系,可以在L0、L1或mix采集场景下使用。配置该参数的同时需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(data_mapping)](#45-自定义映射文件data_mapping)。 | 否 | diff --git a/debug/accuracy_tools/msprobe/mindspore/common/utils.py b/debug/accuracy_tools/msprobe/mindspore/common/utils.py index d35e1b51945c8f5a63f99edacadc3885570790ef..187f66468df6613fadc4453db58b43aa04a7762a 100644 --- a/debug/accuracy_tools/msprobe/mindspore/common/utils.py +++ b/debug/accuracy_tools/msprobe/mindspore/common/utils.py @@ -97,6 +97,9 @@ def save_tensor_as_npy(tensor, file_path): def convert_to_int(value): + if isinstance(value, bool): + logger.error('The value in rank_id or step should be int, please check!') + raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) try: return int(value) except Exception: diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/common_dir_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/common_dir_compare.py index 1f4ad8939cd6945db3e7aeb619cb8530f1aa48af..4b509cc3327b40bba058fe3eaac20dedb89d5094 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/common_dir_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/common_dir_compare.py @@ -29,28 +29,30 @@ from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException from msprobe.core.common.exceptions import FileCheckException from msprobe.core.common.file_utils import check_file_or_directory_path, write_df_to_csv, create_directory, \ - check_path_before_create, load_npy -from msprobe.core.common.const import CompareConst, FileCheckConst + check_path_before_create, load_npy +from msprobe.core.common.const import CompareConst from msprobe.core.compare.npy_compare import compare_ops_apply from msprobe.core.compare.multiprocessing_compute import check_accuracy +from msprobe.mindspore.compare.utils import check_name_map_dict def common_dir_compare(input_params: Dict, output_dir: str) -> Optional[pd.DataFrame]: """ 高级目录比对函数,完全镜像输入目录结构 - + Args: input_params: 包含npu_path和bench_path的字典 output_dir: 输出根目录 - + Returns: 当输入目录是平铺npy文件时返回DataFrame,否则返回None """ npu_root = Path(input_params.get('npu_path')) bench_root = Path(input_params.get('bench_path')) name_map_dict = input_params.get('map_dict', {}) + check_name_map_dict(name_map_dict) file_tree = build_mirror_file_tree(npu_root, bench_root) - + # 处理文件比对 with ProcessPoolExecutor() as executor: results = list(tqdm( @@ -67,29 +69,29 @@ def 
common_dir_compare(input_params: Dict, output_dir: str) -> Optional[pd.DataF def process_directory_pair(item: Tuple[Path, Tuple[Path, Path]], name_map_dict: Dict, output_dir: str): """ 处理一个目录对 - + Args: item: (相对路径, (npu目录, bench目录))元组 output_dir: 输出根目录 - + Returns: 比对结果的DataFrame(仅平铺结构时返回) """ rel_path, (npu_dir, bench_dir) = item - + # 创建镜像输出目录 output_path = Path(output_dir) / rel_path create_directory(output_path) - + # 生成文件映射 npu_files = find_npy_files(npu_dir) bench_files = find_npy_files(bench_dir) map_dict = generate_map_dict(npu_files, bench_files, name_map_dict) - + if not map_dict: logger.warning(f"No file pairs found in {rel_path}") return None - + # 执行比对 result_df = do_multi_process(process_chunk, map_dict) check_path_before_create(output_path) @@ -103,16 +105,16 @@ def process_directory_pair(item: Tuple[Path, Tuple[Path, Path]], name_map_dict: def build_mirror_file_tree(npu_root: Path, bench_root: Path) -> Dict[Path, Tuple[Path, Path]]: """ 构建镜像文件树,键为相对路径,值为(npu_path, bench_path)元组 - + Args: npu_root: NPU数据根目录 bench_root: 基准数据根目录 - + Returns: 文件树字典 """ file_tree = {} - + # 遍历NPU目录构建树结构 # 使用os.walk遍历目录,限制深度为10层 for root, dirs, files in os.walk(npu_root): @@ -121,23 +123,23 @@ def build_mirror_file_tree(npu_root: Path, bench_root: Path) -> Dict[Path, Tuple if depth > 10: dirs.clear() # 清空dirs列表以阻止继续递归 continue - + # 检查当前目录下是否有npy文件 if any(f.endswith('.npy') for f in files): # 获取相对路径 dir_path = Path(root).relative_to(npu_root) npu_dir_pair = os.path.join(npu_root, dir_path) bench_dir_pair = os.path.join(bench_root, dir_path) - + try: check_file_or_directory_path(bench_dir_pair, isdir=True) except FileCheckException: continue - + # 添加到文件树 if dir_path not in file_tree: file_tree[dir_path] = (npu_dir_pair, bench_dir_pair) - + return file_tree @@ -160,13 +162,13 @@ def find_npy_files(directory): file_name = base_name[0] logger.info(f"Generating file info for file: {file}") - + # 使用一致的分割逻辑 file_ele = file_name.split('_') - + if len(file_ele) < 2: continue - + key = '_'.join(file_ele[:-2]) if key: # 文件的完整路径 @@ -210,14 +212,14 @@ def do_multi_process(func, map_dict): df_chunks = [result_df] process_num = 1 logger.info(f"Using {process_num} processes with chunk size {df_chunk_size}") - + # 分割字典 map_chunks = split_dict(map_dict, df_chunk_size) - + # 创建结果列表和进程池 results = [] pool = multiprocessing.Pool(process_num) - + progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100) def update_progress(size, progress_lock, extra_param=None): @@ -230,6 +232,7 @@ def do_multi_process(func, map_dict): pool.close() except OSError as e: logger.error(f'pool terminate failed: {str(e)}') + results = [] try: # 提交任务到进程池 diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/distributed_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/distributed_compare.py index fa8b68070945f08c0a18d2fc2c142b05de8707fe..5064bedcdb8d65aa4406b77e5e8ae46696faf4d7 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/distributed_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/distributed_compare.py @@ -13,47 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
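The `common_dir_compare` hunk above now validates the optional `map_dict` entry of `input_params` through `check_name_map_dict` before any directory walking starts. A minimal usage sketch; the directory paths and the name mapping below are illustrative placeholders, not values from the repository:

```python
from msprobe.mindspore.compare.common_dir_compare import common_dir_compare

# Both roots are expected to contain mirrored sub-directories of .npy dumps.
input_params = {
    "npu_path": "./npu_npy",                      # hypothetical NPU dump root
    "bench_path": "./bench_npy",                  # hypothetical benchmark dump root
    "map_dict": {"matmul_npu": "matmul_bench"},   # optional NPU-to-bench name mapping
}

# With the new check_name_map_dict guard, a non-dict map_dict
# (e.g. "map_dict": ["matmul_npu"]) now raises CompareException
# with INVALID_OBJECT_TYPE_ERROR before any file is read.
common_dir_compare(input_params, "./compare_output")
```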
-import os from msprobe.core.common.utils import CompareException from msprobe.core.common.file_utils import create_directory from msprobe.core.common.exceptions import FileCheckException from msprobe.mindspore.common.log import logger from msprobe.mindspore.compare.ms_compare import ms_compare -from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json +from msprobe.core.compare.utils import compare_distributed_inner from msprobe.mindspore.compare.ms_graph_compare import GraphMSComparator def ms_compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): - if kwargs.get('suffix'): - logger.error("Argument 'suffix' is not supported for compare_distributed.") - raise CompareException(CompareException.INVALID_PARAM_ERROR) - is_print_compare_log = kwargs.get('is_print_compare_log', True) - # get the ranks and match by order - npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank')) - bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank')) - if len(npu_ranks) != len(bench_ranks): - logger.error('The number of ranks in the two runs are different. ' - 'Unable to match the ranks. Please use another folder to compare ' - 'or use compare() api and manually match the ranks.') - raise CompareException(CompareException.INVALID_PATH_ERROR) - for nr, br in zip(npu_ranks, bench_ranks): - npu_data_dir = os.path.join(npu_dump_dir, nr) - bench_data_dir = os.path.join(bench_dump_dir, br) - npu_path = extract_json(npu_data_dir, stack_json=False) - bench_path = extract_json(bench_data_dir, stack_json=False) - - dump_result_param = { - 'npu_json_path': npu_path, - 'bench_json_path': bench_path, - 'is_print_compare_log': is_print_compare_log - } - try: - ms_compare(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}', **kwargs) - except CompareException as e: - if e.code == CompareException.INVALID_DATA_ERROR: - logger.error(f"Invalid or missing 'data' in dump.json. Skipping {nr} comparison.") - if e.code == CompareException.INVALID_TASK_ERROR: - logger.error(f"Invalid or missing 'task' in dump.json. 
Skipping {nr} comparison.") + compare_distributed_inner(npu_dump_dir, bench_dump_dir, output_path, ms_compare, **kwargs) def ms_graph_compare(inputs, outputs): diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index dd2c6f8c103337498e037db00f65911329b2621d..ae3dfa63d78b2b7e4553a4f68df90aa84dc362ea 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -35,7 +35,16 @@ def ms_compare(input_param, output_path, **kwargs): config.data_mapping = generate_data_mapping_by_layer_mapping(input_param, config.layer_mapping, output_path) is_cross_framework = check_cross_framework(input_param.get('bench_json_path')) - mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match, config.dump_mode) + + config_dict = { + 'stack_mode': config.stack_mode, + 'auto_analyze': config.auto_analyze, + 'fuzzy_match': config.fuzzy_match, + 'highlight': config.highlight, + 'dump_mode': config.dump_mode, + 'compared_file_type': config.compared_file_type + } + mode_config = ModeConfig(**config_dict) mapping_config = MappingConfig(config.cell_mapping, config.api_mapping, config.data_mapping) ms_comparator = Comparator(read_real_data, mode_config, mapping_config, is_cross_framework) ms_comparator.compare_core(input_param, output_path, suffix=config.suffix) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py index ecf8e84d136fdfcbcab6372e45099b1c931900ea..53149e69f5db456a5fa86e100a8195ec3f1b7097 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py @@ -168,8 +168,13 @@ class GraphMSComparator: self.output_path = output_path self.base_npu_path = input_param.get('npu_path', None) self.base_bench_path = input_param.get('bench_path', None) - self.rank_list = [convert_to_int(rank_id) for rank_id in input_param.get('rank_id', [])] - self.step_list = [convert_to_int(step_id) for step_id in input_param.get('step_id', [])] + rank_id_list = input_param.get('rank_id', []) + step_id_list = input_param.get('step_id', []) + if not isinstance(rank_id_list, list) or not isinstance(step_id_list, list): + logger.error("'rank_id' and 'step_id' should both be lists, please check!") + raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) + self.rank_list = [convert_to_int(rank_id) for rank_id in rank_id_list] + self.step_list = [convert_to_int(step_id) for step_id in step_id_list] # split by rank and step, generate rank step path self.npu_rank_step_dict = self.generate_rank_step_path(self.base_npu_path) self.bench_rank_step_dict = self.generate_rank_step_path(self.base_bench_path) @@ -291,20 +296,8 @@ class GraphMSComparator: compare_result_df = self.do_multi_process(compare_result_df, mode) compare_result_name = add_time_with_xlsx(f"compare_result_{str(rank_id)}_{str(step_id)}") compare_result_path = os.path.join(os.path.realpath(self.output_path), f"{compare_result_name}") - self.to_excel(compare_result_df, compare_result_path) - logger.info(f"Compare rank: {rank_id} step: {step_id} finish. 
Compare result: {compare_result_path}.") - - def to_excel(self, compare_result_df: pd.DataFrame, compare_result_path: str, slice_num=0, need_slice=False) -> int: - size = len(compare_result_df) - # sheet size cannot be larger than 1048576 - if size < CompareConst.MAX_EXCEL_LENGTH: - compare_result_path = compare_result_path.replace('.xlsx', f'_slice_{slice_num}.xlsx') if \ - need_slice else compare_result_path save_excel(compare_result_path, compare_result_df) - return slice_num + 1 - else: - slice_num = self.to_excel(compare_result_df.iloc[0: size // 2], compare_result_path, slice_num, True) - return self.to_excel(compare_result_df.iloc[size // 2:], compare_result_path, slice_num, True) + logger.info(f"Compare rank: {rank_id} step: {step_id} finish. Compare result: {compare_result_path}.") def compare_process(self, rank_id, step_id): # generate data_path @@ -326,7 +319,7 @@ class GraphMSComparator: bench_data_list.extend(data_list) if npu_mode == GraphMode.ERROR_MODE or bench_mode == GraphMode.ERROR_MODE: - logger.warning(f"Data_path {npu_data_path} or {bench_data_path} is not exist.") + logger.warning(f"Data path: npu_data_path or bench_data_path does not exist.") return [], '' if npu_mode != bench_mode: logger.error(f"NPU mode {npu_mode} not equal to MATCH mode {bench_mode}.") diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/utils.py b/debug/accuracy_tools/msprobe/mindspore/compare/utils.py index 7a9c78e8f74426c23982723fcf90f729fc9e694c..a6f9f4ae55a656c269509fc479f476aa8b9251b9 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/utils.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/utils.py @@ -17,7 +17,8 @@ import os from msprobe.core.common.const import Const from msprobe.core.common.file_utils import load_npy, FileChecker, FileCheckConst -from msprobe.core.common.utils import detect_framework_by_dump_json +from msprobe.core.common.utils import detect_framework_by_dump_json, CompareException, check_op_str_pattern_valid +from msprobe.core.common.log import logger def read_npy_data(dir_path, file_name): @@ -35,3 +36,10 @@ def read_npy_data(dir_path, file_name): def check_cross_framework(bench_json_path): framework = detect_framework_by_dump_json(bench_json_path) return framework == Const.PT_FRAMEWORK + + +def check_name_map_dict(name_map_dict): + if not isinstance(name_map_dict, dict): + logger.error("'map_dict' should be a dict, please check!") + raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) + check_op_str_pattern_valid(str(name_map_dict)) diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py index b706a7544506b723a0c366866cec490eb5a4ff5f..6f8ad5cf60924581f9c112e1cb236f51f255a1dd 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py @@ -13,43 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
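The two guards added above (the bool check in `convert_to_int` and the list check on `rank_id`/`step_id`) exist because Python's `bool` is a subclass of `int`, so a bare `int()` conversion would silently turn `True` into `1`. A small sketch of the accepted and rejected shapes of the graph-compare `input_param`; the paths are placeholders:

```python
# bool is an int subclass, so int(True) == 1 would slip through a plain
# int() conversion -- hence the explicit isinstance(value, bool) rejection.
assert isinstance(True, int) and int(True) == 1

# Accepted: rank_id / step_id are lists of ints (or int-like strings).
input_param = {
    "npu_path": "./npu_graph_dump",      # placeholder paths
    "bench_path": "./bench_graph_dump",
    "rank_id": [0, 1],
    "step_id": [0],
}

# Rejected after this change (CompareException, INVALID_OBJECT_TYPE_ERROR):
#   "rank_id": 0        -> not a list
#   "step_id": [True]   -> bool element caught by convert_to_int
```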
-import os - -from msprobe.core.common.utils import CompareException -from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json -from msprobe.pytorch.common.log import logger +from msprobe.core.compare.utils import compare_distributed_inner from msprobe.pytorch.compare.pt_compare import compare def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): - if kwargs.get("suffix"): - logger.error("Argument 'suffix' is not supported for compare_distributed.") - raise CompareException(CompareException.INVALID_PARAM_ERROR) - is_print_compare_log = kwargs.get("is_print_compare_log", True) - # get the ranks and match by order - npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank')) - bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank')) - if len(npu_ranks) != len(bench_ranks): - logger.error( - "The number of ranks in the two runs are different. " - "Unable to match the ranks. " - "Please use another folder to compare or use compare() api and manually match the ranks.") - raise CompareException(CompareException.INVALID_PATH_ERROR) - for nr, br in zip(npu_ranks, bench_ranks): - npu_data_dir = os.path.join(npu_dump_dir, nr) - bench_data_dir = os.path.join(bench_dump_dir, br) - npu_path = extract_json(npu_data_dir, stack_json=False) - bench_path = extract_json(bench_data_dir, stack_json=False) - - dump_result_param = { - "npu_json_path": npu_path, - "bench_json_path": bench_path, - "is_print_compare_log": is_print_compare_log - } - try: - compare(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}', **kwargs) - except CompareException as e: - if e.code == CompareException.INVALID_DATA_ERROR: - logger.error(f"Invalid or missing 'data' in dump.json. Skipping {nr} comparison.") - if e.code == CompareException.INVALID_TASK_ERROR: - logger.error(f"Invalid or missing 'task' in dump.json. Skipping {nr} comparison.") + compare_distributed_inner(npu_dump_dir, bench_dump_dir, output_path, compare, **kwargs) diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py index 8acaf70c3e078c0c259cf64fe97dca63704cacb5..b3be3e793df30ada65a98374eae4f358da433fd3 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
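Both `ms_compare_distributed` and the PyTorch `compare_distributed` rewrites above delegate the per-rank loop to a shared `compare_distributed_inner` in `msprobe.core.compare.utils`. The following is a sketch of what that helper presumably looks like, reconstructed from the per-framework code removed in the two hunks above; the real implementation may differ in logging and error handling:

```python
import os

from msprobe.core.common.const import Const
from msprobe.core.common.log import logger
from msprobe.core.common.utils import CompareException
from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json


def compare_distributed_inner(npu_dump_dir, bench_dump_dir, output_path, compare_func, **kwargs):
    # 'suffix' is generated per rank below, so callers must not pass it themselves.
    if kwargs.get('suffix'):
        logger.error("Argument 'suffix' is not supported for compare_distributed.")
        raise CompareException(CompareException.INVALID_PARAM_ERROR)
    is_print_compare_log = kwargs.get('is_print_compare_log', True)
    # Ranks are matched by sorted order, exactly as in the removed per-framework loops.
    npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank'))
    bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank'))
    if len(npu_ranks) != len(bench_ranks):
        logger.error('The number of ranks in the two runs are different. Unable to match the ranks.')
        raise CompareException(CompareException.INVALID_PATH_ERROR)
    for nr, br in zip(npu_ranks, bench_ranks):
        dump_result_param = {
            'npu_json_path': extract_json(os.path.join(npu_dump_dir, nr), Const.DUMP_JSON_FILE),
            'bench_json_path': extract_json(os.path.join(bench_dump_dir, br), Const.DUMP_JSON_FILE),
            'is_print_compare_log': is_print_compare_log,
        }
        try:
            # compare_func is ms_compare or the PyTorch compare, injected by each caller.
            compare_func(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}', **kwargs)
        except CompareException as e:
            # Mirrors the removed handling: skip the rank when dump.json lacks 'data' or 'task'.
            if e.code in (CompareException.INVALID_DATA_ERROR, CompareException.INVALID_TASK_ERROR):
                logger.error(f"Invalid or missing 'data' or 'task' in dump.json. Skipping {nr} comparison.")
```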
+from msprobe.core.common.utils import CompareException +from msprobe.core.common.log import logger from msprobe.core.compare.acc_compare import Comparator, ModeConfig, MappingConfig, setup_comparison from msprobe.pytorch.compare.utils import read_pt_data @@ -24,10 +26,21 @@ def read_real_data(npu_dir, npu_data_name, bench_dir, bench_data_name, _) -> tup def compare(input_param, output_path, **kwargs): + if not isinstance(input_param, dict): + logger.error("input_param should be dict, please check!") + raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) config = setup_comparison(input_param, output_path, **kwargs) - mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match, config.dump_mode, - config.first_diff_analyze) + config_dict = { + 'stack_mode': config.stack_mode, + 'auto_analyze': config.auto_analyze, + 'fuzzy_match': config.fuzzy_match, + 'highlight': config.highlight, + 'dump_mode': config.dump_mode, + 'first_diff_analyze': config.first_diff_analyze, + 'compared_file_type': config.compared_file_type + } + mode_config = ModeConfig(**config_dict) mapping_config = MappingConfig(data_mapping=config.data_mapping) pt_comparator = Comparator(read_real_data, mode_config, mapping_config) pt_comparator.compare_core(input_param, output_path, suffix=config.suffix) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py index e6b2bbf0594959578d740e0170ed3c1fd92d1aa3..4916fe6ce0a1f02967f02bf66aa38112660eedec 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py @@ -54,6 +54,7 @@ from msprobe.core.common.utils import (CompareException, is_json_file, detect_framework_by_dump_json, is_save_variable_valid, + get_file_type, check_dump_json_key) from msprobe.core.common.decorator import recursion_depth_decorator @@ -220,23 +221,49 @@ class TestUtils(TestCase): } input_param["npu_json_path"] = "npu_path" - with patch("msprobe.core.common.utils.load_json", return_value=npu_json): + with patch("msprobe.core.common.utils.load_json", return_value=npu_json), \ + patch("msprobe.core.common.utils.get_file_type", return_value=Const.DUMP_JSON_FILE): dump_mode = get_dump_mode(input_param) self.assertEqual(dump_mode, Const.ALL) npu_json["task"] = Const.STATISTICS with patch("msprobe.core.common.utils.load_json", return_value=npu_json), \ - patch("msprobe.core.common.utils.md5_find", return_value=True): + patch("msprobe.core.common.utils.md5_find", return_value=True), \ + patch("msprobe.core.common.utils.get_file_type", return_value=Const.DUMP_JSON_FILE): dump_mode = get_dump_mode(input_param) self.assertEqual(dump_mode, Const.MD5) npu_json["task"] = Const.OVERFLOW_CHECK - with patch("msprobe.core.common.utils.load_json", return_value=npu_json): + with patch("msprobe.core.common.utils.load_json", return_value=npu_json), \ + patch("msprobe.core.common.utils.get_file_type", return_value=Const.DUMP_JSON_FILE): with self.assertRaises(CompareException) as context: dump_mode = get_dump_mode(input_param) self.assertEqual(context.exception.code, CompareException.INVALID_TASK_ERROR) mock_error.assert_called_with("Compare applies only to task is tensor or statistics") + def test_get_file_type(self): + # 测试有效的 file_path (dump.json) + file_path = 'path/to/dump.json' + expected_file_type = Const.DUMP_JSON_FILE + self.assertEqual(get_file_type(file_path), expected_file_type) + + # 测试有效的 file_path (debug.json) + 
file_path = 'path/to/debug.json' + expected_file_type = Const.DEBUG_JSON_FILE + self.assertEqual(get_file_type(file_path), expected_file_type) + + # 测试无效的 file_path + file_path = 'path/to/unknown.json' + with self.assertRaises(CompareException) as context: + get_file_type(file_path) + self.assertEqual(context.exception.code, CompareException.INVALID_PATH_ERROR) + + # 测试非字符串类型的 file_path + file_path = 12345 # 非字符串类型 + with self.assertRaises(CompareException) as context: + get_file_type(file_path) + self.assertEqual(context.exception.code, CompareException.INVALID_PATH_ERROR) + @patch('msprobe.core.common.file_utils.get_file_content_bytes') def test_get_json_contents_should_raise_exception(self, mock_get_file_content_bytes): mock_get_file_content_bytes.return_value = 'not a dict' diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py index ee15d9b06e530f32c5759492a9de40a2ab9cbf46..1f7c515a59b5c13edcae4e890e7054d55984280f 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py @@ -343,31 +343,36 @@ class TestUtilsMethods(unittest.TestCase): op_name = 'Functional.linear.0.forward' stack_json_data = {'Functional.linear.0.forward': ['File']} merge_list = { + 'debug_struct': [], 'input_struct': [('torch.float32', [2, 2])], 'op_name': ['Functional.linear.0.forward.input.0'], 'output_struct': [], 'params_struct': [], 'params_grad_struct': [], 'stack_info': [['File']], - 'summary': [[1, 1, 1, 1]] + 'summary': [[1, 1, 1, 1]], + 'state': ['input'] } - stack_mode = True - auto_analyze = True - fuzzy_match = False - dump_mode = Const.SUMMARY - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) + config_dict = { + 'stack_mode': True, + 'auto_analyze': True, + 'fuzzy_match': False, + 'dump_mode': Const.SUMMARY, + } + mode_config = ModeConfig(**config_dict) result = ParseData(mode_config).gen_merge_list(json_data, op_name, stack_json_data) self.assertEqual(result, merge_list) def test_check_op_item_fuzzy(self): - stack_mode = False - auto_analyze = True - dump_mode = Const.SUMMARY - - fuzzy_match = True - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) + config_dict = { + 'stack_mode': False, + 'auto_analyze': True, + 'fuzzy_match': True, + 'dump_mode': Const.SUMMARY, + } + mode_config = ModeConfig(**config_dict) mapping_config = MappingConfig() match = Match(mode_config, mapping_config, cross_frame=False) @@ -380,11 +385,13 @@ class TestUtilsMethods(unittest.TestCase): file_list = [os.path.join(base_dir, 'dump.json'), os.path.join(base_dir, 'dump.json'), os.path.join(base_dir, 'stack.json')] - stack_mode = True - auto_analyze = True - fuzzy_match = False - dump_mode = Const.SUMMARY - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) + config_dict = { + 'stack_mode': True, + 'auto_analyze': True, + 'fuzzy_match': False, + 'dump_mode': Const.SUMMARY, + } + mode_config = ModeConfig(**config_dict) mapping_config = MappingConfig() from msprobe.pytorch.compare.pt_compare import read_real_data @@ -393,10 +400,10 @@ class TestUtilsMethods(unittest.TestCase): o_data = [ ['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', '[2, 2]', '[2, 2]', 0, 0, 0, 0, '0.0%', 'N/A', '0.0%', '0.0%', - 2, 0, 1, 1, 2, 0, 1, 1, '', '', ['File'] + 2, 0, 1, 1, 2, 0, 1, 1, '', '', ['File'], 'input', 
'Functional.linear.0.forward' ] ] - columns = CompareConst.SUMMARY_COMPARE_RESULT_HEADER + ['NPU_Stack_Info'] + columns = CompareConst.SUMMARY_COMPARE_RESULT_HEADER + ['NPU_Stack_Info'] + ['state', 'api_origin_name'] o_result = pd.DataFrame(o_data, columns=columns, dtype=object) self.assertTrue(np.array_equal(result.to_numpy(), o_result.to_numpy())) @@ -425,8 +432,8 @@ class TestParseData(unittest.TestCase): npu_df, bench_df = parse_data.parse(file_list) target_df = pd.DataFrame( - [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File']]], - columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info'] + [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'input', 'Functional.linear.0.forward']], + columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name'] ) self.assertTrue(npu_df.equals(target_df)) self.assertTrue(bench_df.equals(target_df)) @@ -443,8 +450,8 @@ class TestParseData(unittest.TestCase): npu_df = parse_data.gen_data_df(npu_json_data, stack_json_data) target_df = pd.DataFrame( - [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File']]], - columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info'] + [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'input', 'Functional.linear.0.forward']], + columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name'] ) self.assertTrue(npu_df.equals(target_df)) @@ -460,8 +467,8 @@ class TestParseData(unittest.TestCase): npu_df = parse_data.gen_data_df(npu_json_data, stack_json_data) target_df = pd.DataFrame( - [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'Functional.linear.0.forward.input.0.pt']], - columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'data_name'] + [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'input', 'Functional.linear.0.forward', 'Functional.linear.0.forward.input.0.pt']], + columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name', 'data_name'] ) self.assertTrue(npu_df.equals(target_df)) @@ -477,8 +484,8 @@ class TestParseData(unittest.TestCase): npu_df = parse_data.gen_data_df(npu_json_data, stack_json_data) target_df = pd.DataFrame( - [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 123456]], - columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'md5'] + [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'input', 'Functional.linear.0.forward', 123456]], + columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name', 'md5'] ) self.assertTrue(npu_df.equals(target_df)) @@ -494,13 +501,15 @@ class TestParseData(unittest.TestCase): merge_list = parse_data.gen_merge_list(npu_json_data, 'Functional.linear.0.forward', stack_json_data) target_dict = { + 'debug_struct': [], 'input_struct': [('torch.float32', [2, 2])], 'op_name': ['Functional.linear.0.forward.input.0'], 'output_struct': [], 'params_grad_struct': [], 'params_struct': [], 'stack_info': [['File']], - 'summary': [[2, 0, 1, 1]] + 'summary': [[2, 0, 1, 1]], + 'state': ['input'] } self.assertEqual(merge_list, target_dict) @@ -670,13 +679,14 @@ class TestMatch(unittest.TestCase): match = Match(mode_config, mapping_config, cross_frame=False) match_result = pd.DataFrame(columns=CompareConst.MATCH_RESULT_COLUMNS) - 
npu_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2]], - index=['op_name_x', 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'data_name_x', + npu_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2]], + index=['op_name_x', 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', + 'state_x', 'api_origin_name_x', 'data_name_x', 'compare_key', 'compare_shape'] ) match_result = match.put_unmatched_in_table(match_result, npu_op_item) - target_match_result = pd.DataFrame([['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2], - 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']], + target_match_result = pd.DataFrame([['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2], + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']], columns=CompareConst.MATCH_RESULT_COLUMNS) self.assertTrue(match_result.equals(target_match_result)) @@ -686,17 +696,17 @@ class TestMatch(unittest.TestCase): match = Match(mode_config, mapping_config, cross_frame=False) match_result = pd.DataFrame(columns=CompareConst.MATCH_RESULT_COLUMNS) - npu_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2]], - index=['op_name_x', 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'data_name_x', + npu_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2]], + index=['op_name_x', 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'state_x', 'api_origin_name_x', 'data_name_x', 'compare_key', 'compare_shape'] ) - bench_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2]], - index=['op_name_y', 'dtype_y', 'shape_y', 'summary_y', 'stack_info_y', 'data_name_y', + bench_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2]], + index=['op_name_y', 'dtype_y', 'shape_y', 'summary_y', 'stack_info_y', 'state_y', 'api_origin_name_y', 'data_name_y', 'compare_key', 'compare_shape'] ) match_result = match.put_matched_in_table(match_result, npu_op_item, bench_op_item) - target_match_result = pd.DataFrame([['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2], - 'op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name']], + target_match_result = pd.DataFrame([['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2], + 'op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name']], columns=CompareConst.MATCH_RESULT_COLUMNS) self.assertTrue(match_result.equals(target_match_result)) @@ -739,30 +749,32 @@ class TestMatch(unittest.TestCase): match = Match(mode_config, mapping_config, cross_frame=False) npu_df = pd.DataFrame([ - ['Functional.conv2d.3.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.conv2d.3.forward.input.0.pt', 'Functional.conv2d.3.forward.input.0', [1, 2]], - ['Functional.amax.1.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.1.forward.input.0', [1, 2]] - ], columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'data_name', 'compare_key', 'compare_shape']) + ['Functional.conv2d.3.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.conv2d.3.forward','Functional.conv2d.3.forward.input.0.pt', 'Functional.conv2d.3.forward.input.0', 
[1, 2]], + ['Functional.amax.1.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.amax.1.forward', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.1.forward.input.0', [1, 2]] + ], columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name', 'data_name', 'compare_key', 'compare_shape']) bench_df = pd.DataFrame([ - ['Functional.conv2d.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.conv2d.0.forward.input.0.pt', 'Functional.conv2d.0.forward.input.0', [1, 2]], - ['Functional.amax.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.0.forward.input.0', [1, 2]] - ], columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'data_name', 'compare_key', 'compare_shape']) + ['Functional.conv2d.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.conv2d.0.forward', 'Functional.conv2d.0.forward.input.0.pt', 'Functional.conv2d.0.forward.input.0', [1, 2]], + ['Functional.amax.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.amax.0.forward', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.0.forward.input.0', [1, 2]] + ], columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name', 'data_name', 'compare_key', 'compare_shape']) match_result = match.process_fuzzy_match(npu_df, bench_df) expected = pd.DataFrame( [ - ['Functional.conv2d.3.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.conv2d.3.forward.input.0.pt', 'Functional.conv2d.3.forward.input.0', [1, 2], 'Functional.conv2d.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.conv2d.0.forward.input.0.pt'], - ['Functional.amax.1.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.1.forward.input.0', [1, 2], 'Functional.amax.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.amax.0.forward.input.0.pt'] + ['Functional.conv2d.3.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.conv2d.3.forward', 'Functional.conv2d.3.forward.input.0.pt', 'Functional.conv2d.3.forward.input.0', [1, 2], 'Functional.conv2d.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.conv2d.0.forward', 'Functional.conv2d.0.forward.input.0.pt'], + ['Functional.amax.1.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.amax.1.forward', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.1.forward.input.0', [1, 2], 'Functional.amax.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.amax.0.forward', 'Functional.amax.0.forward.input.0.pt'] ] , columns=CompareConst.MATCH_RESULT_COLUMNS) self.assertTrue(match_result.equals(expected)) def test_match_op_both_last_element(self): - stack_mode = False - auto_analyze = True - fuzzy_match = False - dump_mode = Const.SUMMARY - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) + config_dict = { + 'stack_mode': False, + 'auto_analyze': True, + 'fuzzy_match': False, + 'dump_mode': Const.SUMMARY, + } + mode_config = ModeConfig(**config_dict) mapping_config = MappingConfig() match = Match(mode_config, mapping_config, cross_frame=False) @@ -771,11 +783,13 @@ class TestMatch(unittest.TestCase): self.assertEqual(b, 0) def test_match_op_only_npu_last_element(self): - stack_mode = False - 
auto_analyze = True - fuzzy_match = False - dump_mode = Const.SUMMARY - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) + config_dict = { + 'stack_mode': False, + 'auto_analyze': True, + 'fuzzy_match': False, + 'dump_mode': Const.SUMMARY, + } + mode_config = ModeConfig(**config_dict) mapping_config = MappingConfig() match = Match(mode_config, mapping_config, cross_frame=False) @@ -784,11 +798,13 @@ class TestMatch(unittest.TestCase): self.assertEqual(b, 0) def test_match_op_only_bench_last_element(self): - stack_mode = False - auto_analyze = True - fuzzy_match = False - dump_mode = Const.SUMMARY - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) + config_dict = { + 'stack_mode': False, + 'auto_analyze': True, + 'fuzzy_match': False, + 'dump_mode': Const.SUMMARY, + } + mode_config = ModeConfig(**config_dict) mapping_config = MappingConfig() match = Match(mode_config, mapping_config, cross_frame=False) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index 6265e31cfccbd8a741435250de9438c02374e721..173fb550067de76b77524699430691fa87df6c58 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -13,8 +13,7 @@ from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.utils import CompareException from msprobe.core.compare.utils import ApiItemInfo, _compare_parser, check_and_return_dir_contents, extract_json, \ count_struct, get_accuracy, get_rela_diff_summary_mode, merge_tensor, op_item_parse, read_op, result_item_init, \ - stack_column_process, table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, \ - gen_op_item, ApiBatch + stack_column_process, table_value_is_valid, reorder_op_name_list, reorder_op_x_list, gen_op_item, ApiBatch # test_read_op_1 op_data = { @@ -32,15 +31,15 @@ op_name = "Tensor.add_0.0.forward" op_result = [ {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, 'data_name': '-1', - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.forward.input.0'}, + 'Norm': 2.2533628940582275, 'requires_grad': 'True', 'full_op_name': 'Tensor.add_0.0.forward.input.0', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'md5': '00000000', 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, 'data_name': '-1', - 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_0.0.forward.input.1'}, + 'Norm': 0.02844562754034996, 'requires_grad': 'False', 'full_op_name': 'Tensor.add_0.0.forward.input.1', 'state': 'input'}, {'full_op_name': 'Tensor.add_0.0.forward.input.alpha', 'dtype': "", 'shape': '[]', 'md5': '0dae4479', - 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1', 'type': 'float', 'value': -0.1}, + 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1', 'type': 'float', 'value': -0.1, 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, 'data_name': '-1', - 'Norm': 2.2533628940582275, 'requires_grad': True, 
'full_op_name': 'Tensor.add_0.0.forward.output.0'}] + 'Norm': 2.2533628940582275, 'requires_grad': 'True', 'full_op_name': 'Tensor.add_0.0.forward.output.0', 'state': 'output'}] # test_read_op_1 op_data_b = { @@ -57,13 +56,13 @@ op_name_b = "Tensor.add_0.0.backward" op_result_b = [ {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'data_name': '-1', 'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.backward.input.0'}, + 'Norm': 2.2533628940582275, 'requires_grad': 'True', 'full_op_name': 'Tensor.add_0.0.backward.input.0', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'data_name': '-1', 'md5': '00000000', 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, - 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_0.0.backward.input.1'}, + 'Norm': 0.02844562754034996, 'requires_grad': 'False', 'full_op_name': 'Tensor.add_0.0.backward.input.1', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'data_name': '-1', 'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.backward.output.0'}] + 'Norm': 2.2533628940582275, 'requires_grad': 'True', 'full_op_name': 'Tensor.add_0.0.backward.output.0', 'state': 'output'}] # test_op_item_parse parse_item = [ @@ -77,14 +76,14 @@ parse_index = None parse_item_list = None parse_top_bool = True o_result_parse = [ - {'Max': 4097.0, 'Mean': 820.2, 'Min': 0.0, 'Norm': 4097.0, 'dtype': 'torch.int64', 'requires_grad': False, + {'Max': 4097.0, 'Mean': 820.2, 'Min': 0.0, 'Norm': 4097.0, 'dtype': 'torch.int64', 'requires_grad': 'False', 'shape': [5], 'type': 'torch.Tensor', 'full_op_name': 'Distributed.broadcast.0.forward.input.0', - 'data_name': '-1', 'md5': '00000000'}, + 'data_name': '-1', 'md5': '00000000', 'state': 'input'}, {'full_op_name': 'Distributed.broadcast.0.forward.input.1', 'dtype': "", 'shape': '[]', - 'md5': 'f4dbdf21', 'Max': 0, 'Min': 0, 'Mean': 0, 'Norm': 0, 'data_name': '-1', 'type': 'int', 'value': 0}, + 'md5': 'f4dbdf21', 'Max': 0, 'Min': 0, 'Mean': 0, 'Norm': 0, 'data_name': '-1', 'type': 'int', 'value': 0, 'state': 'input'}, {'Max': None, 'Mean': None, 'Min': None, 'Norm': None, 'data_name': '-1', 'dtype': 'slice', 'type': 'slice', 'full_op_name': 'Distributed.broadcast.0.forward.input.2', 'md5': '5fbbe87f', 'shape': '(3,)', - 'value': [None, None, None]} + 'value': [None, None, None], 'state': 'input'} ] # test_resolve_api_special_parameters @@ -255,15 +254,15 @@ o_result_unmatch_3 = [ tensor_list = [ {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, 'Norm': 2.2533628940582275, 'requires_grad': True, - 'full_op_name': 'Tensor.add_.0.forward.input.0'}, + 'full_op_name': 'Tensor.add_.0.forward.input.0', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, - 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_.0.forward.input.1'}, + 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 
'Tensor.add_.0.forward.input.1', 'state': 'input'}, {'full_op_name': 'Tensor.add_.0.forward.input.alpha.0', 'dtype': "", "shape": '[]', 'md5': None, - 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1'}, + 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_.0.forward.output.0'} + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_.0.forward.output.0', 'state': 'output'} ] result_op_dict = {'op_name': ['Tensor.add_.0.forward.input.0', 'Tensor.add_.0.forward.input.1', 'Tensor.add_.0.forward.input.alpha.0', 'Tensor.add_.0.forward.output.0'], @@ -272,22 +271,24 @@ result_op_dict = {'op_name': ['Tensor.add_.0.forward.input.0', 'Tensor.add_.0.fo 'output_struct': [('torch.float32', [16, 1, 3, 3])], 'params_struct': [], 'params_grad_struct': [], + 'debug_struct': [], 'summary': [[0.33033010363578796, -0.331031858921051, -0.030964046716690063, 2.2533628940582275], [0.003992878366261721, -0.008102823048830032, -0.0002002553956117481, 0.02844562754034996], [-0.1, -0.1, -0.1, -0.1], [0.33033010363578796, -0.331031858921051, -0.030964046716690063, 2.2533628940582275]], - 'stack_info': []} + 'stack_info': [], + 'state': ['input', 'input', 'input', 'output']} tensor_list_md5 = [ {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, - 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_.0.forward.input.0', 'md5': 1}, + 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_.0.forward.input.0', 'md5': 1, 'state': 'input'}, {'full_op_name': 'Tensor.add_.0.forward.kwargs.alpha.0', 'dtype': "", "shape": '[]', 'md5': None, - 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1'}, + 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_.0.forward.output.0', 'md5': 2} + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_.0.forward.output.0', 'md5': 2, 'state': 'output'} ] result_op_dict_md5 = {'op_name': ['Tensor.add_.0.forward.input.0', 'Tensor.add_.0.forward.kwargs.alpha.0', 'Tensor.add_.0.forward.output.0'], @@ -295,18 +296,20 @@ result_op_dict_md5 = {'op_name': ['Tensor.add_.0.forward.input.0', 'Tensor.add_. 
'output_struct': [('torch.float32', [16, 1, 3, 3], 2)], 'params_struct': [], 'params_grad_struct': [], + 'debug_struct': [], 'summary': [ [0.003992878366261721, -0.008102823048830032, -0.0002002553956117481, 0.02844562754034996], [-0.1, -0.1, -0.1, -0.1], [0.33033010363578796, -0.331031858921051, -0.030964046716690063, 2.2533628940582275]], - 'stack_info': []} + 'stack_info': [], + 'state': ['input', 'input', 'output']} base_dir1 = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'test_acc_compare_utils1') base_dir2 = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'test_acc_compare_utils2') def create_json_files(base_dir): - file_names = ['dump.json', 'stack.json', 'construct.json'] + file_names = ['dump.json', 'stack.json', 'construct.json', 'debug.json'] for file_name in file_names: file_path = os.path.join(base_dir, file_name) @@ -339,12 +342,15 @@ class TestUtilsMethods(unittest.TestCase): def test_extract_json_1(self): create_json_files(base_dir1) - result = extract_json(base_dir1, stack_json=False) + result = extract_json(base_dir1, Const.DUMP_JSON_FILE) self.assertEqual(result, os.path.join(base_dir1, 'dump.json')) - result = extract_json(base_dir1, stack_json=True) + result = extract_json(base_dir1, Const.STACK_JSON_FILE) self.assertEqual(result, os.path.join(base_dir1, 'stack.json')) + result = extract_json(base_dir1, Const.DEBUG_JSON_FILE) + self.assertEqual(result, os.path.join(base_dir1, 'debug.json')) + def test_check_and_return_dir_contents(self): create_rank_dirs(base_dir2) result = check_and_return_dir_contents(base_dir2, 'rank') @@ -359,12 +365,12 @@ class TestUtilsMethods(unittest.TestCase): self.assertEqual(result, op_result_b) def test_op_item_parse(self): - result = op_item_parse(parse_item, parse_op_name) + result = op_item_parse(parse_item, parse_op_name, 'input') self.assertEqual(result, o_result_parse) def test_op_item_parse_max_depth(self): with self.assertRaises(CompareException) as context: - op_item_parse(parse_item, parse_op_name, depth=11) + op_item_parse(parse_item, parse_op_name, 'input', depth=11) self.assertEqual(context.exception.code, CompareException.RECURSION_LIMIT_ERROR) def test_get_rela_diff_summary_mode_float_or_int(self): @@ -550,57 +556,34 @@ class TestUtilsMethods(unittest.TestCase): self.assertFalse(result) -class TestGetNameAndState(unittest.TestCase): - def test_valid_forward_input(self): - name = 'conv2d.forward.1.input.0' - expected_api = 'conv2d.forward.1.' - expected_state = 'input' - self.assertEqual(get_name_and_state(name), (expected_api, expected_state)) - - def test_valid_backward_output(self): - name = 'Functional.pad.0.backward.output.0' - expected_api = 'Functional.pad.0.backward.' - expected_state = 'output' - self.assertEqual(get_name_and_state(name), (expected_api, expected_state)) - - def test_valid_with_kwargs(self): - name = 'layer.norm.2.forward.kwargs.attr' - expected_api = 'layer.norm.2.forward.' - expected_state = 'kwargs' - self.assertEqual(get_name_and_state(name), (expected_api, expected_state)) - - def test_no_numeric_index(self): - name = 'conv2d.forward.input.0' - expected_api = 'conv2d.forward.' 
@@ -359,12 +365,12 @@ class TestUtilsMethods(unittest.TestCase):
         self.assertEqual(result, op_result_b)
     def test_op_item_parse(self):
-        result = op_item_parse(parse_item, parse_op_name)
+        result = op_item_parse(parse_item, parse_op_name, 'input')
         self.assertEqual(result, o_result_parse)
     def test_op_item_parse_max_depth(self):
         with self.assertRaises(CompareException) as context:
-            op_item_parse(parse_item, parse_op_name, depth=11)
+            op_item_parse(parse_item, parse_op_name, 'input', depth=11)
         self.assertEqual(context.exception.code, CompareException.RECURSION_LIMIT_ERROR)
     def test_get_rela_diff_summary_mode_float_or_int(self):
@@ -550,57 +556,34 @@ class TestUtilsMethods(unittest.TestCase):
         self.assertFalse(result)
-class TestGetNameAndState(unittest.TestCase):
-    def test_valid_forward_input(self):
-        name = 'conv2d.forward.1.input.0'
-        expected_api = 'conv2d.forward.1.'
-        expected_state = 'input'
-        self.assertEqual(get_name_and_state(name), (expected_api, expected_state))
-
-    def test_valid_backward_output(self):
-        name = 'Functional.pad.0.backward.output.0'
-        expected_api = 'Functional.pad.0.backward.'
-        expected_state = 'output'
-        self.assertEqual(get_name_and_state(name), (expected_api, expected_state))
-
-    def test_valid_with_kwargs(self):
-        name = 'layer.norm.2.forward.kwargs.attr'
-        expected_api = 'layer.norm.2.forward.'
-        expected_state = 'kwargs'
-        self.assertEqual(get_name_and_state(name), (expected_api, expected_state))
-
-    def test_no_numeric_index(self):
-        name = 'conv2d.forward.input.0'
-        expected_api = 'conv2d.forward.'
-        expected_state = 'input'
-        self.assertEqual(get_name_and_state(name), (expected_api, expected_state))
-
-    def test_invalid__state(self):
-        name = 'conv2d.forward.1.invalidstate.0'
-        with self.assertRaises(CompareException) as context:
-            get_name_and_state(name)
-        self.assertIn('Invalid name string', str(context.exception.code))
-
-
 class TestReorderOpNameList(unittest.TestCase):
     def test_reorder_op_name_list(self):
         # 标准顺序
         op_name_list = ["op.forward.input.0.0", "op.forward.output.0", "op.forward.output.1", "op.forward.parameters.1", "op.forward.parameters.2", "op.parameters_grad.0"]
-        result = reorder_op_name_list(op_name_list)
-        expected = ["op.forward.input.0.0", "op.forward.parameters.1", "op.forward.parameters.2", "op.forward.output.0", "op.forward.output.1", "op.parameters_grad.0"]
-        self.assertEqual(result, expected)
+        state_list = ["input", "output", "output", "parameters", "parameters", "parameters_grad"]
+        op_name_reorder, state_reorder = reorder_op_name_list(op_name_list, state_list)
+        expected_result = ["op.forward.input.0.0", "op.forward.parameters.1", "op.forward.parameters.2", "op.forward.output.0", "op.forward.output.1", "op.parameters_grad.0"]
+        expected_state = ["input", "parameters", "parameters", "output", "output", "parameters_grad"]
+        self.assertEqual(op_name_reorder, expected_result)
+        self.assertEqual(state_reorder, expected_state)
         # 只有输入元素
         op_name_list = ["op.forward.input.0", "op.forward.input.1"]
-        result = reorder_op_name_list(op_name_list)
-        expected = ["op.forward.input.0", "op.forward.input.1"]
-        self.assertEqual(result, expected)
+        state_list = ["input", "input"]
+        op_name_reorder, state_reorder = reorder_op_name_list(op_name_list, state_list)
+        expected_result = ["op.forward.input.0", "op.forward.input.1"]
+        expected_state = ["input", "input"]
+        self.assertEqual(op_name_reorder, expected_result)
+        self.assertEqual(state_reorder, expected_state)
         # 输入为空
         op_name_list = []
-        result = reorder_op_name_list(op_name_list)
-        expected = []
-        self.assertEqual(result, expected)
+        state_list = []
+        op_name_reorder, state_reorder = reorder_op_name_list(op_name_list, state_list)
+        expected_result = []
+        expected_state = []
+        self.assertEqual(op_name_reorder, expected_result)
+        self.assertEqual(state_reorder, expected_state)
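The reordering contract asserted above (inputs first, then parameters, then outputs, with parameters_grad last, and the state list kept aligned with the name list) can be shown with a small standalone sketch; this is not the msprobe implementation, only the same behaviour:

    # Standalone illustration of the reorder contract checked by the test:
    # stable-sort names into input -> parameters -> output -> parameters_grad
    # order while keeping the per-item state list aligned with the names.
    def reorder_names_with_states(op_names, states):
        rank = {"input": 0, "kwargs": 0, "parameters": 1, "output": 2, "parameters_grad": 3}
        paired = sorted(zip(op_names, states), key=lambda pair: rank.get(pair[1], 4))
        return [name for name, _ in paired], [state for _, state in paired]

    names = ["op.forward.input.0.0", "op.forward.output.0", "op.forward.parameters.1"]
    states = ["input", "output", "parameters"]
    print(reorder_names_with_states(names, states))
    # (['op.forward.input.0.0', 'op.forward.parameters.1', 'op.forward.output.0'],
    #  ['input', 'parameters', 'output'])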
 class TestReorderOpXList(unittest.TestCase):
@@ -609,37 +592,45 @@ class TestReorderOpXList(unittest.TestCase):
         op_name_list = ["op.forward.input.0", "op.forward.output.0", "op.forward.parameters.weight"]
         summary_list = ["summary1", "summary2", "summary3"]
         data_name_list = ["data1", "data2", "data3"]
-        result_op_name, result_summary, result_data_name = reorder_op_x_list(op_name_list, summary_list, data_name_list)
+        state_list = ["input", "output", "parameters"]
+        result_op_name, result_summary, result_data_name, result_state = reorder_op_x_list(op_name_list, summary_list, data_name_list, state_list)
         self.assertEqual(result_op_name, ["op.forward.input.0", "op.forward.parameters.weight", "op.forward.output.0"])
         self.assertEqual(result_summary, ["summary1", "summary3", "summary2"])
         self.assertEqual(result_data_name, ["data1", "data3", "data2"])
+        self.assertEqual(result_state, ["input", "parameters", "output"])
         # 空 op_name_list 或 summary_list
         op_name_list = []
         summary_list = []
         data_name_list = ["data1", "data2", "data3"]
-        result_op_name, result_summary, result_data_name = reorder_op_x_list(op_name_list, summary_list, data_name_list)
+        state_list = []
+        result_op_name, result_summary, result_data_name, result_state = reorder_op_x_list(op_name_list, summary_list, data_name_list, state_list)
         self.assertEqual(result_op_name, [])
         self.assertEqual(result_summary, [])
         self.assertEqual(result_data_name, ["data1", "data2", "data3"])
+        self.assertEqual(result_state, [])
         # 空 data_name_list
         op_name_list = ["op.forward.input.0", "op.forward.output.0", "op.forward.parameters.weight"]
         summary_list = ["summary1", "summary2", "summary3"]
         data_name_list = []
-        result_op_name, result_summary, result_data_name = reorder_op_x_list(op_name_list, summary_list, data_name_list)
+        state_list = ["input", "output", "parameters"]
+        result_op_name, result_summary, result_data_name, result_state = reorder_op_x_list(op_name_list, summary_list, data_name_list, state_list)
        self.assertEqual(result_op_name, ["op.forward.input.0", "op.forward.parameters.weight", "op.forward.output.0"])
         self.assertEqual(result_summary, ["summary1", "summary3", "summary2"])
         self.assertEqual(result_data_name, [])
+        self.assertEqual(result_state, ["input", "parameters", "output"])
         # data_name_list 为 None
         op_name_list = ["op.forward.input.0", "op.forward.output.0", "op.forward.parameters.weight"]
         summary_list = ["summary1", "summary2", "summary3"]
         data_name_list = None
-        result_op_name, result_summary, result_data_name = reorder_op_x_list(op_name_list, summary_list, data_name_list)
+        state_list = ["input", "output", "parameters"]
+        result_op_name, result_summary, result_data_name, result_state = reorder_op_x_list(op_name_list, summary_list, data_name_list, state_list)
         self.assertEqual(result_op_name, ["op.forward.input.0", "op.forward.parameters.weight", "op.forward.output.0"])
         self.assertEqual(result_summary, ["summary1", "summary3", "summary2"])
         self.assertEqual(result_data_name, None)
+        self.assertEqual(result_state, ["input", "parameters", "output"])
 class TestGenOpItem(unittest.TestCase):
@@ -657,7 +648,7 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
         self.assertEqual(result['data_name'], 'test_data')
         self.assertEqual(result['full_op_name'], 'test_data')
@@ -668,6 +659,7 @@ class TestGenOpItem(unittest.TestCase):
         self.assertEqual(result['Mean'], 2)
         self.assertEqual(result['Norm'], 2)
         self.assertEqual(result['md5'], f"{zlib.crc32(str(op_data['value']).encode()):08x}")
+        self.assertEqual(result['state'], 'input')
     def test_gen_op_item_with_empty_data_name(self):
         op_data = {
@@ -677,11 +669,12 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
         # data_name为空时,应该被设置为'-1'
         self.assertEqual(result['data_name'], '-1')
         self.assertEqual(result['full_op_name'], op_name)
+        self.assertEqual(result['state'], 'input')
     def test_gen_op_item_with_none_data_name(self):
         op_data = {
@@ -691,11 +684,12 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
         # data_name为None时,应该被设置为'-1'
         self.assertEqual(result['data_name'], '-1')
         self.assertEqual(result['full_op_name'], op_name)
+        self.assertEqual(result['state'], 'input')
     def test_gen_op_item_with_type_torch_size(self):
         op_data = {
@@ -705,7 +699,7 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
         self.assertEqual(result['dtype'], 'torch.Size')
         self.assertEqual(result['shape'], '[2, 3, 4]')
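For context on the repeated gen_op_item changes: every call site now passes the state explicitly, and the tests expect it to be echoed back on the resulting op item. A hedged usage sketch, assuming gen_op_item is importable from msprobe.core.compare.utils and using a made-up op_data payload:

    # Hedged sketch: the third positional argument is the state, and per the
    # tests above it should come back on the generated item; op_data here is
    # purely illustrative and not taken from the test fixtures.
    from msprobe.core.compare.utils import gen_op_item  # assumed import path

    op_data = {'type': 'int', 'value': 5, 'data_name': ''}
    op_item = gen_op_item(op_data, 'op_test', 'input')
    print(op_item['full_op_name'], op_item['state'])  # expected per the tests: op_test input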
@@ -713,6 +707,7 @@ class TestGenOpItem(unittest.TestCase):
         self.assertEqual(result['Min'], None)
         self.assertEqual(result['Mean'], None)
         self.assertEqual(result['Norm'], None)
+        self.assertEqual(result['state'], 'input')
     def test_gen_op_item_with_type_slice(self):
         op_data = {
@@ -722,10 +717,11 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
         self.assertEqual(result['dtype'], 'slice')
         self.assertEqual(result['shape'], str(np.shape(np.array(op_data['value']))))
+        self.assertEqual(result['state'], 'input')
     def test_gen_op_item_with_type_ellipsis(self):
         op_data = {
@@ -735,7 +731,7 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
         self.assertEqual(result['dtype'], 'ellipsis')
         self.assertEqual(result['shape'], '[]')
@@ -743,6 +739,7 @@ class TestGenOpItem(unittest.TestCase):
         self.assertEqual(result['Min'], '...')
         self.assertEqual(result['Mean'], '...')
         self.assertEqual(result['Norm'], '...')
+        self.assertEqual(result['state'], 'input')
     def test_gen_op_item_with_type_torch_process_group(self):
         op_data = {
@@ -752,7 +749,7 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
         self.assertEqual(result['dtype'], 'torch.ProcessGroup')
         self.assertEqual(result['shape'], '[]')
@@ -760,6 +757,7 @@ class TestGenOpItem(unittest.TestCase):
         self.assertEqual(result['Min'], '[0, 1]')
         self.assertEqual(result['Mean'], '[0, 1]')
         self.assertEqual(result['Norm'], '[0, 1]')
+        self.assertEqual(result['state'], 'input')
     def test_gen_op_item_with_default_dtype(self):
         op_data = {
@@ -769,10 +767,11 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
         self.assertEqual(result['dtype'], str(type(op_data['value'])))
         self.assertEqual(result['shape'], '[]')
+        self.assertEqual(result['state'], 'input')
     def test_gen_op_item_with_md5(self):
         op_data = {
@@ -782,10 +781,11 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
         expected_md5 = f"{zlib.crc32(str(op_data['value']).encode()):08x}"
         self.assertEqual(result['md5'], expected_md5)
+        self.assertEqual(result['state'], 'input')
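Note that the "md5" value asserted in test_gen_op_item_with_md5 is not a real MD5 digest: per the expected string it is a CRC32 over the stringified value, rendered as eight hex digits. A self-contained illustration of just that format:

    # Self-contained illustration of the checksum format the test expects:
    # CRC32 over str(value).encode(), zero-padded to 8 lowercase hex digits.
    import zlib

    value = 5  # purely illustrative scalar
    checksum = f"{zlib.crc32(str(value).encode()):08x}"
    print(len(checksum), checksum)  # always 8 characters; digits depend on the value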
 class TestApiBatch(unittest.TestCase):
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_first_diff_analyze.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_first_diff_analyze.py
index 5c5a2690780bd0c1ada134f48746430da337d48d..c7919efba7e0240ff0d3398357f2ad4d45c1df30 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_first_diff_analyze.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_first_diff_analyze.py
@@ -3,38 +3,44 @@
 from unittest.mock import patch
 import pandas as pd
+from msprobe.core.common.const import Const, CompareConst
 from msprobe.core.common.utils import CompareException
 from msprobe.core.compare.diff_analyze.first_diff_analyze import FirstDiffAnalyze
+from msprobe.core.compare.config import ModeConfig
 class TestFirstDiffAnalyze(unittest.TestCase):
     def setUp(self):
         self.header = ['NPU name', 'L2norm diff',
-                       'MaxRelativeErr', 'MinRelativeErr', 'MeanRelativeErr', 'NormRelativeErr']
+                       'MaxRelativeErr', 'MinRelativeErr', 'MeanRelativeErr', 'NormRelativeErr',
+                       'state', 'api_origin_name']
         self.data = [
-            ['Functional.conv2d.0.forward.input.0', 1, '0.0%', '0.0%', '0.0%', '0.0%'],
-            ['Functional.conv2d.0.forward.input.1', 1, '99.0%', '99.0%', '99.0%', '99.0%']
+            ['Functional.conv2d.0.forward.input.0', 1, '0.0%', '0.0%', '0.0%', '0.0%', 'input', 'Functional.conv2d.0.forward'],
+            ['Functional.conv2d.0.forward.input.1', 1, '99.0%', '99.0%', '99.0%', '99.0%', 'input', 'Functional.conv2d.0.forward']
         ]
         self.result_df = pd.DataFrame(self.data, columns=self.header)
     @patch('msprobe.core.compare.diff_analyze.first_diff_analyze.thresholds',
            {'compare_metrics': ['MaxRelativeErr', 'NormRelativeErr'], 'MaxRelativeErr': [0.5]})
     def test_single_metric_diff_check_true(self):
-        first_diff_analyze = FirstDiffAnalyze()
+        mode_config = ModeConfig(first_diff_analyze=True)
+        first_diff_analyze = FirstDiffAnalyze(mode_config)
         result = first_diff_analyze.single_metric_diff_check('MaxRelativeErr', '60.0%')
         self.assertTrue(result)
     @patch('msprobe.core.compare.diff_analyze.first_diff_analyze.thresholds',
            {'compare_metrics': ['MaxRelativeErr', 'NormRelativeErr'], 'MaxRelativeErr': [0.5]})
     def test_single_metric_diff_check_false(self):
-        first_diff_analyze = FirstDiffAnalyze()
+        mode_config = ModeConfig(first_diff_analyze=True)
+        first_diff_analyze = FirstDiffAnalyze(mode_config)
         result = first_diff_analyze.single_metric_diff_check('MaxRelativeErr', '30.0%')
         self.assertFalse(result)
     @patch('msprobe.core.compare.diff_analyze.first_diff_analyze.thresholds',
            {'compare_metrics': ['MaxRelativeErr', 'NormRelativeErr'], 'NormRelativeErr': [0.5]})
     def test_single_metric_diff_check_miss_threshold(self):
-        first_diff_analyze = FirstDiffAnalyze()
+        mode_config = ModeConfig(first_diff_analyze=True)
+        first_diff_analyze = FirstDiffAnalyze(mode_config)
         with self.assertRaises(CompareException) as context:
             result = first_diff_analyze.single_metric_diff_check('MaxRelativeErr', '30.0%')
         self.assertEqual(context.exception.code, CompareException.MISSING_THRESHOLD_ERROR)
@@ -42,65 +48,123 @@ class TestFirstDiffAnalyze(unittest.TestCase):
     @patch('msprobe.core.compare.diff_analyze.first_diff_analyze.thresholds',
            {'compare_metrics': ['MaxRelativeErr', 'NormRelativeErr'], 'MaxRelativeErr': [0.5, 1.0]})
     def test_single_metric_diff_check_wrong_threshold(self):
-        first_diff_analyze = FirstDiffAnalyze()
+        mode_config = ModeConfig(first_diff_analyze=True)
+        first_diff_analyze = FirstDiffAnalyze(mode_config)
         with self.assertRaises(CompareException) as context:
             result = first_diff_analyze.single_metric_diff_check('MaxRelativeErr', '30.0%')
         self.assertEqual(context.exception.code, CompareException.WRONG_THRESHOLD_ERROR)
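As these tests show, FirstDiffAnalyze is now constructed from a ModeConfig, and single_metric_diff_check compares one relative-error cell against the threshold configured for that metric. A hedged usage sketch (the real thresholds come from the library's threshold configuration; the tests above patch it to 0.5):

    # Hedged sketch: returns True when the parsed percentage exceeds the
    # configured threshold for the given metric.
    from msprobe.core.compare.config import ModeConfig
    from msprobe.core.compare.diff_analyze.first_diff_analyze import FirstDiffAnalyze

    analyzer = FirstDiffAnalyze(ModeConfig(first_diff_analyze=True))
    print(analyzer.single_metric_diff_check('MaxRelativeErr', '60.0%'))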
     def test_single_api_check_within_threshold(self):
         result_slice = [
-            ['Functional.conv2d.0.forward.input.0', 1, '0.0%', '0.0%', '0.0%', '0.0%'],
-            ['Functional.conv2d.0.forward.input.1', 1, '0.1%', '0.1%', '0.1%', '0.1%']
+            ['Functional.conv2d.0.forward.input.0', 1, '0.0%', '0.0%', '0.0%', '0.0%', 'input', 'Functional.conv2d.0.forward'],
+            ['Functional.conv2d.0.forward.input.1', 1, '0.1%', '0.1%', '0.1%', '0.1%', 'input', 'Functional.conv2d.0.forward']
         ]
         expected_result = {
             'is_same': True,
             'op_items': [
                 {'NPU name': 'Functional.conv2d.0.forward.input.0', 'L2norm diff': 1, 'MaxRelativeErr': '0.0%', 'MinRelativeErr': '0.0%',
-                 'MeanRelativeErr': '0.0%', 'NormRelativeErr': '0.0%'},
+                 'MeanRelativeErr': '0.0%', 'NormRelativeErr': '0.0%',
+                 'state': 'input', 'api_origin_name': 'Functional.conv2d.0.forward'},
                 {'NPU name': 'Functional.conv2d.0.forward.input.1', 'L2norm diff': 1, 'MaxRelativeErr': '0.1%', 'MinRelativeErr': '0.1%',
-                 'MeanRelativeErr': '0.1%', 'NormRelativeErr': '0.1%'}
+                 'MeanRelativeErr': '0.1%', 'NormRelativeErr': '0.1%',
+                 'state': 'input', 'api_origin_name': 'Functional.conv2d.0.forward'}
             ]
         }
-        first_diff_analyze = FirstDiffAnalyze()
+        mode_config = ModeConfig(first_diff_analyze=True)
+        first_diff_analyze = FirstDiffAnalyze(mode_config)
         result = first_diff_analyze.single_api_check(result_slice, self.header)
         self.assertEqual(result, expected_result)
     def test_single_api_check_exceed_threshold(self):
         result_slice = [
-            ['Functional.conv2d.0.forward.input.0', 1, '88.0%', '88.0%', '88.0%', '88.0%'],
-            ['Functional.conv2d.0.forward.input.1', 1, '99.0%', '99.0%', '99.0%', '99.0%']
+            ['Functional.conv2d.0.forward.input.0', 1, '88.0%', '88.0%', '88.0%', '88.0%', 'input', 'Functional.conv2d.0.forward'],
+            ['Functional.conv2d.0.forward.input.1', 1, '99.0%', '99.0%', '99.0%', '99.0%', 'input', 'Functional.conv2d.0.forward']
         ]
         expected_result = {
             'is_same': False,
             'op_items': [
                 {'NPU name': 'Functional.conv2d.0.forward.input.0', 'L2norm diff': 1, 'MaxRelativeErr': '88.0%', 'MinRelativeErr': '88.0%',
-                 'MeanRelativeErr': '88.0%', 'NormRelativeErr': '88.0%'},
+                 'MeanRelativeErr': '88.0%', 'NormRelativeErr': '88.0%',
+                 'state': 'input', 'api_origin_name': 'Functional.conv2d.0.forward'},
                 {'NPU name': 'Functional.conv2d.0.forward.input.1', 'L2norm diff': 1, 'MaxRelativeErr': '99.0%', 'MinRelativeErr': '99.0%',
-                 'MeanRelativeErr': '99.0%', 'NormRelativeErr': '99.0%'},
+                 'MeanRelativeErr': '99.0%', 'NormRelativeErr': '99.0%',
+                 'state': 'input', 'api_origin_name': 'Functional.conv2d.0.forward'},
             ]
         }
-        first_diff_analyze = FirstDiffAnalyze()
+        mode_config = ModeConfig(first_diff_analyze=True)
+        first_diff_analyze = FirstDiffAnalyze(mode_config)
         result = first_diff_analyze.single_api_check(result_slice, self.header)
         self.assertEqual(result, expected_result)
-    def test_check(self):
+    def test_single_api_check_md5_same_true(self):
+        md5_header = CompareConst.MD5_COMPARE_RESULT_HEADER + [CompareConst.STACK, Const.STATE, Const.API_ORIGIN_NAME]
+        result_slice = [
+            ['Functional.conv2d.0.forward.input.0', 'Functional.conv2d.0.forward.input.0', 'torch.int32', 'torch.int32',
+             '[]', '[]', '2144df1c', '2144df1c', 'pass', '', 'input', 'Functional.conv2d.0.forward']
+        ]
+        expected_result = {
+            'is_same': True,
+            'op_items': [
+                {CompareConst.NPU_NAME: 'Functional.conv2d.0.forward.input.0',
+                 CompareConst.BENCH_NAME: 'Functional.conv2d.0.forward.input.0',
+                 CompareConst.NPU_DTYPE: 'torch.int32', CompareConst.BENCH_DTYPE: 'torch.int32',
+                 CompareConst.NPU_SHAPE: '[]', CompareConst.BENCH_SHAPE: '[]',
+                 CompareConst.NPU_MD5: '2144df1c', CompareConst.BENCH_MD5: '2144df1c',
+                 CompareConst.RESULT: 'pass', CompareConst.STACK: '',
+                 Const.STATE: 'input', Const.API_ORIGIN_NAME: 'Functional.conv2d.0.forward'
+                 }
+            ]
+        }
+        mode_config = ModeConfig(dump_mode=Const.MD5, first_diff_analyze=True)
+        first_diff_analyze = FirstDiffAnalyze(mode_config)
+        result = first_diff_analyze.single_api_check(result_slice, md5_header)
+        self.assertEqual(result, expected_result)
+
+    def test_single_api_check_md5_same_false(self):
+        md5_header = CompareConst.MD5_COMPARE_RESULT_HEADER + [CompareConst.STACK, Const.STATE, Const.API_ORIGIN_NAME]
+        result_slice = [
+            ['Functional.conv2d.0.forward.input.0', 'Functional.conv2d.0.forward.input.0', 'torch.int32', 'torch.int32',
+             '[]', '[]', '2144df1c', '2100df1c', 'Different', '', 'input', 'Functional.conv2d.0.forward']
+        ]
+        expected_result = {
+            'is_same': False,
+            'op_items': [
+                {CompareConst.NPU_NAME: 'Functional.conv2d.0.forward.input.0',
+                 CompareConst.BENCH_NAME: 'Functional.conv2d.0.forward.input.0',
+                 CompareConst.NPU_DTYPE: 'torch.int32', CompareConst.BENCH_DTYPE: 'torch.int32',
+                 CompareConst.NPU_SHAPE: '[]', CompareConst.BENCH_SHAPE: '[]',
+                 CompareConst.NPU_MD5: '2144df1c', CompareConst.BENCH_MD5: '2100df1c',
+                 CompareConst.RESULT: 'Different', CompareConst.STACK: '',
+                 Const.STATE: 'input', Const.API_ORIGIN_NAME: 'Functional.conv2d.0.forward'
+                 }
+            ]
+        }
+        mode_config = ModeConfig(dump_mode=Const.MD5, first_diff_analyze=True)
+        first_diff_analyze = FirstDiffAnalyze(mode_config)
+        result = first_diff_analyze.single_api_check(result_slice, md5_header)
+        self.assertEqual(result, expected_result)
+
+    def test_check_summary(self):
         expected_result = {
             'Functional.conv2d.0.forward': {
                 'is_same': False,
                 'op_items': [
                     {'NPU name': 'Functional.conv2d.0.forward.input.0', 'L2norm diff': 1, 'MaxRelativeErr': '0.0%', 'MinRelativeErr': '0.0%',
-                     'MeanRelativeErr': '0.0%', 'NormRelativeErr': '0.0%'},
+                     'MeanRelativeErr': '0.0%', 'NormRelativeErr': '0.0%',
+                     'state': 'input', 'api_origin_name': 'Functional.conv2d.0.forward'},
                     {'NPU name': 'Functional.conv2d.0.forward.input.1', 'L2norm diff': 1, 'MaxRelativeErr': '99.0%', 'MinRelativeErr': '99.0%',
-                     'MeanRelativeErr': '99.0%', 'NormRelativeErr': '99.0%'},
+                     'MeanRelativeErr': '99.0%', 'NormRelativeErr': '99.0%',
+                     'state': 'input', 'api_origin_name': 'Functional.conv2d.0.forward'},
                 ]
             }
         }
-        first_diff_analyze = FirstDiffAnalyze()
+        mode_config = ModeConfig(first_diff_analyze=True)
+        first_diff_analyze = FirstDiffAnalyze(mode_config)
         result = first_diff_analyze.check(self.result_df)
         self.assertEqual(result, expected_result)
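To round off the FirstDiffAnalyze changes: check() consumes a compare-result DataFrame whose rows now carry state and api_origin_name columns and groups the rows per originating API, reporting is_same plus the raw op_items. A hedged sketch built from the same header used in setUp above:

    # Hedged sketch mirroring the setUp data: one row, one API group.
    import pandas as pd
    from msprobe.core.compare.config import ModeConfig
    from msprobe.core.compare.diff_analyze.first_diff_analyze import FirstDiffAnalyze

    header = ['NPU name', 'L2norm diff', 'MaxRelativeErr', 'MinRelativeErr',
              'MeanRelativeErr', 'NormRelativeErr', 'state', 'api_origin_name']
    rows = [['Functional.conv2d.0.forward.input.0', 1, '0.0%', '0.0%', '0.0%', '0.0%',
             'input', 'Functional.conv2d.0.forward']]
    analyzer = FirstDiffAnalyze(ModeConfig(first_diff_analyze=True))
    result = analyzer.check(pd.DataFrame(rows, columns=header))
    # result maps 'Functional.conv2d.0.forward' to {'is_same': ..., 'op_items': [...]}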
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py
index 5d01c3fdcbee48bad403c4749921b2936cdfc83d..5a4ca7de47f401c4eb97ad4c224dfd1ae0a92262 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py
@@ -38,19 +38,19 @@ summary_line_3 = ['Functional_batch_norm_0_forward.output.2', 'Functional_batch_
 line_input = ['Functional.batch.norm.0.forward.input.0', 'Functional.batch.norm.0.forward.input.0', 'torch.float16',
               'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 1, 0.5, 1, 1, 0.95, 1, 1, 1, 1, 1, 1.01, 1, 1, 1,
-              'Yes', '']
+              'Yes', '', 'input', 'Functional.batch.norm.0.forward']
 line_1 = ['Functional.batch.norm.0.forward.output.0', 'Functional.batch.norm.0.forward.output.0', 'torch.float16',
           'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 0.5, 1, 1, 0.59, 1, 'nan', 0, 1, 1, 19, 1, 1, 1,
-          'Yes', '']
+          'Yes', '', 'output', 'Functional.batch.norm.0.forward']
 line_2 = ['Functional.batch.norm.0.forward.output.1', 'Functional.batch.norm.0.forward.output.1', 'torch.float16',
           'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.9, 0.5, 1, 1, 0.8, 1, 0, 0.12, 0, 1, 1, 0.1, 1, 1,
-          'Yes', '']
+          'Yes', '', 'output', 'Functional.batch.norm.0.forward']
 line_3 = ['Functional.batch.norm.0.forward.output.2', 'Functional.batch.norm.0.forward.output.2', 'torch.float16',
           'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 0.5, 1.1e+10, 1, 0.85, 1, 9, 0.12, 0, 1, 1, 0.1, 1, 1,
-          'Yes', '']
+          'Yes', '', 'output', 'Functional.batch.norm.0.forward']
 base_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'test_highlight')
@@ -379,13 +379,6 @@ class TestUtilsMethods(unittest.TestCase):
         add_highlight_row_info(color_list, num, highlight_err_msg)
         self.assertEqual(color_list, [(1, ["a", "b"]), (5, ["c", "highlight"])])
-    def test_add_highlight_row_info_new(self):
-        color_list = [(1, ["a", "b"]), (5, ["c"])]
-        num = 6
-        highlight_err_msg = "highlight"
-        add_highlight_row_info(color_list, num, highlight_err_msg)
-        self.assertEqual(color_list, [(1, ["a", "b"]), (5, ["c"]), (6, ["highlight"])])
-
     def test_update_highlight_err_msg(self):
         data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0',
                  'torch.float32', 'torch.float32', [2, 2], [2, 2],
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
index 77920cdd98dadb03f9a5c52c0af852221f1e39c3..afcdd25744bf030a98629902cce4d4efa44a78b8 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
@@ -103,7 +103,7 @@ class TestCompareRealData(unittest.TestCase):
         # index error
         with self.assertRaises(CompareException) as context:
             result = compare_real_data.read_dump_data(pd.DataFrame())
-        self.assertEqual(context.exception.code, CompareException.INDEX_OUT_OF_BOUNDS_ERROR)
+        self.assertEqual(context.exception.code, CompareException.INVALID_KEY_ERROR)
     def test_save_cmp_result_success(self):
         file_reader = read_real_data
@@ -235,3 +235,16 @@ class TestCompareRealData(unittest.TestCase):
         result = compare_real_data.do_multi_process(input_param, result_df)
         self.assertTrue(result.equals(o_result))
+
+    def test_handle_multi_process(self):
+        file_reader = read_real_data
+        mode_config = ModeConfig(dump_mode=Const.ALL)
+        cross_frame = False
+        compare_real_data = CompareRealData(file_reader, mode_config, cross_frame)
+
+        func = compare_real_data.compare_ops
+        generate_dump_json(base_dir)
+        input_param = {'bench_json_path': os.path.join(base_dir, 'dump.json')}
+        lock = multiprocessing.Manager().RLock()
+        result = compare_real_data._handle_multi_process(func, input_param, result_df, lock)
+        self.assertTrue(result.equals(o_result))
diff --git a/debug/accuracy_tools/msprobe/visualization/builder/msprobe_adapter.py b/debug/accuracy_tools/msprobe/visualization/builder/msprobe_adapter.py
index 5e59ac9b036aa4cf875007a1db33b8e035abfb10..7e99de16b90d2dc03f389848c2fe327f5af2d036 100644
--- a/debug/accuracy_tools/msprobe/visualization/builder/msprobe_adapter.py
+++ b/debug/accuracy_tools/msprobe/visualization/builder/msprobe_adapter.py
@@ -54,7 +54,13 @@ def run_real_data(dump_path_param, csv_path, framework, is_cross_frame=False):
         framework: 框架类型, pytorch或mindspore
         is_cross_frame: 是否进行跨框架比对,仅支持mindspore比pytorch, 其中pytorch为标杆
     """
-    mode_config = ModeConfig(stack_mode=False, auto_analyze=True, fuzzy_match=False, dump_mode=Const.ALL)
+    config_dict = {
+        'stack_mode': False,
+        'auto_analyze': True,
+        'fuzzy_match': False,
+        'dump_mode': Const.ALL
+    }
+    mode_config = ModeConfig(**config_dict)
     if framework == Const.PT_FRAMEWORK:
         from msprobe.pytorch.compare.pt_compare import read_real_data
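The run_real_data change above only swaps keyword arguments for a dict that is splatted into ModeConfig; the behaviour is unchanged. The same construction in isolation, as a hedged sketch:

    # Equivalent construction to the hunk above: collect the compare options
    # in a dict and expand it into ModeConfig with **.
    from msprobe.core.common.const import Const
    from msprobe.core.compare.config import ModeConfig

    config_dict = {
        'stack_mode': False,
        'auto_analyze': True,
        'fuzzy_match': False,
        'dump_mode': Const.ALL,
    }
    mode_config = ModeConfig(**config_dict)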
diff --git a/debug/accuracy_tools/msprobe/visualization/compare/multi_mapping.py b/debug/accuracy_tools/msprobe/visualization/compare/multi_mapping.py
index bcc7c0f31351a52e40acfd6824c6b2f8f49ffd52..b756c870c542a64722344aec5c50d990056115b0 100644
--- a/debug/accuracy_tools/msprobe/visualization/compare/multi_mapping.py
+++ b/debug/accuracy_tools/msprobe/visualization/compare/multi_mapping.py
@@ -13,13 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from dataclasses import dataclass
 from msprobe.core.common.const import Const
 from msprobe.core.common.log import logger
 from msprobe.visualization.utils import GraphConst
 from msprobe.visualization.graph.graph import NodeOp, BaseNode
-from msprobe.core.compare.utils import get_name_and_state
+from msprobe.core.common.utils import CompareException
 @dataclass
@@ -171,3 +172,37 @@ class MultiMapping:
                 split_list = x.split(Const.COMMA)
                 return split_list[0].strip(), split_list[-1].strip()
         return (x.strip(),)
+
+
+def get_name_and_state(name):
+    """
+    Get api/module name and state
+    example:
+    name = 'conv2d.forward.1.input.0'
+    return: ('conv2d.forward.1.', 'input')
+
+    name = 'Functional.pad.0.backward.output.0'
+    return: ('Functional.pad.0.backward.', 'output')
+
+    state type: input, output, kwargs, parameters, parameters_grad
+    """
+    if not isinstance(name, str):
+        logger.error(f'Invalid name: {name}, type should be string, please check.')
+        raise CompareException(CompareException.INVALID_API_NAME_ERROR)
+
+    if Const.PARAMS_GRAD in name.split(Const.SEP):
+        return name.split(Const.PARAMS_GRAD)[0], Const.PARAMS_GRAD
+
+    split = re.split(Const.REGEX_FORWARD_BACKWARD, name)
+    if len(split) < 3:
+        logger.error(f'Invalid name string: {name}, can not be split by forward/backward, please check.')
+        raise CompareException(CompareException.INVALID_API_NAME_ERROR)
+    api = f'{split[0]}.{split[1]}.'
+    state_str = split[2]
+    match = re.match(r'^(\d+\.)?(input|output|kwargs|parameters)\..+$', state_str)
+    if not match:
+        raise CompareException(f'Invalid name string: {name}')
+    if match.group(1):
+        api = f'{api}{match.group(1)}'
+    state = match.group(2)
+    return api, state
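Finally, a usage sketch for the relocated get_name_and_state helper; the expected values follow its docstring and its parameters_grad early-return path, assuming msprobe is importable:

    # Usage sketch; expected outputs are taken from the docstring above.
    from msprobe.visualization.compare.multi_mapping import get_name_and_state

    print(get_name_and_state('conv2d.forward.1.input.0'))            # ('conv2d.forward.1.', 'input')
    print(get_name_and_state('Functional.pad.0.backward.output.0'))  # ('Functional.pad.0.backward.', 'output')
    print(get_name_and_state('op.parameters_grad.0'))                # ('op.', 'parameters_grad')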