From 910b6e8fa2d47855ce6daa6a1dcb41f2be35a254 Mon Sep 17 00:00:00 2001 From: i-robot Date: Tue, 29 Jul 2025 08:15:20 +0000 Subject: [PATCH 1/4] compare pre-research sync --- .../msprobe/core/common/const.py | 15 +- .../msprobe/core/common/utils.py | 65 ++- .../msprobe/core/compare/acc_compare.py | 90 ++-- .../msprobe/core/compare/compare_cli.py | 14 +- .../msprobe/core/compare/config.py | 4 +- .../msprobe/core/compare/highlight.py | 63 ++- .../compare/layer_mapping/layer_mapping.py | 11 +- .../core/compare/multiprocessing_compute.py | 43 +- .../msprobe/core/compare/utils.py | 234 +++++------ .../docs/10.accuracy_compare_PyTorch.md | 386 ++++++++++-------- .../msprobe/mindspore/common/utils.py | 3 + .../mindspore/compare/common_dir_compare.py | 51 +-- .../mindspore/compare/distributed_compare.py | 34 +- .../msprobe/mindspore/compare/ms_compare.py | 3 +- .../mindspore/compare/ms_graph_compare.py | 9 +- .../msprobe/mindspore/compare/utils.py | 10 +- .../pytorch/compare/distributed_compare.py | 38 +- .../msprobe/pytorch/compare/pt_compare.py | 9 +- .../msprobe/test/core_ut/common/test_utils.py | 33 +- .../test/core_ut/compare/test_acc_compare.py | 65 +-- .../core_ut/compare/test_acc_compare_utils.py | 242 ++++------- .../core_ut/compare/test_cmp_highlight.py | 92 ++++- .../test_cmp_multiprocessing_compute.py | 23 +- 23 files changed, 836 insertions(+), 701 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index e8b5814e11..964c90b7cf 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -149,6 +149,10 @@ class Const: MODULE_PREFIX = ["Module", "Cell"] FORWARD_NAME_SUFFIX = ".forward" + DUMP_JSON_FILE = "dump_json_file" + DEBUG_JSON_FILE = "debug_json_file" + STACK_JSON_FILE = "stack_json_file" + # struct json param ORIGIN_DATA = "origin_data" SCOPE = "scope" @@ -236,6 +240,8 @@ class Const: MEAN = 'Mean' NORM = 'Norm' DATA_NAME = 'data_name' + STATE = 'state' + API_ORIGIN_NAME = 'api_origin_name' TENSOR_STAT_INDEX = 'tensor_stat_index' SUMMARY_METRICS_LIST = [MAX, MIN, MEAN, NORM] @@ -391,6 +397,7 @@ class CompareConst: Class for compare module const """ SPACE = " " + NAME = "Name" # compare result column name NPU_NAME = "NPU Name" BENCH_NAME = "Bench Name" @@ -434,6 +441,7 @@ class CompareConst: OUTPUT_STRUCT = "output_struct" PARAMS_STRUCT = "params_struct" PARAMS_GRAD_STRUCT = "params_grad_struct" + DEBUG_STRUCT = "debug_struct" SUMMARY = "summary" COMPARE_RESULT = "compare_result" COMPARE_MESSAGE = "compare_message" @@ -540,7 +548,8 @@ class CompareConst: Const.KWARGS: INPUT_STRUCT, Const.OUTPUT: OUTPUT_STRUCT, Const.PARAMS: PARAMS_STRUCT, - Const.PARAMS_GRAD: PARAMS_GRAD_STRUCT + Const.PARAMS_GRAD: PARAMS_GRAD_STRUCT, + Const.DEBUG: DEBUG_STRUCT } # compare standard @@ -643,9 +652,9 @@ class CompareConst: OP_NAME_X = 'op_name_x' MATCH_RESULT_COLUMNS = [ - OP_NAME_X, 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'data_name_x', + OP_NAME_X, 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'state_x', 'api_origin_name_x', 'data_name_x', CMP_KEY, CMP_SHAPE, - 'op_name_y', 'dtype_y', 'shape_y', 'summary_y', 'stack_info_y', 'data_name_y', + 'op_name_y', 'dtype_y', 'shape_y', 'summary_y', 'stack_info_y', 'state_y', 'api_origin_name_y', 'data_name_y' ] INTERNAL_API_MAPPING_FILE = 'ms_to_pt_api.yaml' diff --git a/debug/accuracy_tools/msprobe/core/common/utils.py b/debug/accuracy_tools/msprobe/core/common/utils.py index 0ed64c34c4..04e69dccbb 100644 
--- a/debug/accuracy_tools/msprobe/core/common/utils.py +++ b/debug/accuracy_tools/msprobe/core/common/utils.py @@ -27,10 +27,15 @@ from msprobe.core.common.file_utils import (FileOpen, check_file_or_directory_pa from msprobe.core.common.const import Const, CompareConst from msprobe.core.common.log import logger from msprobe.core.common.exceptions import MsprobeException +from msprobe.core.common.decorator import recursion_depth_decorator device = collections.namedtuple('device', ['type', 'index']) prefixes = ['api_stack', 'list', 'range', 'acl'] +file_suffix_to_file_type = { + "dump.json": Const.DUMP_JSON_FILE, + "debug.json": Const.DEBUG_JSON_FILE, +} class MsprobeBaseException(Exception): @@ -219,17 +224,33 @@ def format_value(value): return float('{:.12f}'.format(value)) -def md5_find(data): - for key_op in data: - for api_info in data[key_op]: - if isinstance(data[key_op][api_info], list): - for data_detail in data[key_op][api_info]: - if data_detail and 'md5' in data_detail: - return True - if isinstance(data[key_op][api_info], bool): - continue - elif data[key_op][api_info] and 'md5' in data[key_op][api_info]: +@recursion_depth_decorator('msprobe.core.common.utils.md5_find', max_depth=Const.DUMP_MAX_DEPTH) +def md5_find(data, json_type=Const.DUMP_JSON_FILE): + if json_type == Const.DUMP_JSON_FILE: + for key_op in data: + for api_info in data[key_op]: + if isinstance(data[key_op][api_info], list): + for data_detail in data[key_op][api_info]: + if data_detail and Const.MD5 in data_detail: + return True + if isinstance(data[key_op][api_info], bool): + continue + elif data[key_op][api_info] and Const.MD5 in data[key_op][api_info]: + return True + elif json_type == Const.DEBUG_JSON_FILE: + if isinstance(data, dict): + if Const.MD5 in data: return True + else: + for _, data_info in data.items(): + if md5_find(data_info, Const.DEBUG_JSON_FILE): + return True + elif isinstance(data, list): + for data_info in data: + if md5_find(data_info, Const.DEBUG_JSON_FILE): + return True + else: + return False return False @@ -267,15 +288,28 @@ def get_stack_construct_by_dump_json_path(dump_json_path): def set_dump_path(input_param): npu_path = input_param.get("npu_json_path", None) bench_path = input_param.get("bench_json_path", None) - npu_path_valid = npu_path is not None and npu_path.endswith("dump.json") - bench_path_valid = bench_path is not None and bench_path.endswith("dump.json") - if not npu_path_valid or not bench_path_valid: + dump_json_path_valid = npu_path is not None and npu_path.endswith("dump.json") and \ + bench_path is not None and bench_path.endswith("dump.json") + debug_json_path_valid = npu_path is not None and npu_path.endswith("debug.json") and \ + bench_path is not None and bench_path.endswith("debug.json") + if not dump_json_path_valid and not debug_json_path_valid: logger.error(f"Please check the json path is valid and ensure that neither npu_path nor bench_path is None.") raise CompareException(CompareException.INVALID_PATH_ERROR) input_param[CompareConst.NPU_DUMP_DATA_DIR] = os.path.join(os.path.dirname(npu_path), Const.DUMP_TENSOR_DATA) input_param[CompareConst.BENCH_DUMP_DATA_DIR] = os.path.join(os.path.dirname(bench_path), Const.DUMP_TENSOR_DATA) +def get_file_type(file_path): + if not isinstance(file_path, str): + logger.error("get_file_type failed, check the type of file_path.") + raise CompareException(CompareException.INVALID_PATH_ERROR) + file_type = file_suffix_to_file_type.get(file_path.split(Const.SCOPE_SEPARATOR)[-1]) + if file_type is None: + 
logger.error("get_file_type failed, file_path is neither dump.json nor debug.json.") + raise CompareException(CompareException.INVALID_PATH_ERROR) + return file_type + + def check_dump_json_key(json_data, device_type): task = json_data.get('task', None) if not task: @@ -296,6 +330,7 @@ def get_dump_mode(input_param): bench_path = input_param.get("bench_json_path", None) npu_json_data = load_json(npu_path) bench_json_data = load_json(bench_path) + json_type = get_file_type(file_path=npu_path) npu_task, npu_api_data = check_dump_json_key(npu_json_data, 'npu') bench_task, bench_api_data = check_dump_json_key(bench_json_data, 'bench') @@ -311,8 +346,8 @@ def get_dump_mode(input_param): return Const.STRUCTURE if npu_task == Const.STATISTICS: - npu_md5_compare = md5_find(npu_api_data) - bench_md5_compare = md5_find(bench_api_data) + npu_md5_compare = md5_find(npu_api_data, json_type) + bench_md5_compare = md5_find(bench_api_data, json_type) if npu_md5_compare == bench_md5_compare: return Const.MD5 if npu_md5_compare else Const.SUMMARY else: diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 59c4b42ee2..7b34b73d2e 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -25,17 +25,16 @@ from tqdm import tqdm from msprobe.core.advisor.advisor import Advisor from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import load_json, remove_path, create_directory, save_json +from msprobe.core.common.file_utils import load_json, remove_path, create_directory from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException, add_time_with_xlsx, check_op_str_pattern_valid, \ - set_dump_path, get_dump_mode, check_compare_param, check_configuration_param, load_stack_json, add_time_with_json + set_dump_path, get_dump_mode, check_compare_param, check_configuration_param, load_stack_json, get_file_type from msprobe.core.compare.check import check_dump_json_str, check_stack_json_str, cross_dtype_mapping from msprobe.core.compare.utils import merge_tensor, print_compare_ends_info, read_op, \ reorder_op_x_list, set_stack_json_path, check_api_info_len from msprobe.core.compare.config import ModeConfig, MappingConfig, MappingDict from msprobe.core.compare.multiprocessing_compute import CompareRealData from msprobe.core.compare.highlight import HighLight -from msprobe.core.compare.diff_analyze.first_diff_analyze import FirstDiffAnalyze @dataclass @@ -49,7 +48,7 @@ class ComparisonConfig: cell_mapping: dict api_mapping: dict layer_mapping: dict - first_diff_analyze: bool + compared_file_type: str class Comparator: @@ -61,11 +60,14 @@ class Comparator: self.mapping_dict = MappingDict(mapping_config) - def process_output_file(self, output_path, suffix): - if self.mode_config.first_diff_analyze: - file_name = add_time_with_json("compare_result" + suffix) - else: - file_name = add_time_with_xlsx("compare_result" + suffix) + @staticmethod + def process_output_file(output_path, suffix, compared_file_type): + file_name_prefix_mapping = { + Const.DUMP_JSON_FILE: "compare_result", + Const.DEBUG_JSON_FILE: "debug_compare_result" + } + file_name_prefix = file_name_prefix_mapping.get(compared_file_type, "compare_result") + file_name = add_time_with_xlsx(file_name_prefix + suffix) file_path = os.path.join(os.path.realpath(output_path), 
file_name) if os.path.exists(file_path): logger.warning(f"{file_path} will be deleted.") @@ -95,7 +97,7 @@ class Comparator: suffix = kwargs.get('suffix', '') # process output file - file_path = self.process_output_file(output_path, suffix) + file_path = self.process_output_file(output_path, suffix, self.mode_config.compared_file_type) # initialize the compare result table and compare general data(name, dtype, shape, statistics/md5, etc.) npu_json = input_param.get("npu_json_path") @@ -106,13 +108,6 @@ class Comparator: logger.warning("Can`t match any op. No compare result file generated.") return - if self.mode_config.first_diff_analyze: - first_diff_analyze = FirstDiffAnalyze() - check_result = first_diff_analyze.check(result_df) - save_json(file_path, check_result, indent=4) - logger.info(f"Saving json file to disk: {file_path}") - return - # compare real data if self.mode_config.dump_mode == Const.ALL: compare_real_data = CompareRealData(self.file_reader, self.mode_config, self.cross_frame) @@ -121,7 +116,9 @@ class Comparator: # highlight suspicious API highlight_dict = {"red_rows": set(), "yellow_rows": set(), "red_lines": [], "yellow_lines": []} highlight = HighLight(self.mode_config) - highlight.find_compare_result_error_rows(result_df, highlight_dict) + if self.mode_config.compared_file_type == Const.DUMP_JSON_FILE: + highlight.find_compare_result_error_rows(result_df, highlight_dict) + result_df.drop(columns=['state', 'api_origin_name'], inplace=True) # 删除中间数据,两列不落盘 highlight.highlight_rows_xlsx(result_df, highlight_dict, file_path) # output compare analysis suggestions @@ -183,10 +180,12 @@ class ParseData: Const.DTYPE: [], Const.SHAPE: [], Const.SUMMARY: [], - Const.STACK_INFO: [] + Const.STACK_INFO: [], + Const.STATE: [], + Const.API_ORIGIN_NAME: [] } if self.mode_config.dump_mode == Const.ALL: - result['data_name'] = [] + result[Const.DATA_NAME] = [] elif self.mode_config.dump_mode == Const.MD5: result[Const.MD5] = [] @@ -207,20 +206,22 @@ class ParseData: op_name_list = merge_list.get(CompareConst.OP_NAME) summary_list = merge_list.get(Const.SUMMARY) - data_name_list = merge_list.get('data_name') - op_name_reorder, summary_reorder, data_name_reorder = reorder_op_x_list(op_name_list, - summary_list, - data_name_list) + data_name_list = merge_list.get(Const.DATA_NAME) + state_list = merge_list.get(Const.STATE) + op_name_reorder, summary_reorder, data_name_reorder, state_reorder = reorder_op_x_list(op_name_list, + summary_list, + data_name_list, + state_list) # 遍历单个API的所有item - for index, op_name in enumerate(op_name_reorder): + for index, (op_name, state) in enumerate(zip(op_name_reorder, state_reorder)): result[CompareConst.OP_NAME].append(op_name) - if (CompareConst.INPUT_PATTERN in op_name) or (CompareConst.KWARGS_PATTERN in op_name): + if state == Const.INPUT or state == Const.KWARGS: info_list = merge_list[CompareConst.INPUT_STRUCT] - elif CompareConst.OUTPUT_PATTERN in op_name: + elif state == Const.OUTPUT: info_list = merge_list[CompareConst.OUTPUT_STRUCT] - elif CompareConst.PARAMS_PATTERN in op_name: + elif state == Const.PARAMS: info_list = merge_list[CompareConst.PARAMS_STRUCT] - elif CompareConst.PARAMS_GRAD_PATTERN in op_name: + elif state == Const.PARAMS_GRAD: info_list = merge_list[CompareConst.PARAMS_GRAD_STRUCT] else: info_list = merge_list[CompareConst.DEBUG_STRUCT] @@ -245,14 +246,18 @@ class ParseData: if self.mode_config.dump_mode == Const.ALL: check_api_info_len(op_name, data_name_reorder, 1) - result['data_name'].append(data_name_reorder.pop(0)) + 
result[Const.DATA_NAME].append(data_name_reorder.pop(0)) + + result[Const.STATE].append(state) + result[Const.API_ORIGIN_NAME].append(data_name) progress_bar.update(1) progress_bar.close() return pd.DataFrame(result) def gen_merge_list(self, json_data, op_name, stack_json_data): op_data = json_data['data'][op_name] - check_dump_json_str(op_data, op_name) + if self.mode_config.compared_file_type == Const.DUMP_JSON_FILE: + check_dump_json_str(op_data, op_name) op_parsed_list = read_op(op_data, op_name) if self.mode_config.stack_mode: @@ -416,8 +421,8 @@ class Match: @staticmethod def put_unmatched_in_table(match_result, npu_op_item): npu_columns = npu_op_item.index.tolist()[:-2] - new_columns = [name[:-1] + 'y' for name in npu_columns] - na_series = pd.Series([CompareConst.N_A] * len(new_columns), index=new_columns) + bench_columns = [name + '_y' for name in npu_columns] + na_series = pd.Series([CompareConst.N_A] * len(bench_columns), index=bench_columns) new_result_item = pd.concat([npu_op_item, na_series]).to_frame().T new_result_item.columns = CompareConst.MATCH_RESULT_COLUMNS match_result = pd.concat([match_result, new_result_item]) @@ -477,7 +482,6 @@ class Match: categories=op_name_order, ordered=True) match_result = match_result.sort_values(CompareConst.OP_NAME_X).reset_index(drop=True) match_result[CompareConst.OP_NAME_X] = match_result[CompareConst.OP_NAME_X].astype('object') - elif not self.mode_config.fuzzy_match: match_result = pd.merge(npu_df, bench_df, on=[CompareConst.CMP_KEY, CompareConst.CMP_SHAPE], how='outer') @@ -614,7 +618,9 @@ class CreateTable: 'md5_x': CompareConst.NPU_MD5, 'md5_y': CompareConst.BENCH_MD5, 'data_name_x': CompareConst.DATA_NAME, - 'stack_info_x': CompareConst.STACK}, inplace=True) + 'stack_info_x': CompareConst.STACK, + 'state_x': Const.STATE, + 'api_origin_name_x': Const.API_ORIGIN_NAME}, inplace=True) # process summary data npu_summary = [CompareConst.NPU_MAX, CompareConst.NPU_MIN, CompareConst.NPU_MEAN, CompareConst.NPU_NORM] @@ -627,6 +633,7 @@ class CreateTable: result[npu_summary] = result['summary_x'].apply(self.set_summary).tolist() result[bench_summary] = result['summary_y'].apply(self.set_summary).tolist() + header.extend([Const.STATE, Const.API_ORIGIN_NAME]) result_df = pd.DataFrame(columns=header) for h in header: if h in result.columns: @@ -690,7 +697,11 @@ class CalcStatsDiff: result_df[condition_no_bench] = result_df[condition_no_bench].fillna(CompareConst.N_A) result_df.loc[condition_no_bench, CompareConst.ERROR_MESSAGE] = CompareConst.NO_BENCH - if self.mode_config.first_diff_analyze or self.mode_config.dump_mode == Const.SUMMARY: + if self.mode_config.dump_mode == Const.MD5: + condition_md5_equal = result_df[CompareConst.NPU_MD5] == result_df[CompareConst.BENCH_MD5] + result_df.loc[condition_md5_equal, CompareConst.RESULT] = CompareConst.PASS + result_df.loc[~condition_md5_equal & ~condition_no_bench, CompareConst.RESULT] = CompareConst.DIFF + elif self.mode_config.dump_mode == Const.SUMMARY: warning_list = [ self.calc_summary_diff(result_df, condition_no_bench, stats_index) for stats_index in ['max', 'min', 'mean', 'l2norm'] @@ -699,10 +710,6 @@ class CalcStatsDiff: result_df.loc[~condition_no_bench, [CompareConst.RESULT, CompareConst.ERROR_MESSAGE]] = '' result_df.loc[warning_flag, CompareConst.RESULT] = CompareConst.WARNING result_df.loc[warning_flag, CompareConst.ERROR_MESSAGE] = 'Need double check api accuracy.' 
- elif self.mode_config.dump_mode == Const.MD5: - condition_md5_equal = result_df[CompareConst.NPU_MD5] == result_df[CompareConst.BENCH_MD5] - result_df.loc[condition_md5_equal, CompareConst.RESULT] = CompareConst.PASS - result_df.loc[~condition_md5_equal & ~condition_no_bench, CompareConst.RESULT] = CompareConst.DIFF else: fill_cols = [CompareConst.COSINE, CompareConst.EUC_DIST, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, @@ -727,11 +734,12 @@ def setup_comparison(input_param, output_path, **kwargs) -> ComparisonConfig: cell_mapping=kwargs.get('cell_mapping', {}), api_mapping=kwargs.get('api_mapping', {}), layer_mapping=kwargs.get('layer_mapping', {}), - first_diff_analyze=kwargs.get('first_diff_analyze', False) + compared_file_type='', ) set_dump_path(input_param) config.dump_mode = get_dump_mode(input_param) + config.compared_file_type = get_file_type(input_param.get("npu_json_path", None)) # set stack_mode and set "stack_json_path" in input_param if 'stack_json_path' in input_param: diff --git a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py index 7df7315043..91892a1aa8 100644 --- a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py +++ b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py @@ -14,7 +14,7 @@ # limitations under the License. import json -from msprobe.core.common.file_utils import check_file_type, load_json +from msprobe.core.common.file_utils import check_file_type, load_json, check_file_or_directory_path from msprobe.core.common.const import FileCheckConst, Const from msprobe.core.common.utils import CompareException from msprobe.core.common.log import logger @@ -22,6 +22,9 @@ from msprobe.core.common.log import logger def compare_cli(args): input_param = load_json(args.input_path) + if not isinstance(input_param, dict): + logger.error("input_param should be dict, please check!") + raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) npu_path = input_param.get("npu_path", None) bench_path = input_param.get("bench_path", None) if not npu_path: @@ -38,6 +41,7 @@ def compare_cli(args): else: from msprobe.mindspore.compare.ms_compare import ms_compare from msprobe.mindspore.compare.distributed_compare import ms_compare_distributed, ms_graph_compare + from msprobe.mindspore.compare.common_dir_compare import common_dir_compare common_kwargs = { "auto_analyze": auto_analyze, @@ -46,6 +50,8 @@ def compare_cli(args): } if check_file_type(npu_path) == FileCheckConst.FILE and check_file_type(bench_path) == FileCheckConst.FILE: + check_file_or_directory_path(npu_path) + check_file_or_directory_path(bench_path) input_param["npu_json_path"] = input_param.pop("npu_path") input_param["bench_json_path"] = input_param.pop("bench_path") if "stack_path" not in input_param: @@ -67,6 +73,8 @@ def compare_cli(args): } ms_compare(input_param, args.output_path, **kwargs) elif check_file_type(npu_path) == FileCheckConst.DIR and check_file_type(bench_path) == FileCheckConst.DIR: + check_file_or_directory_path(npu_path, isdir=True) + check_file_or_directory_path(bench_path, isdir=True) kwargs = { **common_kwargs, "stack_mode": args.stack_mode, @@ -78,6 +86,10 @@ def compare_cli(args): if input_param.get("rank_id") is not None: ms_graph_compare(input_param, args.output_path) return + common = input_param.get("common", False) + if isinstance(common, bool) and common: + common_dir_compare(input_param, args.output_path) + return if frame_name == Const.PT_FRAMEWORK: compare_distributed(npu_path, 
bench_path, args.output_path, **kwargs) else: diff --git a/debug/accuracy_tools/msprobe/core/compare/config.py b/debug/accuracy_tools/msprobe/core/compare/config.py index 53fe857453..18ef50c26d 100644 --- a/debug/accuracy_tools/msprobe/core/compare/config.py +++ b/debug/accuracy_tools/msprobe/core/compare/config.py @@ -21,12 +21,12 @@ from msprobe.core.common.file_utils import load_yaml class ModeConfig: def __init__(self, stack_mode=False, auto_analyze=True, fuzzy_match=False, dump_mode=Const.SUMMARY, - first_diff_analyze=False): + compared_file_type=Const.DUMP_JSON_FILE): self.stack_mode = stack_mode self.auto_analyze = auto_analyze self.fuzzy_match = fuzzy_match self.dump_mode = dump_mode - self.first_diff_analyze = first_diff_analyze + self.compared_file_type = compared_file_type class MappingConfig: diff --git a/debug/accuracy_tools/msprobe/core/compare/highlight.py b/debug/accuracy_tools/msprobe/core/compare/highlight.py index 560ebcdc7a..37eec42169 100644 --- a/debug/accuracy_tools/msprobe/core/compare/highlight.py +++ b/debug/accuracy_tools/msprobe/core/compare/highlight.py @@ -29,7 +29,7 @@ from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.file_utils import save_workbook from msprobe.core.common.log import logger from msprobe.core.common.utils import get_header_index, safe_get_value -from msprobe.core.compare.utils import table_value_is_valid, get_name_and_state, CompareException, api_batches_update +from msprobe.core.compare.utils import table_value_is_valid, CompareException from msprobe.core.compare.config import ModeConfig @@ -160,10 +160,65 @@ class HighlightRules: } +class ApiBatch: + def __init__(self, api_name: str, start: int): + self.api_name = api_name + self.start = start + self.input_len = 1 # input的数量 + self.params_end_index = start + 1 # params的结束index + self.output_end_index = start + 1 # output的结束index + self.params_grad_end_index = start + 1 # params_grad的结束index + # 内部state的标志("input", "output", "parameters", "parameters_grad"), + # 用于控制计算input_len, output_end_index, params_end_index, self.params_grad_end_index + self._state = Const.INPUT # api_batch初始化为input + + def set_state(self, state: str): + """设置当前状态""" + if state in {Const.INPUT, Const.OUTPUT, Const.KWARGS, Const.PARAMS, Const.PARAMS_GRAD}: + self._state = state + else: + raise ValueError(f"Invalid state: {state}") + + def increment(self, state: str): + self.set_state(state) + if self._state == Const.INPUT or self._state == Const.KWARGS: + self.input_len += 1 + self.params_end_index += 1 + self.output_end_index += 1 + if self._state == Const.PARAMS: + self.params_end_index += 1 + self.output_end_index += 1 + if self._state == Const.OUTPUT: + self.output_end_index += 1 + self.params_grad_end_index += 1 + + class HighLight: def __init__(self, mode_config: ModeConfig): self.mode_config = mode_config + @staticmethod + def api_batches_update(api_batches, api_name, state, index): + """ + 当一个api的所有item更新完后,input, output的索引范围: + input: [start: start+input_len] + output: [start+input_len: output_end_index] + params: [output_end_index: params_end_index] + """ + if not api_batches: + api_batches.append(ApiBatch(api_name, index)) + else: + api_batch = api_batches[-1] + if api_batch.api_name == api_name or ( + not re.search(Const.REGEX_FORWARD_BACKWARD, api_name) and api_name in api_batch.api_name): + try: + api_batch.increment(state) + except ValueError as e: + logger.error(f"api_batch: {api_batch} with invalid state, please check! 
{e}") + raise CompareException(CompareException.INVALID_STATE_ERROR) from e + else: + api_batches.append(ApiBatch(api_name, index)) + @staticmethod def check_indices_numeric(api_items, indices: list): """检查指定索引处的值是否都为数字类型(int 或 float)""" @@ -220,9 +275,9 @@ class HighLight: result = result_df.values api_batches = [] for i, res_i in enumerate(result): - api_full_name = safe_get_value(res_i, 0, "res_i") - api_name, state = get_name_and_state(api_full_name) - api_batches_update(api_batches, api_name, state, i) + api_name = safe_get_value(res_i, -1, "res_i") # 内部定义倒数第一个元素必是api_origin_name + state = safe_get_value(res_i, -2, "res_i") # 内部定义倒数第二个元素必是state + self.api_batches_update(api_batches, api_name, state, i) with tqdm(total=len(api_batches), desc="API/Module Analyse Progress", unit="item", ncols=100) as progress_bar: for api_batch in api_batches: self.find_error_rows(result[api_batch.start: api_batch.params_grad_end_index], api_batch, diff --git a/debug/accuracy_tools/msprobe/core/compare/layer_mapping/layer_mapping.py b/debug/accuracy_tools/msprobe/core/compare/layer_mapping/layer_mapping.py index 4845adb048..91927f963a 100644 --- a/debug/accuracy_tools/msprobe/core/compare/layer_mapping/layer_mapping.py +++ b/debug/accuracy_tools/msprobe/core/compare/layer_mapping/layer_mapping.py @@ -208,7 +208,8 @@ def generate_data_mapping(npu_json_path, bench_json_path, api_mapping, output_pa def read_full_op_names(data, op_name): op_parsed_list = read_op(data.get(op_name, {}), op_name) full_op_names = [op_parsed.get('full_op_name') for op_parsed in op_parsed_list] - return full_op_names + states = [op_parsed.get(Const.STATE) for op_parsed in op_parsed_list] + return full_op_names, states def generate_op_data_mapping(npu_op_name, npu_full_op_names, bench_op_name, bench_full_op_names): suffix_to_full_op_name = {} @@ -228,10 +229,10 @@ def generate_data_mapping(npu_json_path, bench_json_path, api_mapping, output_pa for npu_op_name, bench_op_name in api_mapping.items(): if not npu_op_name: continue - npu_full_op_names = read_full_op_names(npu_data, npu_op_name) - bench_full_op_names = read_full_op_names(bench_data, bench_op_name) - npu_full_op_names_reorder = reorder_op_name_list(npu_full_op_names) - bench_full_op_names_reorder = reorder_op_name_list(bench_full_op_names) + npu_full_op_names, npu_states = read_full_op_names(npu_data, npu_op_name) + bench_full_op_names, bench_states = read_full_op_names(bench_data, bench_op_name) + npu_full_op_names_reorder, _ = reorder_op_name_list(npu_full_op_names, npu_states) + bench_full_op_names_reorder, _ = reorder_op_name_list(bench_full_op_names, bench_states) mapping = generate_op_data_mapping(npu_op_name, npu_full_op_names_reorder, bench_op_name, bench_full_op_names_reorder) data_mapping.update(mapping) diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py index 8bba8c9aa0..cb0e13e383 100644 --- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py @@ -94,8 +94,8 @@ class CompareRealData: @staticmethod def read_dump_data(result_df): try: - npu_dump_name_list = result_df.iloc[0:, 0].tolist() - dump_tensor_pair_list = result_df.iloc[0:, -1].tolist() + npu_dump_name_list = result_df.loc[0:, CompareConst.NPU_NAME].tolist() + dump_tensor_pair_list = result_df.loc[0:, CompareConst.DATA_NAME].tolist() op_name_mapping_dict = {} for index, npu_dump_name in 
enumerate(npu_dump_name_list): dump_tensor_pair = dump_tensor_pair_list[index] @@ -104,9 +104,9 @@ class CompareRealData: except ValueError as e: logger.error('result dataframe is not found.') raise CompareException(CompareException.INVALID_DATA_ERROR) from e - except IndexError as e: + except KeyError as e: logger.error('result dataframe elements can not be access.') - raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e + raise CompareException(CompareException.INVALID_KEY_ERROR) from e @staticmethod def _save_cmp_result(offset, result: ComparisonResult, result_df, lock): @@ -157,23 +157,20 @@ class CompareRealData: 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、欧式距离 最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息 """ - relative_err, error_flag, err_msg = None, False, None + error_file, relative_err, error_flag = None, None, False data_name_pair = op_name_mapping_dict.get(npu_op_name) npu_data_name = data_name_pair[0] bench_data_name = data_name_pair[1] - error_file = data_name_pair - if str(npu_data_name) == CompareConst.NO_REAL_DATA_FLAG: # 没有npu真实数据 - n_value, b_value, error_flag = CompareConst.NO_REAL_DATA, CompareConst.NO_REAL_DATA, True - err_msg = "NPU does not have data file." + n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True elif str(bench_data_name) == CompareConst.NO_REAL_DATA_FLAG: # 没有bench真实数据 - n_value, b_value, error_flag = CompareConst.NO_REAL_DATA, CompareConst.NO_REAL_DATA, True - err_msg = "Bench does not have data file." + n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True + error_file = 'no_bench_data' elif str(bench_data_name) == CompareConst.N_A: # bench没匹配 - n_value, b_value, error_flag = CompareConst.API_UNMATCH, CompareConst.API_UNMATCH, True - err_msg = "Bench api/module unmatched." 
+ n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True + error_file = None else: npu_dir = input_param.get(CompareConst.NPU_DUMP_DATA_DIR) bench_dir = input_param.get(CompareConst.BENCH_DUMP_DATA_DIR) @@ -190,9 +187,8 @@ class CompareRealData: error_flag = True # 通过n_value, b_value同时得到错误标志和错误信息 - if not err_msg: - n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value, error_flag=error_flag, - error_file=error_file) + n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value, + error_flag=error_flag, error_file=error_file) result_list, err_msg = compare_ops_apply(n_value, b_value, error_flag, err_msg) @@ -222,16 +218,11 @@ class CompareRealData: = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param) if is_print_compare_log: - if "does not have data file" in err_msg: - logger.info(f"[{npu_op_name}] Compare result: {err_msg} ") - elif "Bench api/module unmatched" in err_msg: - logger.info(f"[{npu_op_name}] Compare result: {err_msg} ") - else: - logger.info( - "[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, \ - one_thousand_err_ratio {}, " - "five_thousand_err_ratio {}".format(npu_op_name, cos_sim, max_abs_err, max_relative_err, - err_msg, one_thousand_err_ratio, five_thousand_err_ratio)) + logger.info( + "[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, \ + one_thousand_err_ratio {}, " + "five_thousand_err_ratio {}".format(npu_op_name, cos_sim, max_abs_err, max_relative_err, + err_msg, one_thousand_err_ratio, five_thousand_err_ratio)) cos_result.append(cos_sim) euc_dist_result.append(euc_dist) max_err_result.append(max_abs_err) diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index 4da1adfa09..0f5ea4a50d 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -26,28 +26,39 @@ from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value from msprobe.core.common.file_utils import check_file_or_directory_path +json_file_mapping = { + Const.DUMP_JSON_FILE: "dump.json", + Const.DEBUG_JSON_FILE: "debug.json", + Const.STACK_JSON_FILE: "stack.json" +} -def extract_json(dirname, stack_json=False): + +def extract_json(dirname, json_file_type): json_path = '' for filename in os.listdir(dirname): - target_file_name = 'stack.json' if stack_json else 'dump.json' + target_file_name = json_file_mapping.get(json_file_type) + if target_file_name is None: + logger.error(f'extract_json failed, invalid json_file_type: {json_file_type}.') + raise CompareException(CompareException.INVALID_KEY_ERROR) if filename == target_file_name: json_path = os.path.join(dirname, filename) break # Provide robustness on invalid directory inputs if not json_path: - if stack_json: + if json_file_type == Const.STACK_JSON_FILE: logger.warning(f'stack.json is not found in dump dir {dirname}.') - else: + elif json_file_type == Const.DUMP_JSON_FILE: logger.error(f'dump.json is not found in dump dir {dirname}.') - raise CompareException(CompareException.NO_DUMP_FILE_ERROR) + elif json_file_type == Const.DEBUG_JSON_FILE: + logger.warning(f'debug.json is not found in dump dir {dirname}.') + return json_path def set_stack_json_path(input_param): npu_data_dir = os.path.dirname(input_param.get("npu_json_path")) - stack_path = 
extract_json(npu_data_dir, stack_json=True) + stack_path = extract_json(npu_data_dir, json_file_type=Const.STACK_JSON_FILE) input_param["stack_json_path"] = stack_path if stack_path else None return bool(stack_path) @@ -83,17 +94,25 @@ def check_and_return_dir_contents(dump_dir, prefix): def read_op(op_data, op_name): - if Const.PARAMS_GRAD in op_name.split(Const.SEP): - op_parsed_list = op_item_parse(op_data, op_name) + if not isinstance(op_name, str): + logger.error(f"api name error: {op_name} is not a string, please check.") + raise CompareException(CompareException.INVALID_API_NAME_ERROR) + split_name = op_name.split(Const.SEP) + if split_name[-1] == Const.DEBUG: + op_parsed_list = op_item_parse(op_data, op_name, Const.DEBUG) + elif split_name[-1] == Const.PARAMS_GRAD: + op_parsed_list = op_item_parse(op_data, op_name, Const.PARAMS_GRAD) else: op_parsed_list = [] for name in CompareConst.IO_NAME_MAPPING: if name in op_data: - op_parsed_list.extend(op_item_parse(op_data[name], op_name + CompareConst.IO_NAME_MAPPING[name])) + op_parsed_list.extend(op_item_parse(op_data[name], op_name + CompareConst.IO_NAME_MAPPING[name], name)) return op_parsed_list -def op_item_parse(op_data, op_name: str, depth: int = 0) -> list: +def op_item_parse(op_data, op_name: str, state: str, depth: int = 0) -> list: + if state == Const.INPUT_ARGS or state == Const.INPUT_KWARGS: + state = Const.INPUT default_item = { 'full_op_name': op_name, 'type': None, @@ -105,7 +124,8 @@ def op_item_parse(op_data, op_name: str, depth: int = 0) -> list: 'shape': None, 'md5': None, 'value': None, - 'data_name': '-1' + 'data_name': '-1', + 'state': state } if depth > Const.MAX_DEPTH: @@ -121,14 +141,14 @@ def op_item_parse(op_data, op_name: str, depth: int = 0) -> list: if isinstance(op_data, list): for i, data in enumerate(op_data): if Const.PARAMS_GRAD not in op_name.split(Const.SEP): - item_list.extend(op_item_parse(data, op_name + Const.SEP + str(i), depth + 1)) + item_list.extend(op_item_parse(data, op_name + Const.SEP + str(i), state, depth + 1)) else: - item_list.extend(op_item_parse(data, op_name, depth + 1)) + item_list.extend(op_item_parse(data, op_name, state, depth + 1)) elif isinstance(op_data, dict): if is_leaf_data(op_data): - return [gen_op_item(op_data, op_name)] + return [gen_op_item(op_data, op_name, state)] for sub_name, sub_data in op_data.items(): - item_list.extend(op_item_parse(sub_data, op_name + Const.SEP + str(sub_name), depth + 1)) + item_list.extend(op_item_parse(sub_data, op_name + Const.SEP + str(sub_name), state, depth + 1)) return item_list @@ -136,14 +156,15 @@ def is_leaf_data(op_data): return 'type' in op_data and isinstance(op_data['type'], str) -def gen_op_item(op_data, op_name): +def gen_op_item(op_data, op_name, state): op_item = {} op_item.update(op_data) - data_name = op_data.get('data_name') if op_data.get('data_name') else '-1' # 如果是""也返回-1 - op_item['data_name'] = data_name + data_name = op_data.get(Const.DATA_NAME) if op_data.get(Const.DATA_NAME) else '-1' # 如果是""也返回-1 + op_item[Const.DATA_NAME] = data_name op_item['full_op_name'] = data_name.rsplit(Const.SEP, 1)[0] if data_name != '-1' else op_name + op_item[Const.STATE] = state - params = ['Max', 'Min', 'Mean', 'Norm'] + params = [Const.MAX, Const.MIN, Const.MEAN, Const.NORM] for i in params: if i not in op_item: op_item[i] = None @@ -191,8 +212,10 @@ def merge_tensor(tensor_list, dump_mode): CompareConst.OUTPUT_STRUCT, CompareConst.PARAMS_STRUCT, CompareConst.PARAMS_GRAD_STRUCT, + CompareConst.DEBUG_STRUCT, Const.SUMMARY, - 
Const.STACK_INFO + Const.STACK_INFO, + Const.STATE ] op_dict = {key: [] for key in keys} @@ -202,12 +225,13 @@ def merge_tensor(tensor_list, dump_mode): for tensor in tensor_list: # A dict(len=2) with 'full_op_name' and 'full_info' is added to the tensor only if self.stack_mode is True if len(tensor) == 2: - op_dict[Const.STACK_INFO].append(tensor['full_info']) + op_dict[Const.STACK_INFO].append(tensor.get('full_info')) break - op_dict[CompareConst.OP_NAME].append(tensor['full_op_name']) + op_dict[CompareConst.OP_NAME].append(tensor.get('full_op_name')) + state = tensor.get(Const.STATE) + op_dict[Const.STATE].append(state) - _, state = get_name_and_state(tensor['full_op_name']) struct_key = CompareConst.STATE_TO_STRUCT_MAPPING.get(state) if not struct_key: continue @@ -253,133 +277,54 @@ def table_value_is_valid(value: str) -> bool: return True -class ApiBatch: - def __init__(self, api_name: str, start: int): - self.api_name = api_name - self.start = start - self.input_len = 1 # input的数量 - self.params_end_index = start + 1 # params的结束index - self.output_end_index = start + 1 # output的结束index - self.params_grad_end_index = start + 1 # params_grad的结束index - # 内部state的标志("input", "output", "parameters", "parameters_grad"), - # 用于控制计算input_len, output_end_index, params_end_index, self.params_grad_end_index - self._state = Const.INPUT # api_batch初始化为input - - def set_state(self, state: str): - """设置当前状态""" - if state in {Const.INPUT, Const.OUTPUT, Const.KWARGS, Const.PARAMS, Const.PARAMS_GRAD}: - self._state = state - else: - raise ValueError(f"Invalid state: {state}") - - def increment(self, state: str): - self.set_state(state) - if self._state == Const.INPUT or self._state == Const.KWARGS: - self.input_len += 1 - self.params_end_index += 1 - self.output_end_index += 1 - if self._state == Const.PARAMS: - self.params_end_index += 1 - self.output_end_index += 1 - if self._state == Const.OUTPUT: - self.output_end_index += 1 - self.params_grad_end_index += 1 - - -def api_batches_update(api_batches, api_name, state, index): - """ - 当一个api的所有item更新完后,input, output的索引范围: - input: [start: start+input_len] - output: [start+input_len: output_end_index] - params: [output_end_index: params_end_index] - """ - if not api_batches: - api_batches.append(ApiBatch(api_name, index)) - else: - api_batch = api_batches[-1] - if api_batch.api_name == api_name or ( - not re.search(Const.REGEX_FORWARD_BACKWARD, api_name) and api_name in api_batch.api_name): - try: - api_batch.increment(state) - except ValueError as e: - logger.error(f"api_batch: {api_batch} with invalid state, please check! 
{e}") - raise CompareException(CompareException.INVALID_STATE_ERROR) from e - else: - api_batches.append(ApiBatch(api_name, index)) - - -def get_name_and_state(name): - """ - Get api/module name and state - example: - name = 'conv2d.forward.1.input.0' - return: ('conv2d.forward.1.', 'input') - - name = 'Functional.pad.0.backward.output.0' - return: ('Functional.pad.0.backward.', 'output') - - state type: input, output, kwargs, parameters, parameters_grad - """ - if not isinstance(name, str): - logger.error(f'Invalid name: {name}, type should be string, please check.') - raise CompareException(CompareException.INVALID_API_NAME_ERROR) - - if Const.PARAMS_GRAD in name.split(Const.SEP): - return name.split(Const.PARAMS_GRAD)[0], Const.PARAMS_GRAD - - split = re.split(Const.REGEX_FORWARD_BACKWARD, name) - if len(split) < 3: - logger.error(f'Invalid name string: {name}, can not be split by forward/backward, please check.') - raise CompareException(CompareException.INVALID_API_NAME_ERROR) - api = f'{split[0]}.{split[1]}.' - state_str = split[2] - match = re.match(r'^(\d+\.)?(input|output|kwargs|parameters)\..+$', state_str) - if not match: - raise CompareException(f'Invalid name string: {name}') - if match.group(1): - api = f'{api}{match.group(1)}' - state = match.group(2) - return api, state - - -def reorder_op_name_list(op_name_list): +def reorder_op_name_list(op_name_list, state_list): if not op_name_list: - return op_name_list + return op_name_list, state_list parameters = [] output = [] parameters_grad = [] others = [] - for x in op_name_list: - state = get_name_and_state(x)[1] + parameters_s = [] + output_s = [] + parameters_grad_s = [] + others_s = [] + for op_name, state in zip(op_name_list, state_list): if state == Const.PARAMS: - parameters.append(x) + parameters.append(op_name) + parameters_s.append(state) elif state == Const.OUTPUT: - output.append(x) + output.append(op_name) + output_s.append(state) elif state == Const.PARAMS_GRAD: - parameters_grad.append(x) + parameters_grad.append(op_name) + parameters_grad_s.append(state) else: - others.append(x) + others.append(op_name) + others_s.append(state) # 合并others, parameters, 和output,确保parameters排在output前面 op_name_reorder = others + parameters + output + parameters_grad - return op_name_reorder + state_reorder = others_s + parameters_s + output_s + parameters_grad_s + return op_name_reorder, state_reorder -def reorder_op_x_list(op_name_list, summary_list, data_name_list): - """对op_name, summary, data_name重新排序,把parameters放到input后output前,data_name由于统计量比对时,为None,单独处理""" +def reorder_op_x_list(op_name_list, summary_list, data_name_list, state_list): + """ + 对op_name, summary, data_name, state重新排序,把parameters放到input后output前,data_name由于统计量比对时,为None,单独处理 + """ if not op_name_list or not summary_list: - return op_name_list, summary_list, data_name_list + return op_name_list, summary_list, data_name_list, state_list index_map = {name: index for index, name in enumerate(op_name_list)} - op_name_reorder = reorder_op_name_list(op_name_list) + op_name_reorder, state_order = reorder_op_name_list(op_name_list, state_list) summary_reorder = [summary_list[index_map.get(name)] for name in op_name_reorder] if data_name_list: data_name_reorder = [data_name_list[index_map.get(name)] for name in op_name_reorder] else: data_name_reorder = data_name_list - return op_name_reorder, summary_reorder, data_name_reorder + return op_name_reorder, summary_reorder, data_name_reorder, state_order def process_summary_data(summary_data): @@ -606,3 +551,40 @@ def 
_compare_parser(parser): help=" The data mapping file path.", required=False) parser.add_argument("-lm", "--layer_mapping", dest="layer_mapping", type=str, nargs='?', const=True, help=" The layer mapping file path.", required=False) + + +def compare_distributed_inner(npu_dump_dir, bench_dump_dir, output_path, compare_func, **kwargs): + if kwargs.get('suffix'): + logger.error("Argument 'suffix' is not supported for compare_distributed.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + is_print_compare_log = kwargs.get('is_print_compare_log', True) + # get the ranks and match by order + npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank')) + bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank')) + if len(npu_ranks) != len(bench_ranks): + logger.error('The number of ranks in the two runs are different. ' + 'Unable to match the ranks. Please use another folder to compare ' + 'or use compare() api and manually match the ranks.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + for nr, br in zip(npu_ranks, bench_ranks): + npu_data_dir = os.path.join(npu_dump_dir, nr) + bench_data_dir = os.path.join(bench_dump_dir, br) + for file_type in [Const.DUMP_JSON_FILE, Const.DEBUG_JSON_FILE]: + npu_path = extract_json(npu_data_dir, file_type) + bench_path = extract_json(bench_data_dir, file_type) + if npu_path == "" or bench_path == "": + logger.debug(f'Did not find paired {file_type} in {npu_data_dir} and {bench_data_dir},' + ' skip comparing.') + continue + dump_result_param = { + 'npu_json_path': npu_path, + 'bench_json_path': bench_path, + 'is_print_compare_log': is_print_compare_log + } + try: + compare_func(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}', **kwargs) + except CompareException as e: + if e.code == CompareException.INVALID_DATA_ERROR: + logger.error(f"Invalid or missing 'data' in dump.json. Skipping {nr} comparison.") + if e.code == CompareException.INVALID_TASK_ERROR: + logger.error(f"Invalid or missing 'task' in dump.json. Skipping {nr} comparison.") diff --git a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md index 504e21b4b9..6727e01fe0 100644 --- a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md @@ -54,12 +54,12 @@ msprobe -f pytorch compare -i ./compare.json -o ./output -s | 参数名 | 说明 | 是否必选 | |-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | | -f 或 --framework | 指定训练框架。pytorch。 | 是 | -| -i 或 --input_path | 指定[比对文件](#214-比对文件),str 类型。 | 是 | +| -i 或 --input_path | 指定[比对文件](#51-比对文件),str 类型。 | 是 | | -o 或 --output_path | 配置比对结果文件存盘目录,str 类型,默认在当前目录创建output目录。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.xlsx`。
提示:output目录下与结果件同名文件将被删除覆盖。 | 否 | -| -s 或 --stack_mode | 比对结果展示调用栈信息(NPU_Stack_Info)的开关,bool 类型。单卡场景开启时,根据[比对文件](#214-比对文件)的参数说明配置stack_path;多卡场景开启时,自动识别npu_dump目录下stack.json文件,如存在生成详细调用栈信息,否则不生成,此参数不生效。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 | +| -s 或 --stack_mode | 比对结果展示调用栈信息(NPU_Stack_Info)的开关,bool 类型。单卡场景开启时,根据[比对文件](#51-比对文件)的参数说明配置stack_path;多卡场景开启时,自动识别npu_dump目录下stack.json文件,如存在生成详细调用栈信息,否则不生成,此参数不生效。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 | | -c 或 --compare_only | 仅比对开关,bool 类型。该参数默认未配置,会启用自动精度分析,工具自动针对比对结果进行分析,识别到第一个精度可能不达标节点(在比对结果文件中的 Accuracy Reached or Not 列显示为 No),并给出问题可能产生的原因(打屏展示并生成 `advisor_{timestamp}.txt` 文件)。通过配置该参数取消自动精度分析,仅输出比对结果表格。 | 否 | | -f 或 --fuzzy_match | 模糊匹配,bool 类型。开启后,对于网络中同一层级且命名仅调用次数不同的 API,可匹配并进行比对。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 | -| -dm或--data_mapping | 自定义映射关系比对。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件](#215-自定义映射文件)。仅[API和模块无法自动匹配场景](#213-api和模块无法自动匹配场景)需要配置。仅支持逐卡比对,即使用[比对文件](#214-比对文件)的单卡场景示例。 | 否 | +| -dm或--data_mapping | 自定义映射关系比对。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件](#52-自定义映射文件)。仅[API和模块无法自动匹配场景](#213-api和模块无法自动匹配场景)需要配置。仅支持逐卡比对,即使用[比对文件](#51-比对文件)的单卡场景示例。 | 否 | #### 2.1.2 整网比对场景 @@ -69,7 +69,7 @@ msprobe -f pytorch compare -i ./compare.json -o ./output -s 1. 参见 [PyTorch 场景下的数据采集](./05.data_dump_PyTorch.md)章节完成 CPU 或 GPU 与 NPU 的精度数据 dump。 -2. 创建[比对文件](#214-比对文件)。 +2. 创建[比对文件](#51-比对文件)。 3. 运行命令: @@ -87,7 +87,7 @@ msprobe -f pytorch compare -i ./compare.json -o ./output -s 2. 参见[PyTorch 场景下的数据采集](./05.data_dump_PyTorch.md)章节完成 CPU 或 GPU 与 NPU 的精度数据 dump。 -3. 创建[比对文件](#214-比对文件)(单卡场景示例)。 +3. 创建[比对文件](#51-比对文件)(单卡场景示例)。 4. 运行命令: @@ -95,174 +95,30 @@ msprobe -f pytorch compare -i ./compare.json -o ./output -s msprobe -f pytorch compare -i ./compare.json -o ./output -s -dm data_mapping.yaml ``` - data_mapping.yaml文件配置请参见[自定义映射文件](#215-自定义映射文件)。 + data_mapping.yaml文件配置请参见[自定义映射文件](#52-自定义映射文件)。 该场景不支持-f模糊匹配。 5. 
查看比对结果,请参见 [3 精度比对结果分析](#3-精度比对结果分析)。 -#### 2.1.4 比对文件 - 以在当前目录创建 ./compare.json 为例。 - - - 单卡场景示例: - - ```json - { - "npu_path": "./npu_dump/dump.json", - "bench_path": "./bench_dump/dump.json", - "stack_path": "./npu_dump/stack.json", - "is_print_compare_log": true - } - ``` - - - 多卡场景示例: - - ```json - { - "npu_path": "./npu_dump/step0", # 需填写到step层级(rank的上一层级) - "bench_path": "./bench_dump/step0", # 需填写到step层级(rank的上一层级) - "is_print_compare_log": true - } - ``` - -**参数说明**: - -| 参数名 | 说明 | 是否必选 | -| -------------------- |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------| -| npu_path | 配置 NPU 环境下的 dump.json 文件(单卡场景)或 dump 目录(多卡场景),str 类型。 | 是 | -| bench_path | 配置 CPU、GPU 或 NPU 环境下的 dump.json 文件(单卡场景)或 dump 目录(多卡场景),str 类型。 | 是 | -| stack_path | 配置 NPU dump 目录下的 stack.json 文件,str 类型。如果没有配置stack_path,命令行-s参数不生效,程序自动识别是否存在stack.json文件,如存在,则比对结果中呈现NPU_Stack_Info,如不存在,则不呈现。如果配置了stack_path,比对结果中是否呈现NPU_Stack_Info则通过命令行参数-s来控制。 | 否 | -| is_print_compare_log | 配置是否开启单个算子的日志打屏。可取值 true 或 false,默认为 true。关闭后则只输出常规日志,bool 类型。 | 否 | - -#### 2.1.5 自定义映射文件 - -文件名格式:*.yaml,*为文件名,可自定义。 - -文件内容格式: - -```yaml -# API -{api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号}: {api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号} -# 模块 -{Module}.{module_name}.{前向反向}.{index}.{input/output}.{参数序号}: {Module}.{module_name}.{前向反向}.{index}.{input/output}.{参数序号} -``` - -冒号左侧和右侧分别为PyTorch框架不同版本或不同芯片环境的API的名称和module模块名称。 +#### 2.1.4 单点数据比对场景 -API和模块名称请从《[PyTorch 场景的精度数据采集](05.data_dump_PyTorch.md)》中的dump.json文件获取。 - -文件内容示例: - -```yaml -# API -NPU.npu_fusion_attention.4.forward.input.0: NPU.npu_fusion_attention.4.forward.input.0 -# 模块 -Module.module.language_model.embedding.word_embedding.VocabParallelEmbedding.forward.0.input.0: Module.module.language_model.embedding.word_embedding.VocabParallelEmbedding.forward.0.input.0 -``` +单点数据比对场景是指:CPU 或 GPU 与 NPU环境的网络中单点保存的数据比对。 -当dump.json文件中存在“data_name”字段时,API和模块名称为data_name字段去掉文件后缀,如下图红框处所示: +支持单卡和多卡,可同时比对多卡的单点数据。多机场景需要每个设备单独执行比对操作。 -![pt_dump](./img/pt_dump.png) +1. 参见 [单点保存工具](./28.debugger_save_instruction.md)章节完成 CPU 或 GPU 与 NPU 的单点数据采集。 -当dump.json文件中不存在“data_name”字段时,名称的拼写规则如下: +2. 创建[比对文件(单点数据)](#53-比对文件单点数据)。 -input_args、input_kwargs和output使用统一的命名规则,当值是list类型时,名称后面添加'.{index}',当值类型是dict类型时,名称后面加'.{key}',当值类型是具体Tensor或null或int或float或bool或空list/dict等时,命名结束。 +3. 
运行命令:
+
+   ```shell
+   msprobe -f pytorch compare -i ./compare.json -o ./output
+   ```
+
+4. 查看比对结果,请参见 [3 精度比对结果分析](#3-精度比对结果分析)。
 
 ### 2.2 比对函数方式
 
@@ -412,18 +268,15 @@ MD5 模式:
 
 ### 3.5 错误信息提示(Err_message)——真实数据模式、统计数据模式
 
 1. "Need double check api accuracy.":四个统计值中至少 1 个相对误差 > 0.5(统计数据模式);
-2. "Fuzzy matching data, the comparison accuracy may be affected.":NPU 或 Bench 的真实数据名没有匹配上(真实数据模式);
-3. "Dump file: {} not found or read failed.":NPU 或 Bench 的真实数据者读取出错(真实数据模式);
-4. "No bench data matched.":Bench 的 API 没有匹配上(真实数据模式,统计数据模式);
-5. "NPU does not have data file.": NPU的真实数据不存在(真实数据模式);
-6. "Bench does not have data file.": Bench的真实数据不存在(真实数据模式);
-7. "Bench api/module unmatched.":Bench 的 API 没有匹配上(真实数据模式);
-8. "This is empty data, can not compare.":读取到的数据为空(真实数据模式);
-9. "Shape of NPU and bench Tensor do not match. Skipped.":NPU 和 Bench 的数据结构不一致(真实数据模式);
-10. "The Position of inf or nan in NPU and bench Tensor do not match.":NPU 和 Bench 的数据有 nan/inf(真实数据模式);
-11. "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式);
-12. "Dtype of NPU and bench Tensor do not match.":NPU 和 Bench 数据的数据类型不同(真实数据模式);
-13. "":除以上情况的其余情况(真实数据模式、统计数据模式)。
+2. "Fuzzy matching data, the comparison accuracy may be affected.":NPU 或 Bench 的真实数据名没有匹配上(真实数据模式);
+3. "Dump file: {} not found or read failed.":NPU 或 Bench 的真实数据不存在或者读取出错(真实数据模式);
+4. "No bench data matched.":Bench 的 API 没有匹配上、Bench 真实数据不存在或读取出错(真实数据模式);
+5. "This is empty data, can not compare.":读取到的数据为空(真实数据模式);
+6. "Shape of NPU and bench Tensor do not match. Skipped.":NPU 和 Bench 的数据结构不一致(真实数据模式);
+7. "The Position of inf or nan in NPU and bench Tensor do not match.":NPU 和 Bench 的数据有 nan/inf(真实数据模式);
+8. "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式);
+9. "Dtype of NPU and bench Tensor do not match.":NPU 和 Bench 数据的数据类型不同(真实数据模式);
+10. "":除以上情况的其余情况(真实数据模式、统计数据模式)。
 
 除以上错误信息提示外,异常数据颜色高亮标记的原因叠加呈现于此列。
 
@@ -437,7 +290,7 @@ MD5 模式:
 
 4. MaxRelativeErr:当最大相对误差越接近 0 表示其计算的误差越小。
 
-   当 dump 数据中存在 0 或 Nan 时,比对结果中最大相对误差则出现 inf 或 Nan 的情况,属于正常现象。
+   当 dump 数据中存在 0 或 nan 时,比对结果中最大相对误差则出现 inf 或 nan 的情况,属于正常现象。
 
 5. One Thousandth Err Ratio(相对误差小于千分之一的元素比例)、Five Thousandths Err Ratio(相对误差小于千分之五的元素比例)精度指标:是指 NPU 的 Tensor 中的元素逐个与对应的标杆数据对比,相对误差小于千分之一、千分之五的比例占总元素个数的比例。该数据仅作为精度下降趋势的参考,并不参与计算精度是否通过的判定。
 
@@ -516,4 +369,189 @@ compare_index:
 6. Distributed.broadcast:输入为要广播的数据,输出为广播后的数据。
 7. Distributed.isend:点对点通信,输入为要发送的数据,输出为发送的数据。
 8. Distributed.irecv:点对点通信,输入为原数据,输出为接收的新数据。
-9. Distributed.all_to_all_single:输出数据为所有卡上的数据切分后合并的结果。
\ No newline at end of file
+9. Distributed.all_to_all_single:输出数据为所有卡上的数据切分后合并的结果。
+
+## 5 附录
+
+### 5.1 比对文件
+
+  以在当前目录创建 ./compare.json 为例。
+
+  - 单卡场景示例:
+
+    ```json
+    {
+      "npu_path": "./npu_dump/dump.json",
+      "bench_path": "./bench_dump/dump.json",
+      "stack_path": "./npu_dump/stack.json",
+      "is_print_compare_log": true
+    }
+    ```
+
+  - 多卡场景示例:
+
+    ```json
+    {
+      "npu_path": "./npu_dump/step0", # 需填写到step层级(rank的上一层级)
+      "bench_path": "./bench_dump/step0", # 需填写到step层级(rank的上一层级)
+      "is_print_compare_log": true
+    }
+    ```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| -------------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------|
+| npu_path | 配置NPU环境下的dump.json文件(单卡场景)或dump目录(多卡场景)。数据类型:str。 | 是 |
+| bench_path | 配置CPU、GPU或NPU环境下的dump.json文件(单卡场景)或dump目录(多卡场景)。数据类型:str。 | 是 |
+| stack_path | 配置NPU dump目录下的stack.json文件。数据类型:str。如果没有配置stack_path,命令行-s参数不生效,程序自动识别是否存在stack.json文件,如存在,则比对结果中呈现NPU_Stack_Info,如不存在,则不呈现。如果配置了stack_path,比对结果中是否呈现NPU_Stack_Info则通过命令行参数-s来控制。 | 否 |
+| is_print_compare_log | 配置是否开启单个算子的日志打屏。可取值true或false,默认为true。关闭后则只输出常规日志。数据类型:bool。 | 否 |
+
+### 5.2 自定义映射文件
+
+文件名格式:*.yaml,*为文件名,可自定义。
+
+文件内容格式:
+
+```yaml
+# API
+{api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号}: {api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号}
+# 模块
+{Module}.{module_name}.{前向反向}.{index}.{input/output}.{参数序号}: {Module}.{module_name}.{前向反向}.{index}.{input/output}.{参数序号}
+```
+
+冒号左侧和右侧分别为PyTorch框架不同版本或不同芯片环境的API的名称和module模块名称。
+
+API和模块名称请从《[PyTorch 场景的精度数据采集](05.data_dump_PyTorch.md)》中的dump.json文件获取。
+
+文件内容示例:
+
+```yaml
+# API
+NPU.npu_fusion_attention.4.forward.input.0: NPU.npu_fusion_attention.4.forward.input.0
+# 模块
+Module.module.language_model.embedding.word_embedding.VocabParallelEmbedding.forward.0.input.0: Module.module.language_model.embedding.word_embedding.VocabParallelEmbedding.forward.0.input.0
+```
+
+当dump.json文件中存在“data_name”字段时,API和模块名称为data_name字段去掉文件后缀,如下图红框处所示:
+
+![pt_dump](./img/pt_dump.png)
+
+当dump.json文件中不存在“data_name”字段时,名称的拼写规则如下:
+
+input_args、input_kwargs和output使用统一的命名规则,当值是list类型时,名称后面添加'.{index}',当值类型是dict类型时,名称后面加'.{key}',当值类型是具体Tensor或null或int或float或bool或空list/dict等时,命名结束。
+
+以下面api的dump文件为例:
+```yaml
+  "Functional.max_pool2d.0.forward": {
+    "input_args": [
+      {
+        "type": "torch.Tensor",
+        "dtype": "torch.float32",
+        "shape": [
+          1,
+          64,
+          14,
+          14
+        ],
+        "Max": xxx,
+        "Min": xxx,
+        "Mean": xxx,
+        "Norm": xxx,
+        "requires_grad": true
+      },
+      {
+        "type": "int",
+        "value": 3
+      },
+      {
+        "type": "int",
+        "value": 2
+### 5.3 比对文件(单点数据)
+
+  - 单卡场景示例:
+
+    ```json
+    {
+      "npu_path": "./npu_dump/debug.json",
+      "bench_path": "./bench_dump/debug.json"
+    }
+    ```
+
+  - 多卡场景示例(step0目录下包含debug.json文件):
+
+    ```json
+    {
+      "npu_path": "./npu_dump/step0",
+      "bench_path": "./bench_dump/step0"
+    }
+    ```
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/mindspore/common/utils.py b/debug/accuracy_tools/msprobe/mindspore/common/utils.py
index d35e1b5194..187f66468d 100644
--- a/debug/accuracy_tools/msprobe/mindspore/common/utils.py
+++ b/debug/accuracy_tools/msprobe/mindspore/common/utils.py
@@ -97,6 +97,9 @@ def save_tensor_as_npy(tensor, file_path):
 
 
 def convert_to_int(value):
+    if isinstance(value, bool):
+        logger.error('The value in rank_id or step_id should be int, please check!')
+        raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR)
     try:
         return int(value)
     except Exception:
diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/common_dir_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/common_dir_compare.py
index 1f4ad8939c..4b509cc332 100644
--- a/debug/accuracy_tools/msprobe/mindspore/compare/common_dir_compare.py
+++ b/debug/accuracy_tools/msprobe/mindspore/compare/common_dir_compare.py
@@ -29,28 +29,30 @@ from msprobe.core.common.log import logger
 from msprobe.core.common.utils import CompareException
 from msprobe.core.common.exceptions import FileCheckException
 from msprobe.core.common.file_utils import check_file_or_directory_path, write_df_to_csv, create_directory, \
-    check_path_before_create, load_npy
-from msprobe.core.common.const import CompareConst, FileCheckConst
+    check_path_before_create, load_npy
+from msprobe.core.common.const import CompareConst
 from msprobe.core.compare.npy_compare import compare_ops_apply
 from msprobe.core.compare.multiprocessing_compute import check_accuracy
+from msprobe.mindspore.compare.utils import check_name_map_dict
 
 
 def common_dir_compare(input_params: Dict, output_dir: str) -> Optional[pd.DataFrame]:
     """
     高级目录比对函数,完全镜像输入目录结构
-    
+
     Args:
         input_params: 包含npu_path和bench_path的字典
         output_dir: 输出根目录
-    
+
     Returns:
         当输入目录是平铺npy文件时返回DataFrame,否则返回None
     """
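+    # 先校验可选的文件名映射map_dict,再镜像遍历两侧目录并逐目录并行比对npy文件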
npu_root = Path(input_params.get('npu_path')) bench_root = Path(input_params.get('bench_path')) name_map_dict = input_params.get('map_dict', {}) + check_name_map_dict(name_map_dict) file_tree = build_mirror_file_tree(npu_root, bench_root) - + # 处理文件比对 with ProcessPoolExecutor() as executor: results = list(tqdm( @@ -67,29 +69,29 @@ def common_dir_compare(input_params: Dict, output_dir: str) -> Optional[pd.DataF def process_directory_pair(item: Tuple[Path, Tuple[Path, Path]], name_map_dict: Dict, output_dir: str): """ 处理一个目录对 - + Args: item: (相对路径, (npu目录, bench目录))元组 output_dir: 输出根目录 - + Returns: 比对结果的DataFrame(仅平铺结构时返回) """ rel_path, (npu_dir, bench_dir) = item - + # 创建镜像输出目录 output_path = Path(output_dir) / rel_path create_directory(output_path) - + # 生成文件映射 npu_files = find_npy_files(npu_dir) bench_files = find_npy_files(bench_dir) map_dict = generate_map_dict(npu_files, bench_files, name_map_dict) - + if not map_dict: logger.warning(f"No file pairs found in {rel_path}") return None - + # 执行比对 result_df = do_multi_process(process_chunk, map_dict) check_path_before_create(output_path) @@ -103,16 +105,16 @@ def process_directory_pair(item: Tuple[Path, Tuple[Path, Path]], name_map_dict: def build_mirror_file_tree(npu_root: Path, bench_root: Path) -> Dict[Path, Tuple[Path, Path]]: """ 构建镜像文件树,键为相对路径,值为(npu_path, bench_path)元组 - + Args: npu_root: NPU数据根目录 bench_root: 基准数据根目录 - + Returns: 文件树字典 """ file_tree = {} - + # 遍历NPU目录构建树结构 # 使用os.walk遍历目录,限制深度为10层 for root, dirs, files in os.walk(npu_root): @@ -121,23 +123,23 @@ def build_mirror_file_tree(npu_root: Path, bench_root: Path) -> Dict[Path, Tuple if depth > 10: dirs.clear() # 清空dirs列表以阻止继续递归 continue - + # 检查当前目录下是否有npy文件 if any(f.endswith('.npy') for f in files): # 获取相对路径 dir_path = Path(root).relative_to(npu_root) npu_dir_pair = os.path.join(npu_root, dir_path) bench_dir_pair = os.path.join(bench_root, dir_path) - + try: check_file_or_directory_path(bench_dir_pair, isdir=True) except FileCheckException: continue - + # 添加到文件树 if dir_path not in file_tree: file_tree[dir_path] = (npu_dir_pair, bench_dir_pair) - + return file_tree @@ -160,13 +162,13 @@ def find_npy_files(directory): file_name = base_name[0] logger.info(f"Generating file info for file: {file}") - + # 使用一致的分割逻辑 file_ele = file_name.split('_') - + if len(file_ele) < 2: continue - + key = '_'.join(file_ele[:-2]) if key: # 文件的完整路径 @@ -210,14 +212,14 @@ def do_multi_process(func, map_dict): df_chunks = [result_df] process_num = 1 logger.info(f"Using {process_num} processes with chunk size {df_chunk_size}") - + # 分割字典 map_chunks = split_dict(map_dict, df_chunk_size) - + # 创建结果列表和进程池 results = [] pool = multiprocessing.Pool(process_num) - + progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100) def update_progress(size, progress_lock, extra_param=None): @@ -230,6 +232,7 @@ def do_multi_process(func, map_dict): pool.close() except OSError as e: logger.error(f'pool terminate failed: {str(e)}') + results = [] try: # 提交任务到进程池 diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/distributed_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/distributed_compare.py index fa8b680709..5064bedcdb 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/distributed_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/distributed_compare.py @@ -13,47 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from msprobe.core.common.utils import CompareException from msprobe.core.common.file_utils import create_directory from msprobe.core.common.exceptions import FileCheckException from msprobe.mindspore.common.log import logger from msprobe.mindspore.compare.ms_compare import ms_compare -from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json +from msprobe.core.compare.utils import compare_distributed_inner from msprobe.mindspore.compare.ms_graph_compare import GraphMSComparator def ms_compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): - if kwargs.get('suffix'): - logger.error("Argument 'suffix' is not supported for compare_distributed.") - raise CompareException(CompareException.INVALID_PARAM_ERROR) - is_print_compare_log = kwargs.get('is_print_compare_log', True) - # get the ranks and match by order - npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank')) - bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank')) - if len(npu_ranks) != len(bench_ranks): - logger.error('The number of ranks in the two runs are different. ' - 'Unable to match the ranks. Please use another folder to compare ' - 'or use compare() api and manually match the ranks.') - raise CompareException(CompareException.INVALID_PATH_ERROR) - for nr, br in zip(npu_ranks, bench_ranks): - npu_data_dir = os.path.join(npu_dump_dir, nr) - bench_data_dir = os.path.join(bench_dump_dir, br) - npu_path = extract_json(npu_data_dir, stack_json=False) - bench_path = extract_json(bench_data_dir, stack_json=False) - - dump_result_param = { - 'npu_json_path': npu_path, - 'bench_json_path': bench_path, - 'is_print_compare_log': is_print_compare_log - } - try: - ms_compare(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}', **kwargs) - except CompareException as e: - if e.code == CompareException.INVALID_DATA_ERROR: - logger.error(f"Invalid or missing 'data' in dump.json. Skipping {nr} comparison.") - if e.code == CompareException.INVALID_TASK_ERROR: - logger.error(f"Invalid or missing 'task' in dump.json. 
Skipping {nr} comparison.") + compare_distributed_inner(npu_dump_dir, bench_dump_dir, output_path, ms_compare, **kwargs) def ms_graph_compare(inputs, outputs): diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index dd2c6f8c10..42d973a0e8 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -35,7 +35,8 @@ def ms_compare(input_param, output_path, **kwargs): config.data_mapping = generate_data_mapping_by_layer_mapping(input_param, config.layer_mapping, output_path) is_cross_framework = check_cross_framework(input_param.get('bench_json_path')) - mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match, config.dump_mode) + mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match, + config.dump_mode, config.compared_file_type) mapping_config = MappingConfig(config.cell_mapping, config.api_mapping, config.data_mapping) ms_comparator = Comparator(read_real_data, mode_config, mapping_config, is_cross_framework) ms_comparator.compare_core(input_param, output_path, suffix=config.suffix) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py index ecf8e84d13..62b8551ae4 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py @@ -168,8 +168,13 @@ class GraphMSComparator: self.output_path = output_path self.base_npu_path = input_param.get('npu_path', None) self.base_bench_path = input_param.get('bench_path', None) - self.rank_list = [convert_to_int(rank_id) for rank_id in input_param.get('rank_id', [])] - self.step_list = [convert_to_int(step_id) for step_id in input_param.get('step_id', [])] + rank_id_list = input_param.get('rank_id', []) + step_id_list = input_param.get('step_id', []) + if not isinstance(rank_id_list, list) or not isinstance(step_id_list, list): + logger.error("'rank_id' and 'step_id' should both be lists, please check!") + raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) + self.rank_list = [convert_to_int(rank_id) for rank_id in rank_id_list] + self.step_list = [convert_to_int(step_id) for step_id in step_id_list] # split by rank and step, generate rank step path self.npu_rank_step_dict = self.generate_rank_step_path(self.base_npu_path) self.bench_rank_step_dict = self.generate_rank_step_path(self.base_bench_path) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/utils.py b/debug/accuracy_tools/msprobe/mindspore/compare/utils.py index 7a9c78e8f7..a6f9f4ae55 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/utils.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/utils.py @@ -17,7 +17,8 @@ import os from msprobe.core.common.const import Const from msprobe.core.common.file_utils import load_npy, FileChecker, FileCheckConst -from msprobe.core.common.utils import detect_framework_by_dump_json +from msprobe.core.common.utils import detect_framework_by_dump_json, CompareException, check_op_str_pattern_valid +from msprobe.core.common.log import logger def read_npy_data(dir_path, file_name): @@ -35,3 +36,10 @@ def read_npy_data(dir_path, file_name): def check_cross_framework(bench_json_path): framework = detect_framework_by_dump_json(bench_json_path) return framework == Const.PT_FRAMEWORK + + +def check_name_map_dict(name_map_dict): 
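+    """校验map_dict必须为dict,且其字符串内容需通过算子名称的合法字符检查。"""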
+ if not isinstance(name_map_dict, dict): + logger.error("'map_dict' should be a dict, please check!") + raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) + check_op_str_pattern_valid(str(name_map_dict)) diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py index b706a75445..6f8ad5cf60 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py @@ -13,43 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -from msprobe.core.common.utils import CompareException -from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json -from msprobe.pytorch.common.log import logger +from msprobe.core.compare.utils import compare_distributed_inner from msprobe.pytorch.compare.pt_compare import compare def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): - if kwargs.get("suffix"): - logger.error("Argument 'suffix' is not supported for compare_distributed.") - raise CompareException(CompareException.INVALID_PARAM_ERROR) - is_print_compare_log = kwargs.get("is_print_compare_log", True) - # get the ranks and match by order - npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank')) - bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank')) - if len(npu_ranks) != len(bench_ranks): - logger.error( - "The number of ranks in the two runs are different. " - "Unable to match the ranks. " - "Please use another folder to compare or use compare() api and manually match the ranks.") - raise CompareException(CompareException.INVALID_PATH_ERROR) - for nr, br in zip(npu_ranks, bench_ranks): - npu_data_dir = os.path.join(npu_dump_dir, nr) - bench_data_dir = os.path.join(bench_dump_dir, br) - npu_path = extract_json(npu_data_dir, stack_json=False) - bench_path = extract_json(bench_data_dir, stack_json=False) - - dump_result_param = { - "npu_json_path": npu_path, - "bench_json_path": bench_path, - "is_print_compare_log": is_print_compare_log - } - try: - compare(input_param=dump_result_param, output_path=output_path, suffix=f'_{nr}', **kwargs) - except CompareException as e: - if e.code == CompareException.INVALID_DATA_ERROR: - logger.error(f"Invalid or missing 'data' in dump.json. Skipping {nr} comparison.") - if e.code == CompareException.INVALID_TASK_ERROR: - logger.error(f"Invalid or missing 'task' in dump.json. Skipping {nr} comparison.") + compare_distributed_inner(npu_dump_dir, bench_dump_dir, output_path, compare, **kwargs) diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py index 8acaf70c3e..b73c52dce3 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from msprobe.core.common.utils import CompareException +from msprobe.core.common.log import logger from msprobe.core.compare.acc_compare import Comparator, ModeConfig, MappingConfig, setup_comparison from msprobe.pytorch.compare.utils import read_pt_data @@ -24,10 +26,13 @@ def read_real_data(npu_dir, npu_data_name, bench_dir, bench_data_name, _) -> tup def compare(input_param, output_path, **kwargs): + if not isinstance(input_param, dict): + logger.error("input_param should be dict, please check!") + raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) config = setup_comparison(input_param, output_path, **kwargs) - mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match, config.dump_mode, - config.first_diff_analyze) + mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match, + config.dump_mode, config.compared_file_type) mapping_config = MappingConfig(data_mapping=config.data_mapping) pt_comparator = Comparator(read_real_data, mode_config, mapping_config) pt_comparator.compare_core(input_param, output_path, suffix=config.suffix) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py index e6b2bbf059..4916fe6ce0 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py @@ -54,6 +54,7 @@ from msprobe.core.common.utils import (CompareException, is_json_file, detect_framework_by_dump_json, is_save_variable_valid, + get_file_type, check_dump_json_key) from msprobe.core.common.decorator import recursion_depth_decorator @@ -220,23 +221,49 @@ class TestUtils(TestCase): } input_param["npu_json_path"] = "npu_path" - with patch("msprobe.core.common.utils.load_json", return_value=npu_json): + with patch("msprobe.core.common.utils.load_json", return_value=npu_json), \ + patch("msprobe.core.common.utils.get_file_type", return_value=Const.DUMP_JSON_FILE): dump_mode = get_dump_mode(input_param) self.assertEqual(dump_mode, Const.ALL) npu_json["task"] = Const.STATISTICS with patch("msprobe.core.common.utils.load_json", return_value=npu_json), \ - patch("msprobe.core.common.utils.md5_find", return_value=True): + patch("msprobe.core.common.utils.md5_find", return_value=True), \ + patch("msprobe.core.common.utils.get_file_type", return_value=Const.DUMP_JSON_FILE): dump_mode = get_dump_mode(input_param) self.assertEqual(dump_mode, Const.MD5) npu_json["task"] = Const.OVERFLOW_CHECK - with patch("msprobe.core.common.utils.load_json", return_value=npu_json): + with patch("msprobe.core.common.utils.load_json", return_value=npu_json), \ + patch("msprobe.core.common.utils.get_file_type", return_value=Const.DUMP_JSON_FILE): with self.assertRaises(CompareException) as context: dump_mode = get_dump_mode(input_param) self.assertEqual(context.exception.code, CompareException.INVALID_TASK_ERROR) mock_error.assert_called_with("Compare applies only to task is tensor or statistics") + def test_get_file_type(self): + # 测试有效的 file_path (dump.json) + file_path = 'path/to/dump.json' + expected_file_type = Const.DUMP_JSON_FILE + self.assertEqual(get_file_type(file_path), expected_file_type) + + # 测试有效的 file_path (debug.json) + file_path = 'path/to/debug.json' + expected_file_type = Const.DEBUG_JSON_FILE + self.assertEqual(get_file_type(file_path), expected_file_type) + + # 测试无效的 file_path + file_path = 'path/to/unknown.json' + with self.assertRaises(CompareException) as context: + 
get_file_type(file_path) + self.assertEqual(context.exception.code, CompareException.INVALID_PATH_ERROR) + + # 测试非字符串类型的 file_path + file_path = 12345 # 非字符串类型 + with self.assertRaises(CompareException) as context: + get_file_type(file_path) + self.assertEqual(context.exception.code, CompareException.INVALID_PATH_ERROR) + @patch('msprobe.core.common.file_utils.get_file_content_bytes') def test_get_json_contents_should_raise_exception(self, mock_get_file_content_bytes): mock_get_file_content_bytes.return_value = 'not a dict' diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py index ee15d9b06e..639be03606 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py @@ -343,13 +343,15 @@ class TestUtilsMethods(unittest.TestCase): op_name = 'Functional.linear.0.forward' stack_json_data = {'Functional.linear.0.forward': ['File']} merge_list = { + 'debug_struct': [], 'input_struct': [('torch.float32', [2, 2])], 'op_name': ['Functional.linear.0.forward.input.0'], 'output_struct': [], 'params_struct': [], 'params_grad_struct': [], 'stack_info': [['File']], - 'summary': [[1, 1, 1, 1]] + 'summary': [[1, 1, 1, 1]], + 'state': ['input'] } stack_mode = True @@ -393,10 +395,10 @@ class TestUtilsMethods(unittest.TestCase): o_data = [ ['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', '[2, 2]', '[2, 2]', 0, 0, 0, 0, '0.0%', 'N/A', '0.0%', '0.0%', - 2, 0, 1, 1, 2, 0, 1, 1, '', '', ['File'] + 2, 0, 1, 1, 2, 0, 1, 1, '', '', ['File'], 'input', 'Functional.linear.0.forward' ] ] - columns = CompareConst.SUMMARY_COMPARE_RESULT_HEADER + ['NPU_Stack_Info'] + columns = CompareConst.SUMMARY_COMPARE_RESULT_HEADER + ['NPU_Stack_Info'] + ['state', 'api_origin_name'] o_result = pd.DataFrame(o_data, columns=columns, dtype=object) self.assertTrue(np.array_equal(result.to_numpy(), o_result.to_numpy())) @@ -425,8 +427,8 @@ class TestParseData(unittest.TestCase): npu_df, bench_df = parse_data.parse(file_list) target_df = pd.DataFrame( - [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File']]], - columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info'] + [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'input', 'Functional.linear.0.forward']], + columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name'] ) self.assertTrue(npu_df.equals(target_df)) self.assertTrue(bench_df.equals(target_df)) @@ -443,8 +445,8 @@ class TestParseData(unittest.TestCase): npu_df = parse_data.gen_data_df(npu_json_data, stack_json_data) target_df = pd.DataFrame( - [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File']]], - columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info'] + [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'input', 'Functional.linear.0.forward']], + columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name'] ) self.assertTrue(npu_df.equals(target_df)) @@ -460,8 +462,8 @@ class TestParseData(unittest.TestCase): npu_df = parse_data.gen_data_df(npu_json_data, stack_json_data) target_df = pd.DataFrame( - [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'Functional.linear.0.forward.input.0.pt']], - 
columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'data_name'] + [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'input', 'Functional.linear.0.forward', 'Functional.linear.0.forward.input.0.pt']], + columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name', 'data_name'] ) self.assertTrue(npu_df.equals(target_df)) @@ -477,8 +479,8 @@ class TestParseData(unittest.TestCase): npu_df = parse_data.gen_data_df(npu_json_data, stack_json_data) target_df = pd.DataFrame( - [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 123456]], - columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'md5'] + [['Functional.linear.0.forward.input.0', 'torch.float32', [2, 2], [2, 0, 1, 1], ['File'], 'input', 'Functional.linear.0.forward', 123456]], + columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name', 'md5'] ) self.assertTrue(npu_df.equals(target_df)) @@ -494,13 +496,15 @@ class TestParseData(unittest.TestCase): merge_list = parse_data.gen_merge_list(npu_json_data, 'Functional.linear.0.forward', stack_json_data) target_dict = { + 'debug_struct': [], 'input_struct': [('torch.float32', [2, 2])], 'op_name': ['Functional.linear.0.forward.input.0'], 'output_struct': [], 'params_grad_struct': [], 'params_struct': [], 'stack_info': [['File']], - 'summary': [[2, 0, 1, 1]] + 'summary': [[2, 0, 1, 1]], + 'state': ['input'] } self.assertEqual(merge_list, target_dict) @@ -670,13 +674,14 @@ class TestMatch(unittest.TestCase): match = Match(mode_config, mapping_config, cross_frame=False) match_result = pd.DataFrame(columns=CompareConst.MATCH_RESULT_COLUMNS) - npu_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2]], - index=['op_name_x', 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'data_name_x', + npu_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2]], + index=['op_name_x', 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', + 'state_x', 'api_origin_name_x', 'data_name_x', 'compare_key', 'compare_shape'] ) match_result = match.put_unmatched_in_table(match_result, npu_op_item) - target_match_result = pd.DataFrame([['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2], - 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']], + target_match_result = pd.DataFrame([['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2], + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']], columns=CompareConst.MATCH_RESULT_COLUMNS) self.assertTrue(match_result.equals(target_match_result)) @@ -686,17 +691,17 @@ class TestMatch(unittest.TestCase): match = Match(mode_config, mapping_config, cross_frame=False) match_result = pd.DataFrame(columns=CompareConst.MATCH_RESULT_COLUMNS) - npu_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2]], - index=['op_name_x', 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'data_name_x', + npu_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2]], + index=['op_name_x', 'dtype_x', 'shape_x', 'summary_x', 'stack_info_x', 'state_x', 'api_origin_name_x', 'data_name_x', 'compare_key', 'compare_shape'] ) - bench_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2]], - index=['op_name_y', 'dtype_y', 'shape_y', 'summary_y', 
'stack_info_y', 'data_name_y', + bench_op_item = pd.Series(['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2]], + index=['op_name_y', 'dtype_y', 'shape_y', 'summary_y', 'stack_info_y', 'state_y', 'api_origin_name_y', 'data_name_y', 'compare_key', 'compare_shape'] ) match_result = match.put_matched_in_table(match_result, npu_op_item, bench_op_item) - target_match_result = pd.DataFrame([['op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name', 'op', [1, 2], - 'op', 'float32', [1, 2], 'summary', 'stack_info', 'data_name']], + target_match_result = pd.DataFrame([['op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name', 'op', [1, 2], + 'op', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'op_origin', 'data_name']], columns=CompareConst.MATCH_RESULT_COLUMNS) self.assertTrue(match_result.equals(target_match_result)) @@ -739,19 +744,19 @@ class TestMatch(unittest.TestCase): match = Match(mode_config, mapping_config, cross_frame=False) npu_df = pd.DataFrame([ - ['Functional.conv2d.3.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.conv2d.3.forward.input.0.pt', 'Functional.conv2d.3.forward.input.0', [1, 2]], - ['Functional.amax.1.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.1.forward.input.0', [1, 2]] - ], columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'data_name', 'compare_key', 'compare_shape']) + ['Functional.conv2d.3.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.conv2d.3.forward','Functional.conv2d.3.forward.input.0.pt', 'Functional.conv2d.3.forward.input.0', [1, 2]], + ['Functional.amax.1.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.amax.1.forward', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.1.forward.input.0', [1, 2]] + ], columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name', 'data_name', 'compare_key', 'compare_shape']) bench_df = pd.DataFrame([ - ['Functional.conv2d.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.conv2d.0.forward.input.0.pt', 'Functional.conv2d.0.forward.input.0', [1, 2]], - ['Functional.amax.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.0.forward.input.0', [1, 2]] - ], columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'data_name', 'compare_key', 'compare_shape']) + ['Functional.conv2d.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.conv2d.0.forward', 'Functional.conv2d.0.forward.input.0.pt', 'Functional.conv2d.0.forward.input.0', [1, 2]], + ['Functional.amax.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.amax.0.forward', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.0.forward.input.0', [1, 2]] + ], columns=['op_name', 'dtype', 'shape', 'summary', 'stack_info', 'state', 'api_origin_name', 'data_name', 'compare_key', 'compare_shape']) match_result = match.process_fuzzy_match(npu_df, bench_df) expected = pd.DataFrame( [ - ['Functional.conv2d.3.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.conv2d.3.forward.input.0.pt', 'Functional.conv2d.3.forward.input.0', [1, 2], 'Functional.conv2d.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.conv2d.0.forward.input.0.pt'], - ['Functional.amax.1.forward.input.0', 
'float32', [1, 2], 'summary', 'stack_info', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.1.forward.input.0', [1, 2], 'Functional.amax.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'Functional.amax.0.forward.input.0.pt'] + ['Functional.conv2d.3.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.conv2d.3.forward', 'Functional.conv2d.3.forward.input.0.pt', 'Functional.conv2d.3.forward.input.0', [1, 2], 'Functional.conv2d.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.conv2d.0.forward', 'Functional.conv2d.0.forward.input.0.pt'], + ['Functional.amax.1.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.amax.1.forward', 'Functional.amax.0.forward.input.0.pt', 'Functional.amax.1.forward.input.0', [1, 2], 'Functional.amax.0.forward.input.0', 'float32', [1, 2], 'summary', 'stack_info', 'input', 'Functional.amax.0.forward', 'Functional.amax.0.forward.input.0.pt'] ] , columns=CompareConst.MATCH_RESULT_COLUMNS) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index 6265e31cfc..4d264dd74c 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -13,8 +13,7 @@ from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.utils import CompareException from msprobe.core.compare.utils import ApiItemInfo, _compare_parser, check_and_return_dir_contents, extract_json, \ count_struct, get_accuracy, get_rela_diff_summary_mode, merge_tensor, op_item_parse, read_op, result_item_init, \ - stack_column_process, table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, \ - gen_op_item, ApiBatch + stack_column_process, table_value_is_valid, reorder_op_name_list, reorder_op_x_list, gen_op_item # test_read_op_1 op_data = { @@ -32,15 +31,15 @@ op_name = "Tensor.add_0.0.forward" op_result = [ {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, 'data_name': '-1', - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.forward.input.0'}, + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.forward.input.0', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'md5': '00000000', 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, 'data_name': '-1', - 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_0.0.forward.input.1'}, + 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_0.0.forward.input.1', 'state': 'input'}, {'full_op_name': 'Tensor.add_0.0.forward.input.alpha', 'dtype': "", 'shape': '[]', 'md5': '0dae4479', - 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1', 'type': 'float', 'value': -0.1}, + 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1', 'type': 'float', 'value': -0.1, 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, 'data_name': '-1', - 'Norm': 2.2533628940582275, 'requires_grad': True, 
'full_op_name': 'Tensor.add_0.0.forward.output.0'}] + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.forward.output.0', 'state': 'output'}] # test_read_op_1 op_data_b = { @@ -57,13 +56,13 @@ op_name_b = "Tensor.add_0.0.backward" op_result_b = [ {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'data_name': '-1', 'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.backward.input.0'}, + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.backward.input.0', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'data_name': '-1', 'md5': '00000000', 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, - 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_0.0.backward.input.1'}, + 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_0.0.backward.input.1', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'data_name': '-1', 'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.backward.output.0'}] + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.backward.output.0', 'state': 'output'}] # test_op_item_parse parse_item = [ @@ -79,12 +78,12 @@ parse_top_bool = True o_result_parse = [ {'Max': 4097.0, 'Mean': 820.2, 'Min': 0.0, 'Norm': 4097.0, 'dtype': 'torch.int64', 'requires_grad': False, 'shape': [5], 'type': 'torch.Tensor', 'full_op_name': 'Distributed.broadcast.0.forward.input.0', - 'data_name': '-1', 'md5': '00000000'}, + 'data_name': '-1', 'md5': '00000000', 'state': 'input'}, {'full_op_name': 'Distributed.broadcast.0.forward.input.1', 'dtype': "", 'shape': '[]', - 'md5': 'f4dbdf21', 'Max': 0, 'Min': 0, 'Mean': 0, 'Norm': 0, 'data_name': '-1', 'type': 'int', 'value': 0}, + 'md5': 'f4dbdf21', 'Max': 0, 'Min': 0, 'Mean': 0, 'Norm': 0, 'data_name': '-1', 'type': 'int', 'value': 0, 'state': 'input'}, {'Max': None, 'Mean': None, 'Min': None, 'Norm': None, 'data_name': '-1', 'dtype': 'slice', 'type': 'slice', 'full_op_name': 'Distributed.broadcast.0.forward.input.2', 'md5': '5fbbe87f', 'shape': '(3,)', - 'value': [None, None, None]} + 'value': [None, None, None], 'state': 'input'} ] # test_resolve_api_special_parameters @@ -255,15 +254,15 @@ o_result_unmatch_3 = [ tensor_list = [ {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, 'Norm': 2.2533628940582275, 'requires_grad': True, - 'full_op_name': 'Tensor.add_.0.forward.input.0'}, + 'full_op_name': 'Tensor.add_.0.forward.input.0', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, - 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_.0.forward.input.1'}, + 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_.0.forward.input.1', 'state': 'input'}, {'full_op_name': 'Tensor.add_.0.forward.input.alpha.0', 'dtype': "", "shape": '[]', 'md5': None, - 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 
'Norm': -0.1, 'data_name': '-1'}, + 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_.0.forward.output.0'} + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_.0.forward.output.0', 'state': 'output'} ] result_op_dict = {'op_name': ['Tensor.add_.0.forward.input.0', 'Tensor.add_.0.forward.input.1', 'Tensor.add_.0.forward.input.alpha.0', 'Tensor.add_.0.forward.output.0'], @@ -272,22 +271,24 @@ result_op_dict = {'op_name': ['Tensor.add_.0.forward.input.0', 'Tensor.add_.0.fo 'output_struct': [('torch.float32', [16, 1, 3, 3])], 'params_struct': [], 'params_grad_struct': [], + 'debug_struct': [], 'summary': [[0.33033010363578796, -0.331031858921051, -0.030964046716690063, 2.2533628940582275], [0.003992878366261721, -0.008102823048830032, -0.0002002553956117481, 0.02844562754034996], [-0.1, -0.1, -0.1, -0.1], [0.33033010363578796, -0.331031858921051, -0.030964046716690063, 2.2533628940582275]], - 'stack_info': []} + 'stack_info': [], + 'state': ['input', 'input', 'input', 'output']} tensor_list_md5 = [ {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, - 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_.0.forward.input.0', 'md5': 1}, + 'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_.0.forward.input.0', 'md5': 1, 'state': 'input'}, {'full_op_name': 'Tensor.add_.0.forward.kwargs.alpha.0', 'dtype': "", "shape": '[]', 'md5': None, - 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1'}, + 'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1', 'state': 'input'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, - 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_.0.forward.output.0', 'md5': 2} + 'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_.0.forward.output.0', 'md5': 2, 'state': 'output'} ] result_op_dict_md5 = {'op_name': ['Tensor.add_.0.forward.input.0', 'Tensor.add_.0.forward.kwargs.alpha.0', 'Tensor.add_.0.forward.output.0'], @@ -295,18 +296,20 @@ result_op_dict_md5 = {'op_name': ['Tensor.add_.0.forward.input.0', 'Tensor.add_. 
'output_struct': [('torch.float32', [16, 1, 3, 3], 2)], 'params_struct': [], 'params_grad_struct': [], + 'debug_struct': [], 'summary': [ [0.003992878366261721, -0.008102823048830032, -0.0002002553956117481, 0.02844562754034996], [-0.1, -0.1, -0.1, -0.1], [0.33033010363578796, -0.331031858921051, -0.030964046716690063, 2.2533628940582275]], - 'stack_info': []} + 'stack_info': [], + 'state': ['input', 'input', 'output']} base_dir1 = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'test_acc_compare_utils1') base_dir2 = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'test_acc_compare_utils2') def create_json_files(base_dir): - file_names = ['dump.json', 'stack.json', 'construct.json'] + file_names = ['dump.json', 'stack.json', 'construct.json', 'debug.json'] for file_name in file_names: file_path = os.path.join(base_dir, file_name) @@ -339,12 +342,15 @@ class TestUtilsMethods(unittest.TestCase): def test_extract_json_1(self): create_json_files(base_dir1) - result = extract_json(base_dir1, stack_json=False) + result = extract_json(base_dir1, Const.DUMP_JSON_FILE) self.assertEqual(result, os.path.join(base_dir1, 'dump.json')) - result = extract_json(base_dir1, stack_json=True) + result = extract_json(base_dir1, Const.STACK_JSON_FILE) self.assertEqual(result, os.path.join(base_dir1, 'stack.json')) + result = extract_json(base_dir1, Const.DEBUG_JSON_FILE) + self.assertEqual(result, os.path.join(base_dir1, 'debug.json')) + def test_check_and_return_dir_contents(self): create_rank_dirs(base_dir2) result = check_and_return_dir_contents(base_dir2, 'rank') @@ -359,12 +365,12 @@ class TestUtilsMethods(unittest.TestCase): self.assertEqual(result, op_result_b) def test_op_item_parse(self): - result = op_item_parse(parse_item, parse_op_name) + result = op_item_parse(parse_item, parse_op_name, 'input') self.assertEqual(result, o_result_parse) def test_op_item_parse_max_depth(self): with self.assertRaises(CompareException) as context: - op_item_parse(parse_item, parse_op_name, depth=11) + op_item_parse(parse_item, parse_op_name, 'input', depth=11) self.assertEqual(context.exception.code, CompareException.RECURSION_LIMIT_ERROR) def test_get_rela_diff_summary_mode_float_or_int(self): @@ -550,57 +556,34 @@ class TestUtilsMethods(unittest.TestCase): self.assertFalse(result) -class TestGetNameAndState(unittest.TestCase): - def test_valid_forward_input(self): - name = 'conv2d.forward.1.input.0' - expected_api = 'conv2d.forward.1.' - expected_state = 'input' - self.assertEqual(get_name_and_state(name), (expected_api, expected_state)) - - def test_valid_backward_output(self): - name = 'Functional.pad.0.backward.output.0' - expected_api = 'Functional.pad.0.backward.' - expected_state = 'output' - self.assertEqual(get_name_and_state(name), (expected_api, expected_state)) - - def test_valid_with_kwargs(self): - name = 'layer.norm.2.forward.kwargs.attr' - expected_api = 'layer.norm.2.forward.' - expected_state = 'kwargs' - self.assertEqual(get_name_and_state(name), (expected_api, expected_state)) - - def test_no_numeric_index(self): - name = 'conv2d.forward.input.0' - expected_api = 'conv2d.forward.' 
- expected_state = 'input' - self.assertEqual(get_name_and_state(name), (expected_api, expected_state)) - - def test_invalid__state(self): - name = 'conv2d.forward.1.invalidstate.0' - with self.assertRaises(CompareException) as context: - get_name_and_state(name) - self.assertIn('Invalid name string', str(context.exception.code)) - - class TestReorderOpNameList(unittest.TestCase): def test_reorder_op_name_list(self): # 标准顺序 op_name_list = ["op.forward.input.0.0", "op.forward.output.0", "op.forward.output.1", "op.forward.parameters.1", "op.forward.parameters.2", "op.parameters_grad.0"] - result = reorder_op_name_list(op_name_list) - expected = ["op.forward.input.0.0", "op.forward.parameters.1", "op.forward.parameters.2", "op.forward.output.0", "op.forward.output.1", "op.parameters_grad.0"] - self.assertEqual(result, expected) + state_list = ["input", "output", "output", "parameters", "parameters", "parameters_grad"] + op_name_reorder, state_reorder = reorder_op_name_list(op_name_list, state_list) + expected_result = ["op.forward.input.0.0", "op.forward.parameters.1", "op.forward.parameters.2", "op.forward.output.0", "op.forward.output.1", "op.parameters_grad.0"] + expected_state = ["input", "parameters", "parameters", "output", "output", "parameters_grad"] + self.assertEqual(op_name_reorder, expected_result) + self.assertEqual(state_reorder, expected_state) # 只有输入元素 op_name_list = ["op.forward.input.0", "op.forward.input.1"] - result = reorder_op_name_list(op_name_list) - expected = ["op.forward.input.0", "op.forward.input.1"] - self.assertEqual(result, expected) + state_list = ["input", "input"] + op_name_reorder, state_reorder = reorder_op_name_list(op_name_list, state_list) + expected_result = ["op.forward.input.0", "op.forward.input.1"] + expected_state = ["input", "input"] + self.assertEqual(op_name_reorder, expected_result) + self.assertEqual(state_reorder, expected_state) # 输入为空 op_name_list = [] - result = reorder_op_name_list(op_name_list) - expected = [] - self.assertEqual(result, expected) + state_list = [] + op_name_reorder, state_reorder = reorder_op_name_list(op_name_list, state_list) + expected_result = [] + expected_state = [] + self.assertEqual(op_name_reorder, expected_result) + self.assertEqual(state_reorder, expected_state) class TestReorderOpXList(unittest.TestCase): @@ -609,37 +592,45 @@ class TestReorderOpXList(unittest.TestCase): op_name_list = ["op.forward.input.0", "op.forward.output.0", "op.forward.parameters.weight"] summary_list = ["summary1", "summary2", "summary3"] data_name_list = ["data1", "data2", "data3"] - result_op_name, result_summary, result_data_name = reorder_op_x_list(op_name_list, summary_list, data_name_list) + state_list = ["input", "output", "parameters"] + result_op_name, result_summary, result_data_name, result_state = reorder_op_x_list(op_name_list, summary_list, data_name_list, state_list) self.assertEqual(result_op_name, ["op.forward.input.0", "op.forward.parameters.weight", "op.forward.output.0"]) self.assertEqual(result_summary, ["summary1", "summary3", "summary2"]) self.assertEqual(result_data_name, ["data1", "data3", "data2"]) + self.assertEqual(result_state, ["input", "parameters", "output"]) # 空 op_name_list 或 summary_list op_name_list = [] summary_list = [] data_name_list = ["data1", "data2", "data3"] - result_op_name, result_summary, result_data_name = reorder_op_x_list(op_name_list, summary_list, data_name_list) + state_list = [] + result_op_name, result_summary, result_data_name, result_state = reorder_op_x_list(op_name_list, 
summary_list, data_name_list, state_list) self.assertEqual(result_op_name, []) self.assertEqual(result_summary, []) self.assertEqual(result_data_name, ["data1", "data2", "data3"]) + self.assertEqual(result_state, []) # 空 data_name_list op_name_list = ["op.forward.input.0", "op.forward.output.0", "op.forward.parameters.weight"] summary_list = ["summary1", "summary2", "summary3"] data_name_list = [] - result_op_name, result_summary, result_data_name = reorder_op_x_list(op_name_list, summary_list, data_name_list) + state_list = ["input", "output", "parameters"] + result_op_name, result_summary, result_data_name, result_state = reorder_op_x_list(op_name_list, summary_list, data_name_list, state_list) self.assertEqual(result_op_name, ["op.forward.input.0", "op.forward.parameters.weight", "op.forward.output.0"]) self.assertEqual(result_summary, ["summary1", "summary3", "summary2"]) self.assertEqual(result_data_name, []) + self.assertEqual(result_state, ["input", "parameters", "output"]) # data_name_list 为 None op_name_list = ["op.forward.input.0", "op.forward.output.0", "op.forward.parameters.weight"] summary_list = ["summary1", "summary2", "summary3"] data_name_list = None - result_op_name, result_summary, result_data_name = reorder_op_x_list(op_name_list, summary_list, data_name_list) + state_list = ["input", "output", "parameters"] + result_op_name, result_summary, result_data_name, result_state = reorder_op_x_list(op_name_list, summary_list, data_name_list, state_list) self.assertEqual(result_op_name, ["op.forward.input.0", "op.forward.parameters.weight", "op.forward.output.0"]) self.assertEqual(result_summary, ["summary1", "summary3", "summary2"]) self.assertEqual(result_data_name, None) + self.assertEqual(result_state, ["input", "parameters", "output"]) class TestGenOpItem(unittest.TestCase): @@ -657,7 +648,7 @@ class TestGenOpItem(unittest.TestCase): } op_name = 'op_test' - result = gen_op_item(op_data, op_name) + result = gen_op_item(op_data, op_name, 'input') self.assertEqual(result['data_name'], 'test_data') self.assertEqual(result['full_op_name'], 'test_data') @@ -668,6 +659,7 @@ class TestGenOpItem(unittest.TestCase): self.assertEqual(result['Mean'], 2) self.assertEqual(result['Norm'], 2) self.assertEqual(result['md5'], f"{zlib.crc32(str(op_data['value']).encode()):08x}") + self.assertEqual(result['state'], 'input') def test_gen_op_item_with_empty_data_name(self): op_data = { @@ -677,11 +669,12 @@ class TestGenOpItem(unittest.TestCase): } op_name = 'op_test' - result = gen_op_item(op_data, op_name) + result = gen_op_item(op_data, op_name, 'input') # data_name为空时,应该被设置为'-1' self.assertEqual(result['data_name'], '-1') self.assertEqual(result['full_op_name'], op_name) + self.assertEqual(result['state'], 'input') def test_gen_op_item_with_none_data_name(self): op_data = { @@ -691,11 +684,12 @@ class TestGenOpItem(unittest.TestCase): } op_name = 'op_test' - result = gen_op_item(op_data, op_name) + result = gen_op_item(op_data, op_name, 'input') # data_name为None时,应该被设置为'-1' self.assertEqual(result['data_name'], '-1') self.assertEqual(result['full_op_name'], op_name) + self.assertEqual(result['state'], 'input') def test_gen_op_item_with_type_torch_size(self): op_data = { @@ -705,7 +699,7 @@ class TestGenOpItem(unittest.TestCase): } op_name = 'op_test' - result = gen_op_item(op_data, op_name) + result = gen_op_item(op_data, op_name, 'input') self.assertEqual(result['dtype'], 'torch.Size') self.assertEqual(result['shape'], '[2, 3, 4]') @@ -713,6 +707,7 @@ class 
TestGenOpItem(unittest.TestCase):
         self.assertEqual(result['Min'], None)
         self.assertEqual(result['Mean'], None)
         self.assertEqual(result['Norm'], None)
+        self.assertEqual(result['state'], 'input')
 
     def test_gen_op_item_with_type_slice(self):
         op_data = {
@@ -722,10 +717,11 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
 
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
 
         self.assertEqual(result['dtype'], 'slice')
         self.assertEqual(result['shape'], str(np.shape(np.array(op_data['value']))))
+        self.assertEqual(result['state'], 'input')
 
     def test_gen_op_item_with_type_ellipsis(self):
         op_data = {
@@ -735,7 +731,7 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
 
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
 
         self.assertEqual(result['dtype'], 'ellipsis')
         self.assertEqual(result['shape'], '[]')
@@ -743,6 +739,7 @@ class TestGenOpItem(unittest.TestCase):
         self.assertEqual(result['Min'], '...')
         self.assertEqual(result['Mean'], '...')
         self.assertEqual(result['Norm'], '...')
+        self.assertEqual(result['state'], 'input')
 
     def test_gen_op_item_with_type_torch_process_group(self):
         op_data = {
@@ -752,7 +749,7 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
 
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
 
         self.assertEqual(result['dtype'], 'torch.ProcessGroup')
         self.assertEqual(result['shape'], '[]')
@@ -760,6 +757,7 @@ class TestGenOpItem(unittest.TestCase):
         self.assertEqual(result['Min'], '[0, 1]')
         self.assertEqual(result['Mean'], '[0, 1]')
         self.assertEqual(result['Norm'], '[0, 1]')
+        self.assertEqual(result['state'], 'input')
 
     def test_gen_op_item_with_default_dtype(self):
         op_data = {
@@ -769,10 +767,11 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
 
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
 
         self.assertEqual(result['dtype'], str(type(op_data['value'])))
         self.assertEqual(result['shape'], '[]')
+        self.assertEqual(result['state'], 'input')
 
     def test_gen_op_item_with_md5(self):
         op_data = {
@@ -782,89 +781,8 @@ class TestGenOpItem(unittest.TestCase):
         }
         op_name = 'op_test'
 
-        result = gen_op_item(op_data, op_name)
+        result = gen_op_item(op_data, op_name, 'input')
 
         expected_md5 = f"{zlib.crc32(str(op_data['value']).encode()):08x}"
         self.assertEqual(result['md5'], expected_md5)
-
-
-class TestApiBatch(unittest.TestCase):
-    def test_ApiBatch_increment_input(self):
-        api_name = "functional.conv2d"
-        start = 2
-        api_batch = ApiBatch(api_name, start)
-
-        api_batch.increment(Const.INPUT)
-
-        self.assertEqual(api_batch._state, Const.INPUT)
-        self.assertEqual(api_batch.input_len, 2)
-        self.assertEqual(api_batch.params_end_index, 4)
-        self.assertEqual(api_batch.output_end_index, 4)
-        self.assertEqual(api_batch.params_grad_end_index, 4)
-
-    def test_ApiBatch_increment_output(self):
-        api_name = "functional.conv2d"
-        start = 2
-        api_batch = ApiBatch(api_name, start)
-
-        api_batch.increment(Const.OUTPUT)
-
-        self.assertEqual(api_batch._state, Const.OUTPUT)
-        self.assertEqual(api_batch.input_len, 1)
-        self.assertEqual(api_batch.params_end_index, 3)
-        self.assertEqual(api_batch.output_end_index, 4)
-        self.assertEqual(api_batch.params_grad_end_index, 4)
-
-    def test_ApiBatch_increment_kwargs(self):
-        api_name = "functional.conv2d"
-        start = 2
-        api_batch = ApiBatch(api_name, start)
-
-        api_batch.increment(Const.KWARGS)
-
-        self.assertEqual(api_batch._state, Const.KWARGS)
-        self.assertEqual(api_batch.input_len, 2)
-        self.assertEqual(api_batch.params_end_index, 4)
-        self.assertEqual(api_batch.output_end_index, 4)
-        self.assertEqual(api_batch.params_grad_end_index, 4)
-
-    def test_ApiBatch_increment_params(self):
-        api_name = "functional.conv2d"
-        start = 2
-        api_batch = ApiBatch(api_name, start)
-
-        api_batch.increment(Const.PARAMS)
-
-        self.assertEqual(api_batch._state, Const.PARAMS)
-        self.assertEqual(api_batch.input_len, 1)
-        self.assertEqual(api_batch.params_end_index, 4)
-        self.assertEqual(api_batch.output_end_index, 4)
-        self.assertEqual(api_batch.params_grad_end_index, 4)
-
-    def test_ApiBatch_increment_multiple_input(self):
-        api_name = "functional.conv2d"
-        start = 2
-        api_batch = ApiBatch(api_name, start)
-
-        api_batch.increment(Const.INPUT)
-        api_batch.increment(Const.INPUT)
-
-        self.assertEqual(api_batch._state, Const.INPUT)
-        self.assertEqual(api_batch.input_len, 3)
-        self.assertEqual(api_batch.params_end_index, 5)
-        self.assertEqual(api_batch.output_end_index, 5)
-        self.assertEqual(api_batch.params_grad_end_index, 5)
-
-    def test_ApiBatch_increment_multiple_output(self):
-        api_name = "functional.conv2d"
-        start = 2
-        api_batch = ApiBatch(api_name, start)
-
-        api_batch.increment(Const.OUTPUT)
-        api_batch.increment(Const.OUTPUT)
-
-        self.assertEqual(api_batch._state, Const.OUTPUT)
-        self.assertEqual(api_batch.input_len, 1)
-        self.assertEqual(api_batch.params_end_index, 3)
-        self.assertEqual(api_batch.output_end_index, 5)
-        self.assertEqual(api_batch.params_grad_end_index, 5)
+        self.assertEqual(result['state'], 'input')
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py
index 5d01c3fdcb..933c846013 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py
@@ -13,10 +13,9 @@ from openpyxl import load_workbook
 from openpyxl.styles import PatternFill
 
 from msprobe.core.common.const import CompareConst, Const
-from msprobe.core.compare.highlight import CheckMaxRelativeDiff, CheckOrderMagnitude, \
+from msprobe.core.compare.highlight import ApiBatch, CheckMaxRelativeDiff, CheckOrderMagnitude, \
     CheckOneThousandErrorRatio, CheckCosineSimilarity, add_highlight_row_info, HighLight
 from msprobe.core.compare.config import ModeConfig
-from msprobe.core.compare.utils import ApiBatch
 
 summary_line_input = ['Functional_batch_norm_0_forward.input.0', 'Functional_batch_norm_0_forward.input.0',
@@ -38,19 +37,19 @@ summary_line_3 = ['Functional_batch_norm_0_forward.output.2', 'Functional_batch_
 line_input = ['Functional.batch.norm.0.forward.input.0', 'Functional.batch.norm.0.forward.input.0',
               'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14],
               1, 0.5, 1, 1, 0.95, 1, 1, 1, 1, 1, 1.01, 1, 1, 1,
-              'Yes', '']
+              'Yes', '', 'input', 'Functional.batch.norm.0.forward']
 line_1 = ['Functional.batch.norm.0.forward.output.0', 'Functional.batch.norm.0.forward.output.0',
           'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14],
           0.8, 0.5, 1, 1, 0.59, 1, 'nan', 0, 1, 1, 19, 1, 1, 1,
-          'Yes', '']
+          'Yes', '', 'output', 'Functional.batch.norm.0.forward']
 line_2 = ['Functional.batch.norm.0.forward.output.1', 'Functional.batch.norm.0.forward.output.1',
           'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14],
           0.9, 0.5, 1, 1, 0.8, 1, 0, 0.12, 0, 1, 1, 0.1, 1, 1,
-          'Yes', '']
+          'Yes', '', 'output', 'Functional.batch.norm.0.forward']
 line_3 = ['Functional.batch.norm.0.forward.output.2', 'Functional.batch.norm.0.forward.output.2',
           'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14],
           0.8, 0.5, 1.1e+10, 1, 0.85, 1, 9, 0.12, 0, 1, 1, 0.1, 1, 1,
-          'Yes', '']
+          'Yes', '', 'output', 'Functional.batch.norm.0.forward']
 
 base_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'test_highlight')
@@ -211,6 +210,87 @@ class TestUtilsMethods(unittest.TestCase):
         result = CheckMaxRelativeDiff().apply(info, color_columns, dump_mode=Const.SUMMARY)
         self.assertEqual(result, None)
 
+    def test_ApiBatch_increment_input(self):
+        api_name = "functional.conv2d"
+        start = 2
+        api_batch = ApiBatch(api_name, start)
+
+        api_batch.increment(Const.INPUT)
+
+        self.assertEqual(api_batch._state, Const.INPUT)
+        self.assertEqual(api_batch.input_len, 2)
+        self.assertEqual(api_batch.params_end_index, 4)
+        self.assertEqual(api_batch.output_end_index, 4)
+        self.assertEqual(api_batch.params_grad_end_index, 4)
+
+    def test_ApiBatch_increment_output(self):
+        api_name = "functional.conv2d"
+        start = 2
+        api_batch = ApiBatch(api_name, start)
+
+        api_batch.increment(Const.OUTPUT)
+
+        self.assertEqual(api_batch._state, Const.OUTPUT)
+        self.assertEqual(api_batch.input_len, 1)
+        self.assertEqual(api_batch.params_end_index, 3)
+        self.assertEqual(api_batch.output_end_index, 4)
+        self.assertEqual(api_batch.params_grad_end_index, 4)
+
+    def test_ApiBatch_increment_kwargs(self):
+        api_name = "functional.conv2d"
+        start = 2
+        api_batch = ApiBatch(api_name, start)
+
+        api_batch.increment(Const.KWARGS)
+
+        self.assertEqual(api_batch._state, Const.KWARGS)
+        self.assertEqual(api_batch.input_len, 2)
+        self.assertEqual(api_batch.params_end_index, 4)
+        self.assertEqual(api_batch.output_end_index, 4)
+        self.assertEqual(api_batch.params_grad_end_index, 4)
+
+    def test_ApiBatch_increment_params(self):
+        api_name = "functional.conv2d"
+        start = 2
+        api_batch = ApiBatch(api_name, start)
+
+        api_batch.increment(Const.PARAMS)
+
+        self.assertEqual(api_batch._state, Const.PARAMS)
+        self.assertEqual(api_batch.input_len, 1)
+        self.assertEqual(api_batch.params_end_index, 4)
+        self.assertEqual(api_batch.output_end_index, 4)
+        self.assertEqual(api_batch.params_grad_end_index, 4)
+
+    def test_ApiBatch_increment_multiple_input(self):
+        api_name = "functional.conv2d"
+        start = 2
+        api_batch = ApiBatch(api_name, start)
+
+        api_batch.increment(Const.INPUT)
+        api_batch.increment(Const.INPUT)
+
+        self.assertEqual(api_batch._state, Const.INPUT)
+        self.assertEqual(api_batch.input_len, 3)
+        self.assertEqual(api_batch.params_end_index, 5)
+        self.assertEqual(api_batch.output_end_index, 5)
+        self.assertEqual(api_batch.params_grad_end_index, 5)
+
+    def test_ApiBatch_increment_multiple_output(self):
+        api_name = "functional.conv2d"
+        start = 2
+        api_batch = ApiBatch(api_name, start)
+
+        api_batch.increment(Const.OUTPUT)
+        api_batch.increment(Const.OUTPUT)
+
+        self.assertEqual(api_batch._state, Const.OUTPUT)
+        self.assertEqual(api_batch.input_len, 1)
+        self.assertEqual(api_batch.params_end_index, 3)
+        self.assertEqual(api_batch.output_end_index, 5)
+        self.assertEqual(api_batch.params_grad_end_index, 5)
+
+
     def test_find_error_rows_normal(self):
         compare_result = np.array([
             ["Functional.linear.0.forward.input.0", "Functional.linear.0.forward.input.0",
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
index 77920cdd98..c30c0c16de 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
@@ -23,7 +23,7 @@ o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.i
           'torch.float32', 'torch.float32', [2, 2], [2, 2],
           'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
           1, 1, 1, 1, 1, 1, 1, 1,
-          'None', 'NPU does not have data file.', ['-1', '-1']]]
+          'None', 'Dump file: None not found or read failed.', ['-1', '-1']]]
 columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name']
 result_df = pd.DataFrame(data, columns=columns)
 o_result = pd.DataFrame(o_data, columns=columns)
@@ -103,7 +103,7 @@ class TestCompareRealData(unittest.TestCase):
         # index error
         with self.assertRaises(CompareException) as context:
             result = compare_real_data.read_dump_data(pd.DataFrame())
-        self.assertEqual(context.exception.code, CompareException.INDEX_OUT_OF_BOUNDS_ERROR)
+        self.assertEqual(context.exception.code, CompareException.INVALID_KEY_ERROR)
 
     def test_save_cmp_result_success(self):
         file_reader = read_real_data
@@ -160,7 +160,7 @@ class TestCompareRealData(unittest.TestCase):
         input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir}
         result = compare_real_data.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
         self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
-                                  'unsupported', 'NPU does not have data file.'])
+                                  'unsupported', 'Dump file: None not found or read failed.'])
 
         pt_name = 'Functional.linear.0.forward.input.0.pt'
         op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]}
@@ -186,7 +186,7 @@ class TestCompareRealData(unittest.TestCase):
         result = compare_real_data.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
 
         self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
-                                  'unsupported', 'NPU does not have data file.'])
+                                  'unsupported', 'Dump file: None not found or read failed.'])
 
     def test_compare_ops(self):
         generate_dump_json(base_dir3)
@@ -221,7 +221,7 @@ class TestCompareRealData(unittest.TestCase):
         o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0',
                    'torch.float32', 'torch.float32', [2, 2], [2, 2],
                    'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
-                   1, 1, 1, 1, 1, 1, 1, 1, 'None', 'NPU does not have data file.', ['-1', '-1']]]
+                   1, 1, 1, 1, 1, 1, 1, 1, 'None', 'Dump file: None not found or read failed.', ['-1', '-1']]]
         columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name']
         result_df = pd.DataFrame(data, columns=columns)
         o_result = pd.DataFrame(o_data, columns=columns)
@@ -235,3 +235,16 @@ class TestCompareRealData(unittest.TestCase):
 
         result = compare_real_data.do_multi_process(input_param, result_df)
         self.assertTrue(result.equals(o_result))
+
+    def test_handle_multi_process(self):
+        file_reader = read_real_data
+        mode_config = ModeConfig(dump_mode=Const.ALL)
+        cross_frame = False
+        compare_real_data = CompareRealData(file_reader, mode_config, cross_frame)
+
+        func = compare_real_data.compare_ops
+        generate_dump_json(base_dir)
+        input_param = {'bench_json_path': os.path.join(base_dir, 'dump.json')}
+        lock = multiprocessing.Manager().RLock()
+        result = compare_real_data._handle_multi_process(func, input_param, result_df, lock)
+        self.assertTrue(result.equals(o_result))
-- 
Gitee
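Note on the test changes in the patch above: every call site now passes an explicit state ('input', 'output', ...) as the third argument of `gen_op_item`, and the returned item echoes it back under the 'state' key. A minimal sketch of the new calling convention (illustrative only, expected values taken from the updated tests; assumes msprobe is importable):

```python
# Sketch, not part of the patch: exercises the three-argument gen_op_item
# signature the way test_gen_op_item_with_type_slice does.
from msprobe.core.compare.utils import gen_op_item

op_data = {'type': 'slice', 'value': [0, 1, 2]}
op_item = gen_op_item(op_data, 'op_test', 'input')

print(op_item['state'])  # 'input' - the state now rides along with each op item
print(op_item['dtype'])  # 'slice'
```
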
From a56126d8e7734b314325334b548b71df64c8b5e4 Mon Sep 17 00:00:00 2001
From: i-robot
Date: Fri, 18 Jul 2025 08:25:16 +0000
Subject: [PATCH 2/4] compare highlight switch poc sync

---
 .../msprobe/core/common/const.py              |  2 +-
 .../msprobe/core/common/utils.py              | 11 +--
 .../msprobe/core/compare/acc_compare.py       | 27 ++++---
 .../msprobe/core/compare/compare_cli.py       |  2 +
 .../msprobe/core/compare/config.py            | 14 ++--
 .../msprobe/core/compare/highlight.py         | 18 +----
 .../msprobe/core/compare/utils.py             |  4 +-
 .../docs/10.accuracy_compare_PyTorch.md       | 31 ++++----
 .../docs/11.accuracy_compare_MindSpore.md     |  7 +-
 .../msprobe/mindspore/compare/ms_compare.py   | 12 ++-
 .../msprobe/pytorch/compare/pt_compare.py     | 11 ++-
 .../test/core_ut/compare/test_acc_compare.py  | 73 +++++++++++--------
 .../core_ut/compare/test_acc_compare_utils.py | 14 ++--
 .../visualization/builder/msprobe_adapter.py  |  8 +-
 14 files changed, 137 insertions(+), 97 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py
index 964c90b7cf..a17288c87f 100644
--- a/debug/accuracy_tools/msprobe/core/common/const.py
+++ b/debug/accuracy_tools/msprobe/core/common/const.py
@@ -445,7 +445,7 @@ class CompareConst:
     SUMMARY = "summary"
     COMPARE_RESULT = "compare_result"
     COMPARE_MESSAGE = "compare_message"
-    MAX_EXCEL_LENGTH = 1048576
+    MAX_EXCEL_LENGTH = 1048500
     YES = "Yes"
     NO = "No"
     STATISTICS_INDICATOR_NUM = 4
diff --git a/debug/accuracy_tools/msprobe/core/common/utils.py b/debug/accuracy_tools/msprobe/core/common/utils.py
index 04e69dccbb..f4ab06070a 100644
--- a/debug/accuracy_tools/msprobe/core/common/utils.py
+++ b/debug/accuracy_tools/msprobe/core/common/utils.py
@@ -156,9 +156,10 @@ def check_compare_param(input_param, output_path, dump_mode, stack_mode):
         _check_json(stack_json, input_param.get("stack_json_path"))
 
 
-def check_configuration_param(stack_mode=False, auto_analyze=True, fuzzy_match=False, is_print_compare_log=True):
-    arg_list = [stack_mode, auto_analyze, fuzzy_match, is_print_compare_log]
-    arg_names = ['stack_mode', 'auto_analyze', 'fuzzy_match', 'is_print_compare_log']
+def check_configuration_param(stack_mode=False, auto_analyze=True, fuzzy_match=False, highlight=False,
+                              is_print_compare_log=True):
+    arg_list = [stack_mode, auto_analyze, fuzzy_match, highlight, is_print_compare_log]
+    arg_names = ['stack_mode', 'auto_analyze', 'fuzzy_match', 'highlight', 'is_print_compare_log']
     for arg, name in zip(arg_list, arg_names):
         if not isinstance(arg, bool):
             logger.error(f"Invalid input parameter, {name} which should be only bool type.")
@@ -464,10 +465,10 @@ def get_real_step_or_rank(step_or_rank_input, obj):
 def check_init_step(step):
     if not is_int(step):
         raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                              f"{step} must be an integer")
+                               f"{step} must be an integer")
     if not step >= 0:
         raise MsprobeException(MsprobeException.INVALID_PARAM_ERROR,
-                              f"{step} must be greater than or equal to 0")
+                               f"{step} must be greater than or equal to 0")
 
 
 def check_token_range(token_range):
diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py
index 7b34b73d2e..1100cbe213 100644
--- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py
+++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py
@@ -25,7 +25,7 @@ from tqdm import tqdm
 from msprobe.core.advisor.advisor import Advisor
 from msprobe.core.common.const import CompareConst, Const
 from msprobe.core.common.exceptions import FileCheckException
-from msprobe.core.common.file_utils import load_json, remove_path, create_directory
+from msprobe.core.common.file_utils import load_json, remove_path, create_directory, save_excel
 from msprobe.core.common.log import logger
 from msprobe.core.common.utils import CompareException, add_time_with_xlsx, check_op_str_pattern_valid, \
     set_dump_path, get_dump_mode, check_compare_param, check_configuration_param, load_stack_json, get_file_type
@@ -43,6 +43,7 @@ class ComparisonConfig:
     stack_mode: bool
     auto_analyze: bool
    fuzzy_match: bool
+    highlight: bool
     data_mapping: dict
     suffix: str
     cell_mapping: dict
@@ -113,13 +114,20 @@ class Comparator:
             compare_real_data = CompareRealData(self.file_reader, self.mode_config, self.cross_frame)
             result_df = compare_real_data.do_multi_process(input_param, result_df)
 
-        # highlight suspicious API
-        highlight_dict = {"red_rows": set(), "yellow_rows": set(), "red_lines": [], "yellow_lines": []}
-        highlight = HighLight(self.mode_config)
-        if self.mode_config.compared_file_type == Const.DUMP_JSON_FILE:
-            highlight.find_compare_result_error_rows(result_df, highlight_dict)
-        result_df.drop(columns=['state', 'api_origin_name'], inplace=True)  # 删除中间数据,两列不落盘
-        highlight.highlight_rows_xlsx(result_df, highlight_dict, file_path)
+        # save result excel file
+        logger.info(f'Saving result excel file in progress. The file path is: {file_path}.')
+        if self.mode_config.highlight and len(result_df) <= CompareConst.MAX_EXCEL_LENGTH:
+            # highlight if not too long
+            highlight_dict = {"red_rows": set(), "yellow_rows": set(), "red_lines": [], "yellow_lines": []}
+            highlight = HighLight(self.mode_config)
+            if self.mode_config.compared_file_type == Const.DUMP_JSON_FILE:
+                highlight.find_compare_result_error_rows(result_df, highlight_dict)
+            result_df.drop(columns=['state', 'api_origin_name'], inplace=True)  # 删除中间数据,两列不落盘
+            highlight.highlight_rows_xlsx(result_df, highlight_dict, file_path)
+        else:
+            # fallback to simple save without highlight
+            result_df.drop(columns=['state', 'api_origin_name'], inplace=True)  # 删除中间数据,两列不落盘
+            save_excel(file_path, result_df)
 
         # output compare analysis suggestions
         if self.mode_config.auto_analyze:
@@ -729,6 +737,7 @@ def setup_comparison(input_param, output_path, **kwargs) -> ComparisonConfig:
             stack_mode=False,
             auto_analyze=kwargs.get('auto_analyze', True),
             fuzzy_match=kwargs.get('fuzzy_match', False),
+            highlight=kwargs.get('highlight', False),
             data_mapping=kwargs.get('data_mapping', {}),
             suffix=kwargs.get('suffix', ''),
             cell_mapping=kwargs.get('cell_mapping', {}),
@@ -747,7 +756,7 @@ def setup_comparison(input_param, output_path, **kwargs) -> ComparisonConfig:
         else:
             config.stack_mode = set_stack_json_path(input_param)
 
-        check_configuration_param(config.stack_mode, config.auto_analyze, config.fuzzy_match,
+        check_configuration_param(config.stack_mode, config.auto_analyze, config.fuzzy_match, config.highlight,
                                   input_param.get('is_print_compare_log', True))
         create_directory(output_path)
         check_compare_param(input_param, output_path, config.dump_mode, config.stack_mode)
diff --git a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py
index 91892a1aa8..bbe93c745b 100644
--- a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py
+++ b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py
@@ -35,6 +35,7 @@ def compare_cli(args):
         raise CompareException(CompareException.INVALID_PATH_ERROR)
     frame_name = args.framework
     auto_analyze = not args.compare_only
+
     if frame_name == Const.PT_FRAMEWORK:
         from msprobe.pytorch.compare.pt_compare import compare
         from msprobe.pytorch.compare.distributed_compare import compare_distributed
@@ -46,6 +47,7 @@ def compare_cli(args):
     common_kwargs = {
         "auto_analyze": auto_analyze,
         "fuzzy_match": args.fuzzy_match,
+        "highlight": args.highlight,
         "data_mapping": args.data_mapping,
     }
diff --git a/debug/accuracy_tools/msprobe/core/compare/config.py b/debug/accuracy_tools/msprobe/core/compare/config.py
index 18ef50c26d..2a6c968072 100644
--- a/debug/accuracy_tools/msprobe/core/compare/config.py
+++ b/debug/accuracy_tools/msprobe/core/compare/config.py
@@ -20,13 +20,13 @@ from msprobe.core.common.file_utils import load_yaml
 
 
 class ModeConfig:
-    def __init__(self, stack_mode=False, auto_analyze=True, fuzzy_match=False, dump_mode=Const.SUMMARY,
-                 compared_file_type=Const.DUMP_JSON_FILE):
-        self.stack_mode = stack_mode
-        self.auto_analyze = auto_analyze
-        self.fuzzy_match = fuzzy_match
-        self.dump_mode = dump_mode
-        self.compared_file_type = compared_file_type
+    def __init__(self, **kwargs):
+        self.stack_mode = kwargs.get('stack_mode', False)
+        self.auto_analyze = kwargs.get('auto_analyze', True)
+        self.fuzzy_match = kwargs.get('fuzzy_match', False)
+        self.highlight = kwargs.get('highlight', False)
+        self.dump_mode = kwargs.get('dump_mode', Const.SUMMARY)
+        self.compared_file_type = kwargs.get('compared_file_type', Const.DUMP_JSON_FILE)
 
 
 class MappingConfig:
diff --git a/debug/accuracy_tools/msprobe/core/compare/highlight.py b/debug/accuracy_tools/msprobe/core/compare/highlight.py
index 37eec42169..9e5fe0559c 100644
--- a/debug/accuracy_tools/msprobe/core/compare/highlight.py
+++ b/debug/accuracy_tools/msprobe/core/compare/highlight.py
@@ -349,28 +349,19 @@ class HighLight:
         self.update_highlight_err_msg(result_df, highlight_dict)  # add highlight err_msg
 
-        wb = openpyxl.Workbook()
-        ws = wb.active
-
-        # write header
-        logger.info('Initializing Excel file.')
-
         self.handle_multi_process_malicious_value_check(self.df_malicious_value_check, result_df)
 
+        wb = openpyxl.Workbook()
+        ws = wb.active
         result_df_convert = result_df.applymap(self.compare_result_df_convert)
-
         for row in dataframe_to_rows(result_df_convert, index=False, header=True):
             ws.append(row)
 
         # 对可疑数据标色
         logger.info('Coloring Excel in progress.')
+        red_fill = PatternFill(start_color=CompareConst.RED, end_color=CompareConst.RED, fill_type="solid")
+        yellow_fill = PatternFill(start_color=CompareConst.YELLOW, end_color=CompareConst.YELLOW, fill_type="solid")
         col_len = len(result_df.columns)
-        red_fill = PatternFill(
-            start_color=CompareConst.RED, end_color=CompareConst.RED, fill_type="solid"
-        )
-        yellow_fill = PatternFill(
-            start_color=CompareConst.YELLOW, end_color=CompareConst.YELLOW, fill_type="solid",
-        )
         for i in highlight_dict.get("red_rows", []):
             for j in range(1, col_len + 1):
                 ws.cell(row=i + 2, column=j).fill = red_fill  # 2因为ws.cell中的row或column需要>=1,数据从第2行开始
@@ -378,7 +369,6 @@ class HighLight:
             for j in range(1, col_len + 1):
                 ws.cell(row=i + 2, column=j).fill = yellow_fill
 
-        logger.info('Saving Excel file to disk: %s' % file_path)
         save_workbook(wb, file_path)
 
     def handle_multi_process_malicious_value_check(self, func, result_df):
diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py
index 0f5ea4a50d..1f028fac6a 100644
--- a/debug/accuracy_tools/msprobe/core/compare/utils.py
+++ b/debug/accuracy_tools/msprobe/core/compare/utils.py
@@ -158,7 +158,7 @@ def is_leaf_data(op_data):
 
 def gen_op_item(op_data, op_name, state):
     op_item = {}
-    op_item.update(op_data)
+    op_item.update({key: str(value) if isinstance(value, bool) else value for key, value in op_data.items()})
     data_name = op_data.get(Const.DATA_NAME) if op_data.get(Const.DATA_NAME) else '-1'  # 如果是""也返回-1
     op_item[Const.DATA_NAME] = data_name
     op_item['full_op_name'] = data_name.rsplit(Const.SEP, 1)[0] if data_name != '-1' else op_name
@@ -543,6 +543,8 @@ def _compare_parser(parser):
                         help="<optional> Whether to give advisor.", required=False)
     parser.add_argument("-f", "--fuzzy_match", dest="fuzzy_match", action="store_true",
                         help="<optional> Whether to perform a fuzzy match on the api name.", required=False)
+    parser.add_argument("-hl", "--highlight", dest="highlight", action="store_true",
+                        help="<optional> Whether to set result highlighting.", required=False)
     parser.add_argument("-cm", "--cell_mapping", dest="cell_mapping", type=str, nargs='?', const=True,
                         help="<optional> The cell mapping file path.", required=False)
     parser.add_argument("-am", "--api_mapping", dest="api_mapping", type=str, nargs='?', const=True,
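With the parser change above, the switch travels from the command line straight through `compare_cli`'s `common_kwargs` (shown earlier in this patch as `{"highlight": args.highlight}`) into `setup_comparison`. A hypothetical minimal parser mirroring the two added lines, for illustration only:

```python
import argparse

# Sketch of the new flag in isolation; msprobe's real parser registers many
# more arguments around it.
parser = argparse.ArgumentParser()
parser.add_argument("-hl", "--highlight", dest="highlight", action="store_true")

args = parser.parse_args(["-hl"])
print(args.highlight)  # True; forwarded downstream as {"highlight": args.highlight}
```

The documentation change that follows records the same switch for CLI users, e.g. `msprobe -f pytorch compare -i ./compare.json -o ./output -hl`.
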
"npu_json_path":指定 NPU dump 目录下的 dump.json 文件。
**配置示例**:"npu_json_path": "./npu_dump/dump.json"。
"bench_json_path":指定 CPU、GPU 或 NPU dump 目录下的 dump.json 文件。
**配置示例**:"bench_json_path": "./bench_dump/dump.json"。
"stack_json_path":指定 NPU dump 目录下的 stack.json 文件。
**配置示例**:"stack_json_path": "./npu_dump/stack.json"。
"is_print_compare_log":配置是否开启单个算子的日志打屏。
**配置示例**:True 或 False。 | 是 | -| output_path | 配置比对结果文件存盘目录,str 类型。
**配置示例**:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.xlsx`。
提示:output目录下与结果件同名文件将被删除覆盖。 | 是 | -| stack_mode | 配置 stack_mode 的开关,bool 类型。仅当配置 stack_json_path 时需要,开启时比对结果呈现NPU_Stack_Info,关闭时不呈现。当不配置stack_json_path 时,自动识别是否存在stack.json,存在时呈现NPU_Stack_Info,否则不呈现。
**配置示例**:stack_mode=True,默认为 False。 | 否 | -| auto_analyze | 自动精度分析,bool 类型。开启后工具自动针对比对结果进行分析,识别到第一个精度可能不达标节点(在比对结果文件中的 Accuracy Reached or Not 列显示为 No),并给出问题可能产生的原因(打屏展示并生成 advisor_{timestamp}.txt 文件)。
**配置示例**:auto_analyze=False,默认为 True。 | 否 | -| fuzzy_match | 模糊匹配,bool 类型。开启后,对于网络中同一层级且命名仅调用次数不同的 API,可匹配并进行比对。
**配置示例**:fuzzy_match=True,默认为 False。 | 否 | +| output_path | 配置比对结果文件存盘目录,str 类型。
**配置示例**:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.xlsx`。
提示:output目录下与结果件同名文件将被删除覆盖。 | 是 | +| stack_mode | 配置 stack_mode 的开关,bool 类型。仅当配置 stack_json_path 时需要,开启时比对结果呈现NPU_Stack_Info,关闭时不呈现。当不配置stack_json_path 时,自动识别是否存在stack.json,存在时呈现NPU_Stack_Info,否则不呈现。
**配置示例**:stack_mode=True,默认为 False。 | 否 | +| auto_analyze | 自动精度分析,bool 类型。开启后工具自动针对比对结果进行分析,识别到第一个精度可能不达标节点(在比对结果文件中的 Accuracy Reached or Not 列显示为 No),并给出问题可能产生的原因(打屏展示并生成 advisor_{timestamp}.txt 文件)。
**配置示例**:auto_analyze=False,默认为 True。 | 否 | +| fuzzy_match | 模糊匹配,bool 类型。开启后,对于网络中同一层级且命名仅调用次数不同的 API,可匹配并进行比对。
**配置示例**:fuzzy_match=True,默认为 False。 | 否 | +| highlight | 高亮颜色标记。开启后,比对结果件中通过红色或黄色标记精度可疑API或模块。 开启高亮颜色标记后,比对性能降低,如果比对结果行数超出excel单页限制,程序强制关闭高亮颜色标记。
**配置示例**:highlight=True,默认为 False。 | 否 | **函数示例**: @@ -221,6 +223,7 @@ PyTorch 精度比对是以 CPU 或 GPU 的计算结果为标杆,通过计算 ### 3.2 颜色标记——真实数据模式、统计数据模式 +通过在命令行中配置-hl或--highlight开启,或者在比对函数中配置参数highlight=True开启,用于标记精度可疑API或模块。开启后,比对性能会有降低,建议比对较大dump.json文件时不配置此参数。 在比对结果中的Err_message列呈现比对结果颜色标记的原因,具体含义如下: 红色标记情况: diff --git a/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md b/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md index 55a148058a..dcd53eab47 100644 --- a/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md @@ -35,14 +35,15 @@ msprobe -f mindspore compare -i ./compare.json -o ./output -s **完整参数说明** -| 参数名 | 说明 | 是否必选 | -| -------------------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | -| -f 或 --framework | 指定训练框架。mindspore。 | 是 | +| 参数名 | 说明 | 是否必选 | +|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | +| -f 或 --framework | 指定训练框架。mindspore。 | 是 | | -i或--input_path | 指定比对文件。比对文件内容及示例请参见[比对文件](#41-比对文件)或[比对文件(kernel)](#42-比对文件kernel)(比对文件(kernel)仅[不同版本下的全量kernel比对](#23-不同版本下的全量kernel比对)场景支持)。 | 是 | | -o或--output_path | 配置比对结果文件存盘目录,默认会在当前目录创建output目录。文件名称基于时间戳自动生成,格式为:
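A minimal sketch of the documented Python entry point with the new switch (illustrative only; paths are placeholders taken from the configuration examples in the table above):

```python
from msprobe.pytorch.compare.pt_compare import compare

# Placeholder paths; see the input_param description above for each key.
input_param = {
    "npu_json_path": "./npu_dump/dump.json",
    "bench_json_path": "./bench_dump/dump.json",
    "stack_json_path": "./npu_dump/stack.json",
    "is_print_compare_log": True,
}

# highlight=True marks suspicious APIs/modules in red or yellow in the
# resulting compare_result_{timestamp}.xlsx workbook.
compare(input_param, output_path="./output", highlight=True)
```
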
diff --git a/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md b/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md
index 55a148058a..dcd53eab47 100644
--- a/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md
+++ b/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md
@@ -35,14 +35,15 @@ msprobe -f mindspore compare -i ./compare.json -o ./output -s
 
 **完整参数说明**
 
-| 参数名 | 说明 | 是否必选 |
-| -------------------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
-| -f 或 --framework | 指定训练框架。mindspore。 | 是 |
+| 参数名 | 说明 | 是否必选 |
+|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
+| -f 或 --framework | 指定训练框架。mindspore。 | 是 |
 | -i或--input_path | 指定比对文件。比对文件内容及示例请参见[比对文件](#41-比对文件)或[比对文件(kernel)](#42-比对文件kernel)(比对文件(kernel)仅[不同版本下的全量kernel比对](#23-不同版本下的全量kernel比对)场景支持)。 | 是 |
 | -o或--output_path | 配置比对结果文件存盘目录,默认会在当前目录创建output目录。文件名称基于时间戳自动生成,格式为:<br>`compare_result_{timestamp}.xlsx`<br>`compare_result_{rank_id}_{step_id}_{timestamp}.xlsx`(仅[不同版本下的全量kernel比对](#23-不同版本下的全量kernel比对)场景支持)。<br>提示:output目录下与结果件同名文件将被删除覆盖。 | 否 |
 | -s或--stack_mode | 比对结果展示调用栈信息(NPU_Stack_Info)的开关,bool 类型。单卡场景开启时,需要使用[比对文件](#41-比对文件)的单卡场景配置stack_path指定stack.json文件,才能生成详细调用栈信息,否则在比对时会报错;暂不支持多卡场景。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 |
 | -c或--compare_only | 仅比对开关,bool 类型。该参数默认未配置,会启用自动精度分析,工具自动针对比对结果进行分析,识别到第一个精度可能不达标节点(在比对结果文件中的 Accuracy Reached or Not 列显示为 No),并给出问题可能产生的原因(打屏展示并生成 `advisor_{timestamp}.txt` 文件)。通过配置该参数取消自动精度分析,仅输出比对结果表格。 | 否 |
 | -f或--fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。通过直接配置该参数开启,默认未配置,表示关闭。 | 否 |
+| -hl或--highlight | 高亮颜色标记。开启后,比对结果件中通过红色或黄色标记精度可疑API或模块。通过直接配置该参数开启,默认未配置,表示关闭。 开启高亮颜色标记后,比对性能降低,如果比对结果行数超出excel单页限制,程序强制关闭高亮颜色标记。 | 否 |
 | -am或--api_mapping | 跨框架比对。配置该参数时表示开启跨框架API比对功能,可以指定自定义映射文件*.yaml,不指定映射文件时按照msprobe定义的默认映射关系进行比对。自定义映射文件的格式请参见[自定义映射文件(api_mapping)](#43-自定义映射文件api_mapping)。仅[跨框架的API比对](#25-跨框架的api比对)场景需要配置。 | 否 |
 | -cm或--cell_mapping | 跨框架比对。配置该参数时表示开启跨框架cell模块比对功能,可以指定自定义映射文件*.yaml,不指定映射文件时按照msprobe定义的默认映射关系进行比对。自定义映射文件的格式请参见[自定义映射文件(cell_mapping)](#44-自定义映射文件cell_mapping)。仅[跨框架的cell模块比对](#26-跨框架的cell模块比对)场景需要配置。 | 否 |
 | -dm或--data_mapping | 同框架或跨框架比对。通过映射文件指定两个具体参数的对应关系,可以在L0、L1或mix采集场景下使用。配置该参数的同时需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(data_mapping)](#45-自定义映射文件data_mapping)。 | 否 |
diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py
index 42d973a0e8..ae3dfa63d7 100644
--- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py
+++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py
@@ -35,8 +35,16 @@ def ms_compare(input_param, output_path, **kwargs):
         config.data_mapping = generate_data_mapping_by_layer_mapping(input_param, config.layer_mapping, output_path)
 
     is_cross_framework = check_cross_framework(input_param.get('bench_json_path'))
-    mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match,
-                             config.dump_mode, config.compared_file_type)
+
+    config_dict = {
+        'stack_mode': config.stack_mode,
+        'auto_analyze': config.auto_analyze,
+        'fuzzy_match': config.fuzzy_match,
+        'highlight': config.highlight,
+        'dump_mode': config.dump_mode,
+        'compared_file_type': config.compared_file_type
+    }
+    mode_config = ModeConfig(**config_dict)
     mapping_config = MappingConfig(config.cell_mapping, config.api_mapping, config.data_mapping)
     ms_comparator = Comparator(read_real_data, mode_config, mapping_config, is_cross_framework)
     ms_comparator.compare_core(input_param, output_path, suffix=config.suffix)
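The point of the kwargs refactor visible in `ms_compare.py` above (and in `pt_compare.py` below) is that `ModeConfig` can now be built from a dict and silently falls back to its defaults for anything unspecified, so adding a field such as `highlight` no longer has to touch every positional call site. A short illustrative sketch:

```python
from msprobe.core.common.const import Const
from msprobe.core.compare.config import ModeConfig

# Only the keys we care about; everything else takes the defaults
# defined in config.py.
mode_config = ModeConfig(**{'dump_mode': Const.ALL, 'highlight': True})

print(mode_config.highlight)    # True
print(mode_config.fuzzy_match)  # False (default retained)
```
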
diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py
index b73c52dce3..dd948f7d61 100644
--- a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py
+++ b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py
@@ -31,8 +31,15 @@ def compare(input_param, output_path, **kwargs):
         raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR)
 
     config = setup_comparison(input_param, output_path, **kwargs)
-    mode_config = ModeConfig(config.stack_mode, config.auto_analyze, config.fuzzy_match,
-                             config.dump_mode, config.compared_file_type)
+    config_dict = {
+        'stack_mode': config.stack_mode,
+        'auto_analyze': config.auto_analyze,
+        'fuzzy_match': config.fuzzy_match,
+        'highlight': config.highlight,
+        'dump_mode': config.dump_mode,
+        'compared_file_type': config.compared_file_type
+    }
+    mode_config = ModeConfig(**config_dict)
     mapping_config = MappingConfig(data_mapping=config.data_mapping)
     pt_comparator = Comparator(read_real_data, mode_config, mapping_config)
     pt_comparator.compare_core(input_param, output_path, suffix=config.suffix)
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py
index 639be03606..1f7c515a59 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py
@@ -354,22 +354,25 @@ class TestUtilsMethods(unittest.TestCase):
             'state': ['input']
         }
 
-        stack_mode = True
-        auto_analyze = True
-        fuzzy_match = False
-        dump_mode = Const.SUMMARY
-        mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
+        config_dict = {
+            'stack_mode': True,
+            'auto_analyze': True,
+            'fuzzy_match': False,
+            'dump_mode': Const.SUMMARY,
+        }
+        mode_config = ModeConfig(**config_dict)
 
         result = ParseData(mode_config).gen_merge_list(json_data, op_name, stack_json_data)
         self.assertEqual(result, merge_list)
 
     def test_check_op_item_fuzzy(self):
-        stack_mode = False
-        auto_analyze = True
-        dump_mode = Const.SUMMARY
-
-        fuzzy_match = True
-        mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
+        config_dict = {
+            'stack_mode': False,
+            'auto_analyze': True,
+            'fuzzy_match': True,
+            'dump_mode': Const.SUMMARY,
+        }
+        mode_config = ModeConfig(**config_dict)
         mapping_config = MappingConfig()
         match = Match(mode_config, mapping_config, cross_frame=False)
 
@@ -382,11 +385,13 @@ class TestUtilsMethods(unittest.TestCase):
         file_list = [os.path.join(base_dir, 'dump.json'), os.path.join(base_dir, 'dump.json'),
                      os.path.join(base_dir, 'stack.json')]
 
-        stack_mode = True
-        auto_analyze = True
-        fuzzy_match = False
-        dump_mode = Const.SUMMARY
-        mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
+        config_dict = {
+            'stack_mode': True,
+            'auto_analyze': True,
+            'fuzzy_match': False,
+            'dump_mode': Const.SUMMARY,
+        }
+        mode_config = ModeConfig(**config_dict)
         mapping_config = MappingConfig()
 
         from msprobe.pytorch.compare.pt_compare import read_real_data
@@ -763,11 +768,13 @@ class TestMatch(unittest.TestCase):
         self.assertTrue(match_result.equals(expected))
 
     def test_match_op_both_last_element(self):
-        stack_mode = False
-        auto_analyze = True
-        fuzzy_match = False
-        dump_mode = Const.SUMMARY
-        mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
+        config_dict = {
+            'stack_mode': False,
+            'auto_analyze': True,
+            'fuzzy_match': False,
+            'dump_mode': Const.SUMMARY,
+        }
+        mode_config = ModeConfig(**config_dict)
         mapping_config = MappingConfig()
         match = Match(mode_config, mapping_config, cross_frame=False)
 
@@ -776,11 +783,13 @@ class TestMatch(unittest.TestCase):
         self.assertEqual(b, 0)
 
     def test_match_op_only_npu_last_element(self):
-        stack_mode = False
-        auto_analyze = True
-        fuzzy_match = False
-        dump_mode = Const.SUMMARY
-        mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
+        config_dict = {
+            'stack_mode': False,
+            'auto_analyze': True,
+            'fuzzy_match': False,
+            'dump_mode': Const.SUMMARY,
+        }
+        mode_config = ModeConfig(**config_dict)
         mapping_config = MappingConfig()
         match = Match(mode_config, mapping_config, cross_frame=False)
 
@@ -789,11 +798,13 @@ class TestMatch(unittest.TestCase):
         self.assertEqual(b, 0)
 
     def test_match_op_only_bench_last_element(self):
-        stack_mode = False
-        auto_analyze = True
-        fuzzy_match = False
-        dump_mode = Const.SUMMARY
-        mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
+        config_dict = {
+            'stack_mode': False,
+            'auto_analyze': True,
+            'fuzzy_match': False,
+            'dump_mode': Const.SUMMARY,
+        }
+        mode_config = ModeConfig(**config_dict)
         mapping_config = MappingConfig()
         match = Match(mode_config, mapping_config, cross_frame=False)
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py
index 4d264dd74c..f15dbcf6e7 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py
@@ -31,15 +31,15 @@ op_name = "Tensor.add_0.0.forward"
 op_result = [
     {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'md5': '00000000',
      'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, 'data_name': '-1',
-     'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.forward.input.0', 'state': 'input'},
+     'Norm': 2.2533628940582275, 'requires_grad': 'True', 'full_op_name': 'Tensor.add_0.0.forward.input.0', 'state': 'input'},
     {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'md5': '00000000',
      'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481, 'data_name': '-1',
-     'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_0.0.forward.input.1', 'state': 'input'},
+     'Norm': 0.02844562754034996, 'requires_grad': 'False', 'full_op_name': 'Tensor.add_0.0.forward.input.1', 'state': 'input'},
     {'full_op_name': 'Tensor.add_0.0.forward.input.alpha', 'dtype': "<class 'float'>", 'shape': '[]', 'md5': '0dae4479',
      'Max': -0.1, 'Min': -0.1, 'Mean': -0.1, 'Norm': -0.1, 'data_name': '-1', 'type': 'float', 'value': -0.1, 'state': 'input'},
     {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'md5': '00000000',
      'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063, 'data_name': '-1',
-     'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.forward.output.0', 'state': 'output'}]
+     'Norm': 2.2533628940582275, 'requires_grad': 'True', 'full_op_name': 'Tensor.add_0.0.forward.output.0', 'state': 'output'}]
 
 # test_read_op_1
 op_data_b = {
@@ -56,13 +56,13 @@ op_name_b = "Tensor.add_0.0.backward"
 op_result_b = [
     {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'data_name': '-1',
      'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063,
-     'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.backward.input.0', 'state': 'input'},
+     'Norm': 2.2533628940582275, 'requires_grad': 'True', 'full_op_name': 'Tensor.add_0.0.backward.input.0', 'state': 'input'},
     {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'data_name': '-1',
      'md5': '00000000', 'Max': 0.003992878366261721, 'Min': -0.008102823048830032, 'Mean': -0.0002002553956117481,
-     'Norm': 0.02844562754034996, 'requires_grad': False, 'full_op_name': 'Tensor.add_0.0.backward.input.1', 'state': 'input'},
+     'Norm': 0.02844562754034996, 'requires_grad': 'False', 'full_op_name': 'Tensor.add_0.0.backward.input.1', 'state': 'input'},
     {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], 'data_name': '-1',
      'md5': '00000000', 'Max': 0.33033010363578796, 'Min': -0.331031858921051, 'Mean': -0.030964046716690063,
-     'Norm': 2.2533628940582275, 'requires_grad': True, 'full_op_name': 'Tensor.add_0.0.backward.output.0', 'state': 'output'}]
+     'Norm': 2.2533628940582275, 'requires_grad': 'True', 'full_op_name': 'Tensor.add_0.0.backward.output.0', 'state': 'output'}]
 
 # test_op_item_parse
 parse_item = [
@@ -76,7 +76,7 @@ parse_index = None
 parse_item_list = None
 parse_top_bool = True
 o_result_parse = [
-    {'Max': 4097.0, 'Mean': 820.2, 'Min': 0.0, 'Norm': 4097.0, 'dtype': 'torch.int64', 'requires_grad': False,
+    {'Max': 4097.0, 'Mean': 820.2, 'Min': 0.0, 'Norm': 4097.0, 'dtype': 'torch.int64', 'requires_grad': 'False',
      'shape': [5], 'type': 'torch.Tensor', 'full_op_name': 'Distributed.broadcast.0.forward.input.0',
      'data_name': '-1', 'md5': '00000000', 'state': 'input'},
    {'full_op_name': 'Distributed.broadcast.0.forward.input.1', 'dtype': "<class 'int'>", 'shape': '[]',
diff --git a/debug/accuracy_tools/msprobe/visualization/builder/msprobe_adapter.py b/debug/accuracy_tools/msprobe/visualization/builder/msprobe_adapter.py
index 5e59ac9b03..7e99de16b9 100644
--- a/debug/accuracy_tools/msprobe/visualization/builder/msprobe_adapter.py
+++ b/debug/accuracy_tools/msprobe/visualization/builder/msprobe_adapter.py
@@ -54,7 +54,13 @@ def run_real_data(dump_path_param, csv_path, framework, is_cross_frame=False):
         framework: 框架类型, pytorch或mindspore
         is_cross_frame: 是否进行跨框架比对,仅支持mindspore比pytorch, 其中pytorch为标杆
     """
-    mode_config = ModeConfig(stack_mode=False, auto_analyze=True, fuzzy_match=False, dump_mode=Const.ALL)
+    config_dict = {
+        'stack_mode': False,
+        'auto_analyze': True,
+        'fuzzy_match': False,
+        'dump_mode': Const.ALL
+    }
+    mode_config = ModeConfig(**config_dict)
 
     if framework == Const.PT_FRAMEWORK:
         from msprobe.pytorch.compare.pt_compare import read_real_data
-- 
Gitee
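The quoted 'True'/'False' values in the test fixtures above follow from the dict comprehension this patch added to `gen_op_item`: booleans such as `requires_grad` are normalized to strings before the item is stored. A pure-Python sketch of just that transformation:

```python
# Sketch of the normalization added to gen_op_item in this patch: booleans are
# stringified, everything else passes through untouched.
op_data = {'requires_grad': True, 'Max': 0.33033010363578796}

op_item = {key: str(value) if isinstance(value, bool) else value
           for key, value in op_data.items()}

print(op_item)  # {'requires_grad': 'True', 'Max': 0.33033010363578796}
```
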
From 3c4e79d966effa0774f7763f01aa97af20b1169a Mon Sep 17 00:00:00 2001
From: i-robot
Date: Fri, 18 Jul 2025 01:42:48 +0000
Subject: [PATCH 3/4] compare logger poc sync

---
 .../core/compare/multiprocessing_compute.py   | 35 ++++++++++++-------
 .../docs/10.accuracy_compare_PyTorch.md       | 21 ++++++-----
 .../test_cmp_multiprocessing_compute.py       |  8 ++---
 3 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py
index cb0e13e383..19c66e83e9 100644
--- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py
+++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py
@@ -157,20 +157,23 @@ class CompareRealData:
         用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、欧式距离
         最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息
         """
-        error_file, relative_err, error_flag = None, None, False
+        relative_err, error_flag, err_msg = None, False, None
 
         data_name_pair = op_name_mapping_dict.get(npu_op_name)
         npu_data_name = data_name_pair[0]
        bench_data_name = data_name_pair[1]
+        error_file = data_name_pair
 
         if str(npu_data_name) == CompareConst.NO_REAL_DATA_FLAG:  # 没有npu真实数据
-            n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True
+            n_value, b_value, error_flag = CompareConst.NO_REAL_DATA, CompareConst.NO_REAL_DATA, True
+            err_msg = "NPU does not have data file."
         elif str(bench_data_name) == CompareConst.NO_REAL_DATA_FLAG:  # 没有bench真实数据
-            n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True
-            error_file = 'no_bench_data'
+            n_value, b_value, error_flag = CompareConst.NO_REAL_DATA, CompareConst.NO_REAL_DATA, True
+            err_msg = "Bench does not have data file."
         elif str(bench_data_name) == CompareConst.N_A:  # bench没匹配
-            n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True
-            error_file = None
+            n_value, b_value, error_flag = CompareConst.API_UNMATCH, CompareConst.API_UNMATCH, True
+            err_msg = "Bench api/module unmatched."
         else:
             npu_dir = input_param.get(CompareConst.NPU_DUMP_DATA_DIR)
             bench_dir = input_param.get(CompareConst.BENCH_DUMP_DATA_DIR)
@@ -187,8 +190,9 @@ class CompareRealData:
                 error_flag = True
 
         # 通过n_value, b_value同时得到错误标志和错误信息
-        n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value,
-                                                                       error_flag=error_flag, error_file=error_file)
+        if not err_msg:
+            n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value, error_flag=error_flag,
+                                                                           error_file=error_file)
 
         result_list, err_msg = compare_ops_apply(n_value, b_value, error_flag, err_msg)
 
@@ -218,11 +222,16 @@ class CompareRealData:
             = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param)
 
         if is_print_compare_log:
-            logger.info(
-                "[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, \
-                one_thousand_err_ratio {}, "
-                "five_thousand_err_ratio {}".format(npu_op_name, cos_sim, max_abs_err, max_relative_err,
-                                                    err_msg, one_thousand_err_ratio, five_thousand_err_ratio))
+            if "does not have data file" in err_msg:
+                logger.info(f"[{npu_op_name}] Compare result: {err_msg} ")
+            elif "Bench api/module unmatched" in err_msg:
+                logger.info(f"[{npu_op_name}] Compare result: {err_msg} ")
+            else:
+                logger.info(
+                    "[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, \
+                    one_thousand_err_ratio {}, "
+                    "five_thousand_err_ratio {}".format(npu_op_name, cos_sim, max_abs_err, max_relative_err,
+                                                        err_msg, one_thousand_err_ratio, five_thousand_err_ratio))
         cos_result.append(cos_sim)
         euc_dist_result.append(euc_dist)
         max_err_result.append(max_abs_err)
diff --git a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md
index 18b78aa212..b4a0eb6ae4 100644
--- a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md
@@ -271,15 +271,18 @@ MD5 模式:
 ### 3.5 错误信息提示(Err_message)——真实数据模式、统计数据模式
 
 1. "Need double check api accuracy.":四个统计值中至少 1 个相对误差 > 0.5(统计数据模式);
-2. "Fuzzy matching data, the comparison arruracy may be affected.":NPU 或 Bench 的真实数据名没有匹配上(真实数据模式);
-3. "Dump file: {} not found or read failed.":NPU 或 Bench 的真实数据不存在或者读取出错(真实数据模式);
-4. "No bench data matched.":Bench 的 API 没有匹配上、Bench 真实数据不存在或读取出错(真实数据模式);
-5. "This is empty data, can not compare.":读取到的数据为空(真实数据模式);
-6. "Shape of NPU and bench Tensor do not match. Skipped.":NPU 和 Bench 的数据结构不一致(真实数据模式);
-7. "The Position of inf or nan in NPU and bench Tensor do not match.":NPU 和 Bench 的数据有 nan/inf(真实数据模式);
-8. "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式);
-9. "Dtype of NPU and bench Tensor do not match.":NPU 和 Bench 数据的数据类型不同(真实数据模式);
-10. "":除以上情况的其余情况(真实数据模式、统计数据模式)。
+2. "Fuzzy matching data, the comparison accuracy may be affected.":NPU 或 Bench 的真实数据名没有匹配上(真实数据模式);
+3. "Dump file: {} not found or read failed.":NPU 或 Bench 的真实数据读取出错(真实数据模式);
+4. "No bench data matched.":Bench 的 API 没有匹配上(真实数据模式、统计数据模式);
+5. "NPU does not have data file.": NPU的真实数据不存在(真实数据模式);
+6. "Bench does not have data file.": Bench的真实数据不存在(真实数据模式);
+7. "Bench api/module unmatched.":Bench 的 API 没有匹配上(真实数据模式);
+8. "This is empty data, can not compare.":读取到的数据为空(真实数据模式);
+9. "Shape of NPU and bench Tensor do not match. Skipped.":NPU 和 Bench 的数据结构不一致(真实数据模式);
+10. "The Position of inf or nan in NPU and bench Tensor do not match.":NPU 和 Bench 的数据有 nan/inf(真实数据模式);
+11. "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式);
+12. "Dtype of NPU and bench Tensor do not match.":NPU 和 Bench 数据的数据类型不同(真实数据模式);
+13. "":除以上情况的其余情况(真实数据模式、统计数据模式)。
 
 除以上错误信息提示外,异常数据颜色高亮标记的原因叠加呈现于此列。
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
index c30c0c16de..afcdd25744 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
@@ -23,7 +23,7 @@ o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.i
           'torch.float32', 'torch.float32', [2, 2], [2, 2],
           'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
           1, 1, 1, 1, 1, 1, 1, 1,
-          'None', 'Dump file: None not found or read failed.', ['-1', '-1']]]
+          'None', 'NPU does not have data file.', ['-1', '-1']]]
 columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name']
 result_df = pd.DataFrame(data, columns=columns)
 o_result = pd.DataFrame(o_data, columns=columns)
@@ -160,7 +160,7 @@ class TestCompareRealData(unittest.TestCase):
         input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir}
         result = compare_real_data.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
         self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
-                                  'unsupported', 'Dump file: None not found or read failed.'])
+                                  'unsupported', 'NPU does not have data file.'])
 
         pt_name = 'Functional.linear.0.forward.input.0.pt'
         op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]}
@@ -186,7 +186,7 @@ class TestCompareRealData(unittest.TestCase):
         result = compare_real_data.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
 
         self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
-                                  'unsupported', 'Dump file: None not found or read failed.'])
+                                  'unsupported', 'NPU does not have data file.'])
 
     def test_compare_ops(self):
         generate_dump_json(base_dir3)
@@ -221,7 +221,7 @@ class TestCompareRealData(unittest.TestCase):
         o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0',
                    'torch.float32', 'torch.float32', [2, 2], [2, 2],
                    'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
-                   1, 1, 1, 1, 1, 1, 1, 1, 'None', 'Dump file: None not found or read failed.', ['-1', '-1']]]
+                   1, 1, 1, 1, 1, 1, 1, 1, 'None', 'NPU does not have data file.', ['-1', '-1']]]
         columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name']
         result_df = pd.DataFrame(data, columns=columns)
         o_result = pd.DataFrame(o_data, columns=columns)
-- 
Gitee
From 630be8783af37418faf150aa8013a920e2ccc4e4 Mon Sep 17 00:00:00 2001
From: Linwei-Ying
Date: Mon, 4 Aug 2025 09:22:56 +0800
Subject: [PATCH 4/4] Revert "compare logger poc sync"

This reverts commit 3c4e79d966effa0774f7763f01aa97af20b1169a.
---
 .../core/compare/multiprocessing_compute.py   | 35 +++++++------------
 .../docs/10.accuracy_compare_PyTorch.md       | 21 +++++------
 .../test_cmp_multiprocessing_compute.py       |  8 ++---
 3 files changed, 26 insertions(+), 38 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py
index 19c66e83e9..cb0e13e383 100644
--- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py
+++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py
@@ -157,23 +157,20 @@ class CompareRealData:
         用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、欧式距离
         最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息
         """
-        relative_err, error_flag, err_msg = None, False, None
+        error_file, relative_err, error_flag = None, None, False
 
         data_name_pair = op_name_mapping_dict.get(npu_op_name)
         npu_data_name = data_name_pair[0]
         bench_data_name = data_name_pair[1]
-        error_file = data_name_pair
 
         if str(npu_data_name) == CompareConst.NO_REAL_DATA_FLAG:  # 没有npu真实数据
-            n_value, b_value, error_flag = CompareConst.NO_REAL_DATA, CompareConst.NO_REAL_DATA, True
-            err_msg = "NPU does not have data file."
+            n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True
         elif str(bench_data_name) == CompareConst.NO_REAL_DATA_FLAG:  # 没有bench真实数据
-            n_value, b_value, error_flag = CompareConst.NO_REAL_DATA, CompareConst.NO_REAL_DATA, True
-            err_msg = "Bench does not have data file."
+            n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True
+            error_file = 'no_bench_data'
         elif str(bench_data_name) == CompareConst.N_A:  # bench没匹配
-            n_value, b_value, error_flag = CompareConst.API_UNMATCH, CompareConst.API_UNMATCH, True
-            err_msg = "Bench api/module unmatched."
+            n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True
+            error_file = None
         else:
             npu_dir = input_param.get(CompareConst.NPU_DUMP_DATA_DIR)
             bench_dir = input_param.get(CompareConst.BENCH_DUMP_DATA_DIR)
@@ -190,9 +187,8 @@ class CompareRealData:
                 error_flag = True
 
         # 通过n_value, b_value同时得到错误标志和错误信息
-        if not err_msg:
-            n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value, error_flag=error_flag,
-                                                                           error_file=error_file)
+        n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value,
+                                                                       error_flag=error_flag, error_file=error_file)
 
         result_list, err_msg = compare_ops_apply(n_value, b_value, error_flag, err_msg)
 
@@ -222,16 +218,11 @@ class CompareRealData:
             = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param)
 
         if is_print_compare_log:
-            if "does not have data file" in err_msg:
-                logger.info(f"[{npu_op_name}] Compare result: {err_msg} ")
-            elif "Bench api/module unmatched" in err_msg:
-                logger.info(f"[{npu_op_name}] Compare result: {err_msg} ")
-            else:
-                logger.info(
-                    "[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, \
-                    one_thousand_err_ratio {}, "
-                    "five_thousand_err_ratio {}".format(npu_op_name, cos_sim, max_abs_err, max_relative_err,
-                                                        err_msg, one_thousand_err_ratio, five_thousand_err_ratio))
+            logger.info(
+                "[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, \
+                one_thousand_err_ratio {}, "
+                "five_thousand_err_ratio {}".format(npu_op_name, cos_sim, max_abs_err, max_relative_err,
+                                                    err_msg, one_thousand_err_ratio, five_thousand_err_ratio))
         cos_result.append(cos_sim)
         euc_dist_result.append(euc_dist)
         max_err_result.append(max_abs_err)
diff --git a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md
index b4a0eb6ae4..18b78aa212 100644
--- a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md
@@ -271,18 +271,15 @@ MD5 模式:
 ### 3.5 错误信息提示(Err_message)——真实数据模式、统计数据模式
 
 1. "Need double check api accuracy.":四个统计值中至少 1 个相对误差 > 0.5(统计数据模式);
-2. "Fuzzy matching data, the comparison accuracy may be affected.":NPU 或 Bench 的真实数据名没有匹配上(真实数据模式);
-3. "Dump file: {} not found or read failed.":NPU 或 Bench 的真实数据读取出错(真实数据模式);
-4. "No bench data matched.":Bench 的 API 没有匹配上(真实数据模式、统计数据模式);
-5. "NPU does not have data file.": NPU的真实数据不存在(真实数据模式);
-6. "Bench does not have data file.": Bench的真实数据不存在(真实数据模式);
-7. "Bench api/module unmatched.":Bench 的 API 没有匹配上(真实数据模式);
-8. "This is empty data, can not compare.":读取到的数据为空(真实数据模式);
-9. "Shape of NPU and bench Tensor do not match. Skipped.":NPU 和 Bench 的数据结构不一致(真实数据模式);
-10. "The Position of inf or nan in NPU and bench Tensor do not match.":NPU 和 Bench 的数据有 nan/inf(真实数据模式);
-11. "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式);
-12. "Dtype of NPU and bench Tensor do not match.":NPU 和 Bench 数据的数据类型不同(真实数据模式);
-13. "":除以上情况的其余情况(真实数据模式、统计数据模式)。
+2. "Fuzzy matching data, the comparison arruracy may be affected.":NPU 或 Bench 的真实数据名没有匹配上(真实数据模式);
+3. "Dump file: {} not found or read failed.":NPU 或 Bench 的真实数据不存在或者读取出错(真实数据模式);
+4. "No bench data matched.":Bench 的 API 没有匹配上、Bench 真实数据不存在或读取出错(真实数据模式);
+5. "This is empty data, can not compare.":读取到的数据为空(真实数据模式);
+6. "Shape of NPU and bench Tensor do not match. Skipped.":NPU 和 Bench 的数据结构不一致(真实数据模式);
+7. "The Position of inf or nan in NPU and bench Tensor do not match.":NPU 和 Bench 的数据有 nan/inf(真实数据模式);
+8. "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式);
+9. "Dtype of NPU and bench Tensor do not match.":NPU 和 Bench 数据的数据类型不同(真实数据模式);
+10. "":除以上情况的其余情况(真实数据模式、统计数据模式)。
 
 除以上错误信息提示外,异常数据颜色高亮标记的原因叠加呈现于此列。
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
index afcdd25744..c30c0c16de 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
@@ -23,7 +23,7 @@ o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.i
           'torch.float32', 'torch.float32', [2, 2], [2, 2],
           'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
           1, 1, 1, 1, 1, 1, 1, 1,
-          'None', 'NPU does not have data file.', ['-1', '-1']]]
+          'None', 'Dump file: None not found or read failed.', ['-1', '-1']]]
 columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name']
 result_df = pd.DataFrame(data, columns=columns)
 o_result = pd.DataFrame(o_data, columns=columns)
@@ -160,7 +160,7 @@ class TestCompareRealData(unittest.TestCase):
         input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir}
         result = compare_real_data.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
         self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
-                                  'unsupported', 'NPU does not have data file.'])
+                                  'unsupported', 'Dump file: None not found or read failed.'])
 
         pt_name = 'Functional.linear.0.forward.input.0.pt'
         op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]}
@@ -186,7 +186,7 @@ class TestCompareRealData(unittest.TestCase):
         result = compare_real_data.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
 
         self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
-                                  'unsupported', 'NPU does not have data file.'])
+                                  'unsupported', 'Dump file: None not found or read failed.'])
 
     def test_compare_ops(self):
         generate_dump_json(base_dir3)
@@ -221,7 +221,7 @@ class TestCompareRealData(unittest.TestCase):
         o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0',
                    'torch.float32', 'torch.float32', [2, 2], [2, 2],
                    'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
-                   1, 1, 1, 1, 1, 1, 1, 1, 'None', 'NPU does not have data file.', ['-1', '-1']]]
+                   1, 1, 1, 1, 1, 1, 1, 1, 'None', 'Dump file: None not found or read failed.', ['-1', '-1']]]
         columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name']
         result_df = pd.DataFrame(data, columns=columns)
         o_result = pd.DataFrame(o_data, columns=columns)
-- 
Gitee