From 026219966f282f881b105854273a70a1c85080ed Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 17 Jul 2025 14:59:10 +0800 Subject: [PATCH 1/3] compare no real data err_msg and log improve compare no real data err_msg and log improve --- .../msprobe/core/common/const.py | 3 ++ .../core/compare/multiprocessing_compute.py | 33 +++++++++++-------- .../msprobe/core/compare/npy_compare.py | 10 +++--- .../test_cmp_multiprocessing_compute.py | 10 +++--- 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index a86b87ce0d..49df80b33c 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -557,6 +557,8 @@ class CompareConst: ULP_FLOAT16_THRESHOLD = 1 # compare result data + NO_REAL_DATA = 'No real data' + API_UNMATCH = 'api unmatched' READ_NONE = 'No data' NONE = 'None' SHAPE_UNMATCH = 'shape unmatched' @@ -589,6 +591,7 @@ class CompareConst: # error message NO_BENCH = "No bench data matched." 
+ # compare const FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble] diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py index cb0e13e383..eb3a6617ad 100644 --- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py @@ -157,20 +157,21 @@ class CompareRealData: 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、欧式距离 最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息 """ - error_file, relative_err, error_flag = None, None, False + error_file, relative_err, error_flag, err_msg = None, None, False, None data_name_pair = op_name_mapping_dict.get(npu_op_name) npu_data_name = data_name_pair[0] bench_data_name = data_name_pair[1] if str(npu_data_name) == CompareConst.NO_REAL_DATA_FLAG: # 没有npu真实数据 - n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True + n_value, b_value, error_flag = CompareConst.NO_REAL_DATA, CompareConst.NO_REAL_DATA, True + err_msg = "NPU does not have data file." elif str(bench_data_name) == CompareConst.NO_REAL_DATA_FLAG: # 没有bench真实数据 - n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True - error_file = 'no_bench_data' + n_value, b_value, error_flag = CompareConst.NO_REAL_DATA, CompareConst.NO_REAL_DATA, True + err_msg = "Bench does not have data file." elif str(bench_data_name) == CompareConst.N_A: # bench没匹配 - n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True - error_file = None + n_value, b_value, error_flag = CompareConst.API_UNMATCH, CompareConst.API_UNMATCH, True + err_msg = "Bench api/module unmatched." 
else: npu_dir = input_param.get(CompareConst.NPU_DUMP_DATA_DIR) bench_dir = input_param.get(CompareConst.BENCH_DUMP_DATA_DIR) @@ -187,8 +188,9 @@ class CompareRealData: error_flag = True # 通过n_value, b_value同时得到错误标志和错误信息 - n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value, - error_flag=error_flag, error_file=error_file) + if not err_msg: + n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value, error_flag=error_flag, + error_file=error_file) result_list, err_msg = compare_ops_apply(n_value, b_value, error_flag, err_msg) @@ -218,11 +220,16 @@ class CompareRealData: = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param) if is_print_compare_log: - logger.info( - "[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, \ - one_thousand_err_ratio {}, " - "five_thousand_err_ratio {}".format(npu_op_name, cos_sim, max_abs_err, max_relative_err, - err_msg, one_thousand_err_ratio, five_thousand_err_ratio)) + if "does not have data file" in err_msg: + logger.info(f"[{npu_op_name}] Compare result: {err_msg} ") + elif "Bench api/module unmatched" in err_msg: + logger.info(f"[{npu_op_name}] Compare result: {err_msg} ") + else: + logger.info( + "[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, \ + one_thousand_err_ratio {}, " + "five_thousand_err_ratio {}".format(npu_op_name, cos_sim, max_abs_err, max_relative_err, + err_msg, one_thousand_err_ratio, five_thousand_err_ratio)) cos_result.append(cos_sim) euc_dist_result.append(euc_dist) max_err_result.append(max_abs_err) diff --git a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py index b6b27b1772..b62a2338be 100644 --- a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py @@ -56,13 +56,10 @@ def get_error_flag_and_msg(n_value, b_value, error_flag=False, error_file=None): 
"""判断数据是否有异常并返回异常的n_value, b_value,同时返回error_flag和error_msg""" err_msg = "" if error_flag: - if error_file == "no_bench_data": - err_msg = "Bench does not have data file." - elif error_file: + if error_file: err_msg = f"Dump file: {error_file} not found or read failed." else: err_msg = CompareConst.NO_BENCH - error_flag = True return CompareConst.READ_NONE, CompareConst.READ_NONE, error_flag, err_msg if n_value.size == 0: # 判断读取到的数据是否为空 @@ -290,7 +287,8 @@ class CompareOps: def error_value_process(n_value): - if n_value in [CompareConst.READ_NONE, CompareConst.UNREADABLE, CompareConst.NONE]: + if n_value in [CompareConst.READ_NONE, CompareConst.UNREADABLE, CompareConst.NONE, + CompareConst.NO_REAL_DATA, CompareConst.API_UNMATCH]: return CompareConst.UNSUPPORTED, "" if n_value == CompareConst.SHAPE_UNMATCH: return CompareConst.SHAPE_UNMATCH, "" @@ -304,7 +302,7 @@ def compare_ops_apply(n_value, b_value, error_flag, err_msg): if error_flag: result, msg = error_value_process(n_value) result_list = [result] * len(CompareOps.compare_ops) - err_msg += msg * len(CompareOps.compare_ops) + err_msg += msg return result_list, err_msg relative_err = get_relative_err(n_value, b_value) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py index d95abf6d91..afcdd25744 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py @@ -23,7 +23,7 @@ o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.i 'torch.float32', 'torch.float32', [2, 2], [2, 2], 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 1, 1, 1, 1, 1, 1, 1, 1, - 'None', 'No bench data matched.', ['-1', '-1']]] + 'None', 'NPU does not have data file.', ['-1', '-1']]] columns = 
CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) o_result = pd.DataFrame(o_data, columns=columns) @@ -160,7 +160,7 @@ class TestCompareRealData(unittest.TestCase): input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir} result = compare_real_data.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 'unsupported', 'No bench data matched.']) + 'unsupported', 'NPU does not have data file.']) pt_name = 'Functional.linear.0.forward.input.0.pt' op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]} @@ -173,7 +173,7 @@ class TestCompareRealData(unittest.TestCase): result = compare_real_data.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, [1.0, 0.0, 0.0, 0.0, 1.0, 1.0, '']) - def test_compare_by_op_bench_na(self): + def test_compare_by_op_bench_no_npu_real_data(self): npu_op_name = 'Functional.linear.0.forward.input.0' bench_op_name = 'N/A' op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [-1, -1]} @@ -186,7 +186,7 @@ class TestCompareRealData(unittest.TestCase): result = compare_real_data.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 'unsupported', 'No bench data matched.']) + 'unsupported', 'NPU does not have data file.']) def test_compare_ops(self): generate_dump_json(base_dir3) @@ -221,7 +221,7 @@ class TestCompareRealData(unittest.TestCase): o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 1, 1, 1, 1, 1, 1, 1, 1, 'None', 'No bench data matched.', ['-1', '-1']]] + 1, 
1, 1, 1, 1, 1, 1, 1, 'None', 'NPU does not have data file.', ['-1', '-1']]] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) o_result = pd.DataFrame(o_data, columns=columns) -- Gitee From 289be1df3df5f5c6a181f9223f6eb9615280a08f Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 17 Jul 2025 16:10:03 +0800 Subject: [PATCH 2/3] compare no real data err_msg and log improve --- .../core/compare/multiprocessing_compute.py | 4 +++- .../msprobe/core/compare/npy_compare.py | 5 +---- .../docs/10.accuracy_compare_PyTorch.md | 21 +++++++++++-------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py index eb3a6617ad..19c66e83e9 100644 --- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py @@ -157,12 +157,14 @@ class CompareRealData: 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、欧式距离 最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息 """ - error_file, relative_err, error_flag, err_msg = None, None, False, None + relative_err, error_flag, err_msg = None, False, None data_name_pair = op_name_mapping_dict.get(npu_op_name) npu_data_name = data_name_pair[0] bench_data_name = data_name_pair[1] + error_file = data_name_pair + if str(npu_data_name) == CompareConst.NO_REAL_DATA_FLAG: # 没有npu真实数据 n_value, b_value, error_flag = CompareConst.NO_REAL_DATA, CompareConst.NO_REAL_DATA, True err_msg = "NPU does not have data file." 
diff --git a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py index b62a2338be..2b17c4a96a 100644 --- a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py @@ -56,10 +56,7 @@ def get_error_flag_and_msg(n_value, b_value, error_flag=False, error_file=None): """判断数据是否有异常并返回异常的n_value, b_value,同时返回error_flag和error_msg""" err_msg = "" if error_flag: - if error_file: - err_msg = f"Dump file: {error_file} not found or read failed." - else: - err_msg = CompareConst.NO_BENCH + err_msg = f"Dump file: {error_file} not found or read failed." return CompareConst.READ_NONE, CompareConst.READ_NONE, error_flag, err_msg if n_value.size == 0: # 判断读取到的数据是否为空 diff --git a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md index 6727e01fe0..7a2eca853c 100644 --- a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md @@ -268,15 +268,18 @@ MD5 模式: ### 3.5 错误信息提示(Err_message)——真实数据模式、统计数据模式 1. "Need double check api accuracy.":四个统计值中至少 1 个相对误差 > 0.5(统计数据模式); -2. "Fuzzy matching data, the comparison arruracy may be affected.":NPU 或 Bench 的真实数据名没有匹配上(真实数据模式); -3. "Dump file: {} not found or read failed.":NPU 或 Bench 的真实数据不存在或者读取出错(真实数据模式); -4. "No bench data matched.":Bench 的 API 没有匹配上、Bench 真实数据不存在或读取出错(真实数据模式); -5. "This is empty data, can not compare.":读取到的数据为空(真实数据模式); -6. "Shape of NPU and bench Tensor do not match. Skipped.":NPU 和 Bench 的数据结构不一致(真实数据模式); -7. "The Position of inf or nan in NPU and bench Tensor do not match.":NPU 和 Bench 的数据有 nan/inf(真实数据模式); -8. "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式); -9. "Dtype of NPU and bench Tensor do not match.":NPU 和 Bench 数据的数据类型不同(真实数据模式); -10. 
"":除以上情况的其余情况(真实数据模式、统计数据模式)。 +2. "Fuzzy matching data, the comparison accuracy may be affected.":NPU 或 Bench 的真实数据名没有匹配上(真实数据模式); +3. "Dump file: {} not found or read failed.":NPU 或 Bench 的真实数据读取出错(真实数据模式); +4. "No bench data matched.":Bench 的 API 没有匹配上(真实数据模式,统计数据模式); +5. "NPU does not have data file.": NPU的真实数据不存在(真实数据模式); +6. "Bench does not have data file.": Bench的真实数据不存在(真实数据模式); +7. "Bench api/module unmatched.":Bench 的 API 没有匹配上(真实数据模式); +8. "This is empty data, can not compare.":读取到的数据为空(真实数据模式); +9. "Shape of NPU and bench Tensor do not match. Skipped.":NPU 和 Bench 的数据结构不一致(真实数据模式); +10. "The Position of inf or nan in NPU and bench Tensor do not match.":NPU 和 Bench 的数据有 nan/inf(真实数据模式); +11. "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式); +12. "Dtype of NPU and bench Tensor do not match.":NPU 和 Bench 数据的数据类型不同(真实数据模式); +13. "":除以上情况的其余情况(真实数据模式、统计数据模式)。 除以上错误信息提示外,异常数据颜色高亮标记的原因叠加呈现于此列。 -- Gitee From 7bf7db92b3a87ee2d702036ee312698706eb5bf0 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 17 Jul 2025 16:19:51 +0800 Subject: [PATCH 3/3] compare no real data err_msg and log improve --- .../test/core_ut/compare/test_acc_compare_npy_compare.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py index a30d693f7b..417ea2b9f0 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py @@ -85,13 +85,14 @@ class TestUtilsMethods(unittest.TestCase): n_value = np.array([1, 2, np.inf, 4]) b_value = np.array([1, 2, 3, 4]) error_flag = True + error_file = 'fake file' - n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value, 
error_flag=error_flag) + n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value, error_flag=error_flag, error_file=error_file) self.assertEqual(n_value, CompareConst.READ_NONE) self.assertEqual(b_value, CompareConst.READ_NONE) self.assertTrue(error_flag) - self.assertEqual(err_msg, CompareConst.NO_BENCH) + self.assertEqual(err_msg, "Dump file: fake file not found or read failed.") def test_get_error_flag_and_msg_none(self): n_value = np.array([]) -- Gitee