From affdd2edef5f8c5000583d316fe4b93a75a19a35 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 20 Feb 2025 15:46:26 +0800 Subject: [PATCH 01/25] compare add euclidean distance --- .../msprobe/core/common/const.py | 8 +- .../msprobe/core/compare/acc_compare.py | 11 ++- .../msprobe/core/compare/highlight.py | 13 ++- .../core/compare/multiprocessing_compute.py | 2 + .../msprobe/core/compare/npy_compare.py | 51 ++++++++++++ .../msprobe/core/compare/utils.py | 8 +- .../compare/test_acc_compare_npy_compare.py | 80 ++++++++++++++++++- 7 files changed, 160 insertions(+), 13 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index d9623b80712..bf0883667c0 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -256,6 +256,7 @@ class CompareConst: MEAN_DIFF = "Mean diff" NORM_DIFF = "L2norm diff" COSINE = "Cosine" + EUC_DIST = "EucDist" MAX_ABS_ERR = "MaxAbsErr" MAX_RELATIVE_ERR = "MaxRelativeErr" MIN_RELATIVE_ERR = "MinRelativeErr" @@ -330,8 +331,8 @@ class CompareConst: ULP_ERR_STATUS = "ulp_err_status" COMPARE_RESULT_HEADER = [ - NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, COSINE, MAX_ABS_ERR, MAX_RELATIVE_ERR, - ONE_THOUSANDTH_ERR_RATIO, FIVE_THOUSANDTHS_ERR_RATIO, + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, COSINE, EUC_DIST, + MAX_ABS_ERR, MAX_RELATIVE_ERR, ONE_THOUSANDTH_ERR_RATIO, FIVE_THOUSANDTHS_ERR_RATIO, NPU_MAX, NPU_MIN, NPU_MEAN, NPU_NORM, BENCH_MAX, BENCH_MIN, BENCH_MEAN, BENCH_NORM, ACCURACY, ERROR_MESSAGE ] @@ -357,7 +358,8 @@ class CompareConst: Const.MD5: MD5_COMPARE_RESULT_HEADER } - ALL_COMPARE_INDEX = [COSINE, MAX_ABS_ERR, MAX_RELATIVE_ERR, ONE_THOUSANDTH_ERR_RATIO, FIVE_THOUSANDTHS_ERR_RATIO] + ALL_COMPARE_INDEX = [COSINE, EUC_DIST, MAX_ABS_ERR, MAX_RELATIVE_ERR, ONE_THOUSANDTH_ERR_RATIO, + FIVE_THOUSANDTHS_ERR_RATIO] SUMMARY_COMPARE_INDEX = [MAX_DIFF, MIN_DIFF, MEAN_DIFF, NORM_DIFF, MAX_RELATIVE_ERR, MIN_RELATIVE_ERR, MEAN_RELATIVE_ERR, NORM_RELATIVE_ERR] diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 55229d72657..0672e32404c 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -456,6 +456,7 @@ class Comparator: def compare_ops(self, idx, dump_path_dict, result_df, lock, input_param): cos_result = [] + euc_dist_result = [] max_err_result = [] max_relative_err_result = [] err_mess = [] @@ -469,8 +470,8 @@ class Comparator: if is_print_compare_log: logger.info("start compare: {}".format(npu_op_name)) - cos_sim, max_abs_err, max_relative_err, one_thousand_err_ratio, five_thousand_err_ratio, err_msg = \ - self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param, bench_data) + cos_sim, euc_dist, max_abs_err, max_relative_err, one_thousand_err_ratio, five_thousand_err_ratio, err_msg \ + = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param, bench_data) if is_print_compare_log: logger.info( @@ -479,6 +480,7 @@ class Comparator: "five_thousand_err_ratio {}".format(npu_op_name, cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio)) cos_result.append(cos_sim) + euc_dist_result.append(euc_dist) max_err_result.append(max_abs_err) max_relative_err_result.append(max_relative_err) err_mess.append(err_msg) @@ -487,6 +489,7 @@ class Comparator: cr = ComparisonResult( cos_result=cos_result, + euc_dist_result=euc_dist_result, max_err_result=max_err_result, max_relative_err_result=max_relative_err_result, err_msgs=err_mess, @@ -496,9 +499,9 @@ class Comparator: return _save_cmp_result(idx, cr, result_df, lock) - def do_multi_process(self, input_parma, result_df): + def do_multi_process(self, input_param, result_df): try: - result_df = _handle_multi_process(self.compare_ops, input_parma, result_df, + result_df = _handle_multi_process(self.compare_ops, input_param, result_df, multiprocessing.Manager().RLock()) return result_df except ValueError as e: diff --git a/debug/accuracy_tools/msprobe/core/compare/highlight.py b/debug/accuracy_tools/msprobe/core/compare/highlight.py index cf3e1c4c03e..d95729c6d85 100644 --- a/debug/accuracy_tools/msprobe/core/compare/highlight.py +++ b/debug/accuracy_tools/msprobe/core/compare/highlight.py @@ -17,6 +17,7 @@ import abc import math import multiprocessing import re +from abc import ABC from collections import namedtuple import numpy as np @@ -97,6 +98,13 @@ class CheckCosineSimilarity(HighlightCheck): "compared to the input/parameters's") +class CheckEuclideanDistance(HighlightCheck): + """检查欧式距离""" + + def apply(self, info, color_columns, dump_mode): + pass + + class CheckMaxRelativeDiff(HighlightCheck): """检查最大相对差异""" @@ -146,11 +154,14 @@ class HighlightRules: } # 用于比较输入和输出的规则 + # 真实数据检查规则 compare_rules = { "check_order_magnitude": CheckOrderMagnitude(), "check_one_thousand_error": CheckOneThousandErrorRatio(), - "check_cosine_similarity": CheckCosineSimilarity() + "check_cosine_similarity": CheckCosineSimilarity(), + "check_euclidean_distance": CheckEuclideanDistance() } + # 统计量数据检查规则 summary_compare_rules = { "check_order_magnitude": CheckOrderMagnitude(), "check_max_relative_diff": CheckMaxRelativeDiff(), diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py index c2c1461e452..560a6f603eb 100644 --- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py @@ -110,6 +110,7 @@ def read_dump_data(result_df): @dataclass class ComparisonResult: cos_result: list + euc_dist_result: list max_err_result: list max_relative_err_result: list err_msgs: list @@ -135,6 +136,7 @@ def _save_cmp_result(offset, result: ComparisonResult, result_df, lock): for i, _ in enumerate(result.cos_result): process_index = i + offset result_df.loc[process_index, CompareConst.COSINE] = result.cos_result[i] + result_df.loc[process_index, CompareConst.EUC_DIST] = result.euc_dist_result[i] result_df.loc[process_index, CompareConst.MAX_ABS_ERR] = result.max_err_result[i] result_df.loc[process_index, CompareConst.MAX_RELATIVE_ERR] = result.max_relative_err_result[i] result_df.loc[process_index, CompareConst.ERROR_MESSAGE] = result.err_msgs[i] diff --git a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py index c551985780c..db391726c79 100644 --- a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py @@ -168,6 +168,7 @@ def statistics_data_check(result_dict): class TensorComparisonBasic(abc.ABC): """NPU和bench中npy数据的比较模板""" + @abc.abstractmethod def apply(self, n_value, b_value, relative_err): raise NotImplementedError @@ -190,6 +191,7 @@ def get_relative_err(n_value, b_value): class GetCosineSimilarity(TensorComparisonBasic): """计算cosine相似度""" + @staticmethod def correct_data(result): if result == CompareConst.NAN: @@ -224,8 +226,54 @@ class GetCosineSimilarity(TensorComparisonBasic): return result, "" +class GetEuclideanDistance(TensorComparisonBasic): + """计算欧式距离""" + + def apply(self, n_value, b_value, relative_err): + msg = '' + + # 检查输入维度是否一致 + if n_value.shape != b_value.shape: + msg = f"Cannot compare by Euclidean Distance, shapes of tensors do not match: \ + npu:{n_value.shape} vs bench:{b_value.shape}" + return CompareConst.UNSUPPORTED, msg + + # 检查输入是否为空 + if n_value.size == 0 or b_value.size == 0: + msg = f"Cannot compare by Euclidean Distance, sizes of tensors must not be empty: \ + npu:{n_value.size} vs bench:{b_value.size}" + return CompareConst.NAN, msg + + # 检查是否包含 NaN 或 Inf + if np.any(np.isnan(n_value)) or np.any(np.isnan(b_value)): + msg = "Tensor contains NaN values." + return CompareConst.NAN, msg + if np.any(np.isinf(n_value)) or np.any(np.isinf(b_value)): + msg = "Tensor contains Inf values." + return CompareConst.NAN, msg + + # 处理零向量 + if np.all(n_value == 0) and np.all(b_value == 0): + return 0.0, "Zero tensors" + + # 输入为标量 + if np.ndim(n_value) == 0 or np.ndim(b_value) == 0: + msg = "Cannot compare by Euclidean Distance, input must be a vector, not a scalar." + return CompareConst.UNSUPPORTED, msg + + # 大数值溢出 + if np.any(np.abs(n_value) > 1e10) or np.any(np.abs(b_value) > 1e10): + msg = "tensors's values are large, which may cause overflow." + + # 计算欧式距离 + distance = np.linalg.norm(n_value - b_value) + + return distance, msg + + class GetMaxAbsErr(TensorComparisonBasic): """计算最大绝对误差""" + def apply(self, n_value, b_value, relative_err): temp_res = n_value - b_value max_value = np.max(np.abs(temp_res)) @@ -237,6 +285,7 @@ class GetMaxAbsErr(TensorComparisonBasic): class GetMaxRelativeErr(TensorComparisonBasic): """计算最大相对误差""" + def apply(self, n_value, b_value, relative_err): max_relative_err = np.max(np.abs(relative_err)) if np.isnan(max_relative_err): @@ -247,6 +296,7 @@ class GetMaxRelativeErr(TensorComparisonBasic): class GetErrRatio(TensorComparisonBasic): """计算相对误差小于指定阈值(千分之一、千分之五)的比例""" + def __init__(self, threshold): self.threshold = threshold @@ -264,6 +314,7 @@ class GetErrRatio(TensorComparisonBasic): class CompareOps: compare_ops = { "cosine_similarity": GetCosineSimilarity(), + "euclidean_distance": GetEuclideanDistance(), "max_abs_error": GetMaxAbsErr(), "max_relative_error": GetMaxRelativeErr(), "one_thousand_err_ratio": GetErrRatio(CompareConst.THOUSAND_RATIO_THRESHOLD), diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index a2edf57e5bb..72b75ab254e 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -285,9 +285,9 @@ def result_item_init(n_info, b_info, dump_mode): md5_compare_result = CompareConst.PASS if n_info.struct[2] == b_info.struct[2] else CompareConst.DIFF result_item.extend([n_info.struct[2], b_info.struct[2], md5_compare_result]) elif dump_mode == Const.SUMMARY: - result_item.extend([" "] * 8) + result_item.extend([" "] * 8) # 8个统计量数据情况的比对指标 else: - result_item.extend([" "] * 5) + result_item.extend([" "] * 6) # 6个真实数据情况的比对指标 else: err_msg = "index out of bounds error will occur in result_item_init, please check!\n" \ f"npu_info_struct is {n_info.struct}\n" \ @@ -453,9 +453,9 @@ def get_un_match_accuracy(result, n_dict, dump_mode): result.append(result_item) continue if dump_mode == Const.SUMMARY: - result_item.extend([CompareConst.N_A] * 8) + result_item.extend([CompareConst.N_A] * 8) # 8个统计量数据情况的比对指标 if dump_mode == Const.ALL: - result_item.extend([CompareConst.N_A] * 5) + result_item.extend([CompareConst.N_A] * 6) # 6个真实数据情况的比对指标 npu_summary_data = safe_get_value(summary_reorder, index, "summary_reorder") bench_summary_data = [CompareConst.N_A] * 4 diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py index aec6cdc5117..cee6d5565bf 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py @@ -20,7 +20,7 @@ from unittest.mock import patch from msprobe.core.common.const import CompareConst from msprobe.core.compare.npy_compare import handle_inf_nan, reshape_value, get_error_flag_and_msg, \ npy_data_check, statistics_data_check, get_relative_err, GetCosineSimilarity, GetMaxAbsErr, GetMaxRelativeErr, \ - GetErrRatio, error_value_process, compare_ops_apply + GetErrRatio, error_value_process, compare_ops_apply, GetEuclideanDistance op_name = 'Functional.conv2d.0.backward.input.0' @@ -473,3 +473,81 @@ class TestUtilsMethods(unittest.TestCase): a, b = compare_ops_apply(n_value, b_value, error_flag, err_msg) self.assertEqual(a, [1.0, 0.0, 0.0, 1.0, 1.0]) self.assertEqual(b, '') + + +class TestGetEuclideanDistance(unittest.TestCase): + + def setUp(self): + self.euc_distance = GetEuclideanDistance() + + def test_shape_mismatch(self): + # 测试当两个张量的形状不匹配时,返回 UNSUPPORTED + n_value = np.array([1, 2, 3]) + b_value = np.array([1, 2]) + + result, msg = self.euc_distance.apply(n_value, b_value, None) + self.assertEqual(result, CompareConst.UNSUPPORTED) + self.assertIn("Cannot compare by Euclidean Distance", msg) + + def test_empty_tensor(self): + # 测试当输入的张量为空时,返回 NAN + n_value = np.array([]) + b_value = np.array([1, 2, 3]) + + result, msg = self.euc_distance.apply(n_value, b_value, None) + self.assertEqual(result, CompareConst.NAN) + self.assertIn("sizes of tensors must not be empty", msg) + + def test_nan_in_tensor(self): + # 测试当张量包含 NaN 值时,返回 NAN + n_value = np.array([1, 2, np.nan]) + b_value = np.array([1, 2, 3]) + + result, msg = self.euc_distance.apply(n_value, b_value, None) + self.assertEqual(result, CompareConst.NAN) + self.assertIn("Tensor contains NaN values", msg) + + def test_inf_in_tensor(self): + # 测试当张量包含 Inf 值时,返回 NAN + n_value = np.array([1, 2, np.inf]) + b_value = np.array([1, 2, 3]) + + result, msg = self.euc_distance.apply(n_value, b_value, None) + self.assertEqual(result, CompareConst.NAN) + self.assertIn("Tensor contains Inf values", msg) + + def test_zero_tensors(self): + # 测试两个零张量的欧式距离 + n_value = np.array([0, 0, 0]) + b_value = np.array([0, 0, 0]) + + result, msg = self.euc_distance.apply(n_value, b_value, None) + self.assertEqual(result, 0.0) + self.assertIn("Zero tensors", msg) + + def test_scalars(self): + # 测试当输入是标量时,返回 UNSUPPORTED + n_value = np.array(5) + b_value = np.array(10) + + result, msg = self.euc_distance.apply(n_value, b_value, None) + self.assertEqual(result, CompareConst.UNSUPPORTED) + self.assertIn("input must be a vector, not a scalar", msg) + + def test_large_values(self): + # 测试当张量包含大值时,应该返回大数值溢出的警告 + n_value = np.array([1e11, 1e11, 1e11]) + b_value = np.array([1e10, 1e10, 1e10]) + + result, msg = self.euc_distance.apply(n_value, b_value, None) + self.assertIn("tensors's values are large", msg) + + def test_euclidean_distance(self): + # 测试计算两个张量之间的欧式距离 + n_value = np.array([1, 2, 3]) + b_value = np.array([4, 5, 6]) + + result, msg = self.euc_distance.apply(n_value, b_value, None) + expected_distance = np.linalg.norm(n_value - b_value) + self.assertEqual(result, expected_distance) + self.assertEqual(msg, '') -- Gitee From 820bd952616ff9b6500f436915a1b2b8f03b1334 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 24 Feb 2025 10:58:33 +0800 Subject: [PATCH 02/25] compare add euclidean distance --- .../msprobe/core/compare/acc_compare.py | 9 ++-- .../core/compare/multiprocessing_compute.py | 10 ++--- .../test/core_ut/compare/test_acc_compare.py | 38 +++++++++------- .../compare/test_acc_compare_npy_compare.py | 4 +- .../core_ut/compare/test_acc_compare_utils.py | 44 +++++++++++-------- .../test_cmp_multiprocessing_compute.py | 20 +++++---- 6 files changed, 69 insertions(+), 56 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 0672e32404c..4f4bdd0b105 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -459,9 +459,10 @@ class Comparator: euc_dist_result = [] max_err_result = [] max_relative_err_result = [] - err_mess = [] one_thousand_err_ratio_result = [] five_thousand_err_ratio_result = [] + err_mess = [] + is_print_compare_log = input_param.get("is_print_compare_log") bench_data = load_json(input_param.get("bench_json_path")).get('data') for i in range(len(result_df)): @@ -483,18 +484,18 @@ class Comparator: euc_dist_result.append(euc_dist) max_err_result.append(max_abs_err) max_relative_err_result.append(max_relative_err) - err_mess.append(err_msg) one_thousand_err_ratio_result.append(one_thousand_err_ratio) five_thousand_err_ratio_result.append(five_thousand_err_ratio) + err_mess.append(err_msg) cr = ComparisonResult( cos_result=cos_result, euc_dist_result=euc_dist_result, max_err_result=max_err_result, max_relative_err_result=max_relative_err_result, - err_msgs=err_mess, one_thousand_err_ratio_result=one_thousand_err_ratio_result, - five_thousand_err_ratio_result=five_thousand_err_ratio_result + five_thousand_err_ratio_result=five_thousand_err_ratio_result, + err_msgs=err_mess ) return _save_cmp_result(idx, cr, result_df, lock) diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py index 560a6f603eb..20849afa920 100644 --- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -113,9 +113,9 @@ class ComparisonResult: euc_dist_result: list max_err_result: list max_relative_err_result: list - err_msgs: list one_thousand_err_ratio_result: list five_thousand_err_ratio_result: list + err_msgs: list def _save_cmp_result(offset, result: ComparisonResult, result_df, lock): @@ -139,13 +139,13 @@ def _save_cmp_result(offset, result: ComparisonResult, result_df, lock): result_df.loc[process_index, CompareConst.EUC_DIST] = result.euc_dist_result[i] result_df.loc[process_index, CompareConst.MAX_ABS_ERR] = result.max_err_result[i] result_df.loc[process_index, CompareConst.MAX_RELATIVE_ERR] = result.max_relative_err_result[i] - result_df.loc[process_index, CompareConst.ERROR_MESSAGE] = result.err_msgs[i] - result_df.loc[process_index, CompareConst.ACCURACY] = ( - check_accuracy(result.cos_result[i], result.max_err_result[i])) result_df.loc[process_index, CompareConst.ONE_THOUSANDTH_ERR_RATIO] = ( result.one_thousand_err_ratio_result)[i] result_df.loc[process_index, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] = ( result.five_thousand_err_ratio_result)[i] + result_df.loc[process_index, CompareConst.ACCURACY] = ( + check_accuracy(result.cos_result[i], result.max_err_result[i])) + result_df.loc[process_index, CompareConst.ERROR_MESSAGE] = result.err_msgs[i] return result_df except ValueError as e: logger.error('result dataframe is not found.') diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py index b4566fcfe6f..c882e331f55 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py @@ -191,17 +191,21 @@ summary_line_3 = ['Functional_batch_norm_0_forward.output.2', 'Functional_batch_ 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0, 0, 0, 0, 2, 0, 1, 1, 1, 1, 1, 1, 'Warning', ''] line_input = ['Functional.batch.norm.0.forward.input.0', 'Functional.batch.norm.0.forward.input.0', 'torch.float16', - 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 1, 1, 1, 0.95, 1, 1, 1, 1, 1, 1.01, 1, 1, 1, + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 1, 0.5, 1, 1, 0.95, 1, + 1, 1, 1, 1, 1.01, 1, 1, 1, 'Yes', ''] line_1 = ['Functional.batch.norm.0.forward.output.0', 'Functional.batch.norm.0.forward.output.0', 'torch.float16', - 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 1, 1, 0.59, 1, 'nan', 0, 1, 1, 19, 1, 1, 1, - 'Warning', ''] + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 0.5, 1, 1, 0.59, 1, + 'nan', 0, 1, 1, 19, 1, 1, 1, + 'Yes', ''] line_2 = ['Functional.batch.norm.0.forward.output.1', 'Functional.batch.norm.0.forward.output.1', 'torch.float16', - 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.9, 1, 1, 0.8, 1, 0, 0.12, 0, 1, 1, 0.1, 1, 1, 1, - 'Warning', ''] + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.9, 0.5, 1, 1, 0.8, 1, + 0, 0.12, 0, 1, 1, 0.1, 1, 1, + 'Yes', ''] line_3 = ['Functional.batch.norm.0.forward.output.2', 'Functional.batch.norm.0.forward.output.2', 'torch.float16', - 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 1.1e+10, 1, 0.85, 1, 9, 0.12, 0, 1, 1, 0.1, 1, - 1, 1, 'Warning', ''] + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 0.5, 1.1e+10, 1, 0.85, 1, + 9, 0.12, 0, 1, 1, 0.1, 1, 1, + 'Yes', ''] op_data = { 'input_args': [{'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [16, 1, 3, 3], @@ -363,7 +367,7 @@ class TestUtilsMethods(unittest.TestCase): 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', 'File']] result_all = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', - 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', + 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', 'File', '-1']] columns_md5_stack_mode_true = CompareConst.MD5_COMPARE_RESULT_HEADER + ['NPU_Stack_Info'] result_table_md5_true = pd.DataFrame(result_md5, columns=columns_md5_stack_mode_true, dtype=object) @@ -403,10 +407,10 @@ class TestUtilsMethods(unittest.TestCase): 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '']] result_all_test = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', - 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', + 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '', '-1']] result_all = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', - 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', + 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1']] columns_md5_stack_mode_true = CompareConst.MD5_COMPARE_RESULT_HEADER result_table_md5_true = pd.DataFrame(result_md5, columns=columns_md5_stack_mode_true, dtype='object') @@ -632,10 +636,10 @@ class TestUtilsMethods(unittest.TestCase): def test_do_multi_process(self): data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1']] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1']] o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', - 'torch.float32', 'torch.float32', [2, 2], [2, 2], 'unsupported', 'unsupported', 'unsupported', - 'unsupported', 'unsupported', + 'torch.float32', 'torch.float32', [2, 2], [2, 2], + 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 1, 1, 1, 1, 1, 1, 1, 1, 'None', 'No bench data matched.', '-1']] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) @@ -669,7 +673,7 @@ class TestUtilsMethods(unittest.TestCase): result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, {}) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 'No bench data matched.']) + 'unsupported', 'No bench data matched.']) def test_compare_by_op_2(self): npu_op_name = 'Functional.linear.0.forward.input.0' @@ -691,7 +695,7 @@ class TestUtilsMethods(unittest.TestCase): {'Functional.linear.0.forward': {'input_args': [ {'data_name': 'Functional.linear.0.forward.input.0.pt'}]}}) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - f'Dump file: {pt_path} not found.']) + 'unsupported', f'Dump file: {pt_path} not found.']) pt_name = 'Functional.linear.0.forward.input.0.pt' pt_path = os.path.join(base_dir, pt_name) @@ -699,13 +703,13 @@ class TestUtilsMethods(unittest.TestCase): input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir} result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, {}) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 'Bench does not have data file.']) + 'unsupported', 'Bench does not have data file.']) generate_pt(base_dir) result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, {'Functional.linear.0.forward': {'input_args': [ {'data_name': 'Functional.linear.0.forward.input.0.pt'}]}}) - self.assertEqual(result, [1.0, 0.0, 0.0, 1.0, 1.0, '']) + self.assertEqual(result, [1.0, 0.0, 0.0, 0.0, 1.0, 1.0, '']) def test_get_bench_data_name_input(self): bench_op_name = "Functional.linear.0.forward.input.0" diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py index cee6d5565bf..5c10d96cb8d 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py @@ -471,7 +471,7 @@ class TestUtilsMethods(unittest.TestCase): error_flag = False err_msg = '' a, b = compare_ops_apply(n_value, b_value, error_flag, err_msg) - self.assertEqual(a, [1.0, 0.0, 0.0, 1.0, 1.0]) + self.assertEqual(a, [1.0, 0.0, 0.0, 0.0, 1.0, 1.0]) self.assertEqual(b, '') @@ -492,7 +492,7 @@ class TestGetEuclideanDistance(unittest.TestCase): def test_empty_tensor(self): # 测试当输入的张量为空时,返回 NAN n_value = np.array([]) - b_value = np.array([1, 2, 3]) + b_value = np.array([]) result, msg = self.euc_distance.apply(n_value, b_value, None) self.assertEqual(result, CompareConst.NAN) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index ab8703dcd35..2e9a4657266 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -221,28 +221,34 @@ o_result_unmatch_2 = [ 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None'] ] o_result_unmatch_3 = [ - ['Functional.conv2d.0.forward.input.0', 'N/A', 'torch.float32', 'N/A', [1, 1, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', - 'N/A', 'N/A', 3.029174327850342, -2.926689624786377, -0.06619918346405029, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + ['Functional.conv2d.0.forward.input.0', 'N/A', 'torch.float32', 'N/A', [1, 1, 28, 28], 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 3.029174327850342, -2.926689624786377, -0.06619918346405029, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], - ['Functional.conv2d.0.forward.input.1', 'N/A', 'torch.float32', 'N/A', [16, 1, 5, 5], 'N/A', 'N/A', 'N/A', 'N/A', - 'N/A', 'N/A', 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + ['Functional.conv2d.0.forward.input.1', 'N/A', 'torch.float32', 'N/A', [16, 1, 5, 5], 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], - ['Functional.conv2d.0.forward.input.2', 'N/A', 'torch.float32', 'N/A', [16], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 'N/A', 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + ['Functional.conv2d.0.forward.input.2', 'N/A', 'torch.float32', 'N/A', [16], 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], - ['Functional.conv2d.0.forward.parameters.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', - 'N/A', 'N/A', - 'N/A', 'N/A', 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], - ['Functional.conv2d.0.forward.parameters.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', - 'N/A', - 'N/A', 'N/A', 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], - ['Functional.conv2d.0.forward.output.0', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', - 'N/A', 'N/A', 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + ['Functional.conv2d.0.forward.parameters.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], + ['Functional.conv2d.0.forward.parameters.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], + ['Functional.conv2d.0.forward.output.0', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], - ['Functional.conv2d.0.parameters_grad.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', - 'N/A', 'N/A', 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], - ['Functional.conv2d.0.parameters_grad.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', - 'N/A', 'N/A', 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'] + ['Functional.conv2d.0.parameters_grad.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], + ['Functional.conv2d.0.parameters_grad.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'] ] # test_merge_tensor @@ -558,7 +564,7 @@ class TestUtilsMethods(unittest.TestCase): dump_mode = Const.ALL result_item = result_item_init(n_info, b_info, dump_mode) self.assertEqual(result_item, ['Tensor.add.0.forward.input.0', 'Tensor.add.0.forward.input.0', - 'torch.float32', 'torch.float32', [96], [96], ' ', ' ', ' ', ' ', ' ']) + 'torch.float32', 'torch.float32', [96], [96], ' ', ' ', ' ', ' ', ' ', ' ']) dump_mode = Const.SUMMARY result_item = result_item_init(n_info, b_info, dump_mode) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py index 9c2dea835fe..3fa16b0d9d4 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py @@ -16,12 +16,12 @@ from test_acc_compare import generate_dump_json data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1']] o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', + 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 1, 1, 1, 1, 1, 1, 1, 1, 'None', 'No bench data matched.', '-1']] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] @@ -34,9 +34,9 @@ class TestUtilsMethods(unittest.TestCase): def setUp(self): self.result_df = pd.DataFrame(columns=[ - CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, - CompareConst.ERROR_MESSAGE, CompareConst.ACCURACY, - CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO + CompareConst.COSINE, CompareConst.EUC_DIST, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, + CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO, + CompareConst.ACCURACY, CompareConst.ERROR_MESSAGE ]) os.makedirs(base_dir, mode=0o750, exist_ok=True) self.lock = threading.Lock() @@ -72,9 +72,10 @@ class TestUtilsMethods(unittest.TestCase): cos_result=[0.99, 0.98], max_err_result=[0.01, 0.02], max_relative_err_result=[0.001, 0.002], - err_msgs=['', 'Error in comparison'], + euc_dist_result=[0.5, 0.49], one_thousand_err_ratio_result=[0.1, 0.2], - five_thousand_err_ratio_result=[0.05, 0.1] + five_thousand_err_ratio_result=[0.05, 0.1], + err_msgs=['', 'Error in comparison'] ) offset = 0 updated_df = _save_cmp_result(offset, comparison_result, self.result_df, self.lock) @@ -88,9 +89,10 @@ class TestUtilsMethods(unittest.TestCase): cos_result=[0.99], max_err_result=[], max_relative_err_result=[0.001], - err_msgs=[''], + euc_dist_result=[0.5], one_thousand_err_ratio_result=[0.1], - five_thousand_err_ratio_result=[0.05] + five_thousand_err_ratio_result=[0.05], + err_msgs=[''] ) with self.assertRaises(CompareException) as context: _save_cmp_result(0, comparison_result, self.result_df, self.lock) -- Gitee From 1fc978d5cd78390543e6b092d017b89d1951aaeb Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 24 Feb 2025 11:14:06 +0800 Subject: [PATCH 03/25] compare add euclidean distance --- .../msprobe/core/compare/npy_compare.py | 2 +- .../core_ut/compare/test_cmp_highlight.py | 30 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py index db391726c79..7147f4d3dba 100644 --- a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py index f561a3e05ec..3261bce5d6d 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_highlight.py @@ -26,7 +26,7 @@ def generate_result_xlsx(base_dir): data_path = os.path.join(base_dir, 'target_result.xlsx') data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] ] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) @@ -101,8 +101,8 @@ class TestUtilsMethods(unittest.TestCase): self.assertEqual(result, None) def test_CheckOneThousandErrorRatio_str(self): - api_in = [1, 1, 1, 1, 1, 1, 1, 1, 1, "unsupported"] - api_out = [1, 1, 1, 1, 1, 1, 1, 1, 1, "unsupported"] + api_in = [1, 1, 1, 1, 1, 1, 0.9, 0.5, 1, 1, "unsupported"] + api_out = [1, 1, 1, 1, 1, 1, 0.9, 0.5, 1, 1, "unsupported"] info = (api_in, api_out, 1) color_columns = () dump_mode = Const.ALL @@ -113,8 +113,8 @@ class TestUtilsMethods(unittest.TestCase): @patch("msprobe.core.compare.highlight.add_highlight_row_info") def test_CheckOneThousandErrorRatio_red(self, mock_add_highlight_row_info): - api_in = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - api_out = [1, 1, 1, 1, 1, 1, 1, 1, 1, 0.5] + api_in = [1, 1, 1, 1, 1, 1, 0.9, 0.5, 1, 1, 1] + api_out = [1, 1, 1, 1, 1, 1, 0.9, 0.5, 1, 1, 0.5] info = (api_in, api_out, 1) ColorColumns = namedtuple('ColorColumns', ['red', 'yellow']) color_columns = ColorColumns(red=[], yellow=[]) @@ -315,7 +315,7 @@ class TestUtilsMethods(unittest.TestCase): columns = CompareConst.COMPARE_RESULT_HEADER data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', ''] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', ''] ] result_df = pd.DataFrame(data, columns=columns) @@ -329,7 +329,7 @@ class TestUtilsMethods(unittest.TestCase): def test_highlight_rows_xlsx_red(self): data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] ] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) @@ -342,7 +342,7 @@ class TestUtilsMethods(unittest.TestCase): def test_highlight_rows_xlsx_yellow(self): data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] ] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) @@ -356,7 +356,7 @@ class TestUtilsMethods(unittest.TestCase): def test_highlight_rows_xlsx_malicious_columns(self, mock_save_book): data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] ] columns = CompareConst.COMPARE_RESULT_HEADER + ['=Data_name'] result_df = pd.DataFrame(data, columns=columns) @@ -378,10 +378,10 @@ class TestUtilsMethods(unittest.TestCase): def test_highlight_rows_xlsx_malicious_type(self, mock_save_book): data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', '=torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'], + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'], ['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', '=torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] ] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) @@ -416,10 +416,10 @@ class TestUtilsMethods(unittest.TestCase): def test_update_highlight_err_msg(self): data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'], + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'], ['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1'] ] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) @@ -433,10 +433,10 @@ class TestUtilsMethods(unittest.TestCase): t_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', 'a\nb', '-1'], + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', 'a\nb', '-1'], ['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', 'd', '-1'] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', 'd', '-1'] ] target_result_df = pd.DataFrame(t_data, columns=columns) self.assertTrue(result_df.equals(target_result_df)) -- Gitee From a01d5cb49d528302c01c731c6efc32874aa9d2ac Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 24 Feb 2025 11:26:18 +0800 Subject: [PATCH 04/25] compare add euclidean distance --- debug/accuracy_tools/msprobe/core/compare/acc_compare.py | 4 ++-- debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 4f4bdd0b105..f0ac97a0293 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -311,9 +311,9 @@ class Comparator: ] if self.dump_mode == Const.SUMMARY: - result_item = base_result_item + [" "] * 8 + result_item = base_result_item + [" "] * 8 # 8个统计量数据情况的比对指标 else: - result_item = base_result_item + [" "] * 5 + result_item = base_result_item + [" "] * 6 # 6个真实数据情况的比对指标 npu_summary_data = npu_ops_all.get(ms_op_name).get("summary") result_item.extend(npu_summary_data) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index 8509a7f38ad..de507e87665 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -125,7 +125,8 @@ class MSComparator(Comparator): result_df.loc[warning_flag, CompareConst.RESULT] = CompareConst.WARNING result_df.loc[warning_flag, CompareConst.ERROR_MESSAGE] = 'Need double check api accuracy.' else: - fill_cols = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, + fill_cols = [CompareConst.COSINE, CompareConst.EUC_DIST, + CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO, CompareConst.ERROR_MESSAGE] result_df.loc[~condition_no_bench, fill_cols] = '' -- Gitee From 739fe1cad25c7c7d8aeaf53e773a4d07aaeea429 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 24 Feb 2025 15:25:43 +0800 Subject: [PATCH 05/25] compare add euclidean distance --- debug/accuracy_tools/msprobe/core/common/const.py | 2 +- .../msprobe/mindspore/compare/ms_graph_compare.py | 11 ++++++----- .../mindspore_ut/compare/test_ms_graph_compare.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index bf0883667c0..b60d06d10bf 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -469,7 +469,7 @@ class CompareConst: BENCH_MEAN: None, BENCH_NORM: None, ACCURACY: '', ERROR_MESSAGE: '' } MS_GRAPH_NPY = { - COSINE: None, MAX_ABS_ERR: None, MAX_RELATIVE_ERR: None, ONE_THOUSANDTH_ERR_RATIO: None, + COSINE: None, EUC_DIST: None, MAX_ABS_ERR: None, MAX_RELATIVE_ERR: None, ONE_THOUSANDTH_ERR_RATIO: None, FIVE_THOUSANDTHS_ERR_RATIO: None } MS_GRAPH_STATISTIC = { diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py index 701988ba483..153f4fd6552 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_graph_compare.py @@ -195,11 +195,12 @@ class GraphMSComparator: if not error_flag: result_list, err_msg = compare_ops_apply(n_value, b_value, False, "") result_dict[CompareConst.COSINE] = result_list[0] - result_dict[CompareConst.MAX_ABS_ERR] = result_list[1] - result_dict[CompareConst.MAX_RELATIVE_ERR] = result_list[2] - result_dict[CompareConst.ONE_THOUSANDTH_ERR_RATIO] = result_list[3] - result_dict[CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] = result_list[4] - result_dict[CompareConst.ACCURACY] = check_accuracy(result_list[0], result_list[1]) + result_dict[CompareConst.EUC_DIST] = result_list[1] + result_dict[CompareConst.MAX_ABS_ERR] = result_list[2] + result_dict[CompareConst.MAX_RELATIVE_ERR] = result_list[3] + result_dict[CompareConst.ONE_THOUSANDTH_ERR_RATIO] = result_list[4] + result_dict[CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] = result_list[5] + result_dict[CompareConst.ACCURACY] = check_accuracy(result_list[0], result_list[2]) result_dict[CompareConst.ERROR_MESSAGE] = err_msg return pd.Series(result_dict) diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_graph_compare.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_graph_compare.py index e3fd9348efe..c2e7c9368c3 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_graph_compare.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_graph_compare.py @@ -78,7 +78,7 @@ class TestMsGraphCompare(unittest.TestCase): result_correct = ( f"[['{npu_file_path}', '{bench_file_path}', dtype('float16'), dtype('float16'), (10, 10), (10, 10), " - f"44.0, 44.0, 44.0, inf, 44.0, 44.0, 44.0, inf, 'Yes', '', 1.0, 0.0, 0.0, 1.0, 1.0]]") + f"44.0, 44.0, 44.0, inf, 44.0, 44.0, 44.0, inf, 'Yes', '', 1.0, 0.0, 0.0, 0.0, 1.0, 1.0]]") self.assertNotEqual(len(files), 0) self.assertEqual(result, result_correct) -- Gitee From 56fbae42a7d74aa162eb8dac7fe82a30f9d5b5b2 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 24 Feb 2025 19:56:23 +0800 Subject: [PATCH 06/25] compare add euclidean distance --- .../core/compare/multiprocessing_compute.py | 9 ++- .../msprobe/core/compare/npy_compare.py | 36 +---------- .../compare/test_acc_compare_npy_compare.py | 62 ------------------- 3 files changed, 8 insertions(+), 99 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py index 20849afa920..f79671827c1 100644 --- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py @@ -15,8 +15,11 @@ import multiprocessing from dataclasses import dataclass +from functools import partial + import pandas as pd from tqdm import tqdm + from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException from msprobe.core.common.const import CompareConst @@ -44,7 +47,7 @@ def _handle_multi_process(func, input_parma, result_df, lock): progress_bar = tqdm(total=len(result_df), desc="API/Module Item Compare Process", unit="row", ncols=100) - def update_progress(size, progress_lock): + def update_progress(size, progress_lock, extra_param=None): with progress_lock: progress_bar.update(size) @@ -54,8 +57,10 @@ def _handle_multi_process(func, input_parma, result_df, lock): result = pool.apply_async(func, args=(idx, op_name_mapping_dict, df_chunk, lock, input_parma), error_callback=err_call, - callback=update_progress(chunk_size, lock)) + callback=partial(update_progress, chunk_size, lock) + ) results.append(result) + final_results = [r.get() for r in results] pool.close() pool.join() diff --git a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py index 7147f4d3dba..d060b4013f0 100644 --- a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py @@ -232,41 +232,7 @@ class GetEuclideanDistance(TensorComparisonBasic): def apply(self, n_value, b_value, relative_err): msg = '' - # 检查输入维度是否一致 - if n_value.shape != b_value.shape: - msg = f"Cannot compare by Euclidean Distance, shapes of tensors do not match: \ - npu:{n_value.shape} vs bench:{b_value.shape}" - return CompareConst.UNSUPPORTED, msg - - # 检查输入是否为空 - if n_value.size == 0 or b_value.size == 0: - msg = f"Cannot compare by Euclidean Distance, sizes of tensors must not be empty: \ - npu:{n_value.size} vs bench:{b_value.size}" - return CompareConst.NAN, msg - - # 检查是否包含 NaN 或 Inf - if np.any(np.isnan(n_value)) or np.any(np.isnan(b_value)): - msg = "Tensor contains NaN values." - return CompareConst.NAN, msg - if np.any(np.isinf(n_value)) or np.any(np.isinf(b_value)): - msg = "Tensor contains Inf values." - return CompareConst.NAN, msg - - # 处理零向量 - if np.all(n_value == 0) and np.all(b_value == 0): - return 0.0, "Zero tensors" - - # 输入为标量 - if np.ndim(n_value) == 0 or np.ndim(b_value) == 0: - msg = "Cannot compare by Euclidean Distance, input must be a vector, not a scalar." - return CompareConst.UNSUPPORTED, msg - - # 大数值溢出 - if np.any(np.abs(n_value) > 1e10) or np.any(np.abs(b_value) > 1e10): - msg = "tensors's values are large, which may cause overflow." - - # 计算欧式距离 - distance = np.linalg.norm(n_value - b_value) + distance = np.linalg.norm(n_value - b_value, ord=2) return distance, msg diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py index 5c10d96cb8d..c9096694a77 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py @@ -480,68 +480,6 @@ class TestGetEuclideanDistance(unittest.TestCase): def setUp(self): self.euc_distance = GetEuclideanDistance() - def test_shape_mismatch(self): - # 测试当两个张量的形状不匹配时,返回 UNSUPPORTED - n_value = np.array([1, 2, 3]) - b_value = np.array([1, 2]) - - result, msg = self.euc_distance.apply(n_value, b_value, None) - self.assertEqual(result, CompareConst.UNSUPPORTED) - self.assertIn("Cannot compare by Euclidean Distance", msg) - - def test_empty_tensor(self): - # 测试当输入的张量为空时,返回 NAN - n_value = np.array([]) - b_value = np.array([]) - - result, msg = self.euc_distance.apply(n_value, b_value, None) - self.assertEqual(result, CompareConst.NAN) - self.assertIn("sizes of tensors must not be empty", msg) - - def test_nan_in_tensor(self): - # 测试当张量包含 NaN 值时,返回 NAN - n_value = np.array([1, 2, np.nan]) - b_value = np.array([1, 2, 3]) - - result, msg = self.euc_distance.apply(n_value, b_value, None) - self.assertEqual(result, CompareConst.NAN) - self.assertIn("Tensor contains NaN values", msg) - - def test_inf_in_tensor(self): - # 测试当张量包含 Inf 值时,返回 NAN - n_value = np.array([1, 2, np.inf]) - b_value = np.array([1, 2, 3]) - - result, msg = self.euc_distance.apply(n_value, b_value, None) - self.assertEqual(result, CompareConst.NAN) - self.assertIn("Tensor contains Inf values", msg) - - def test_zero_tensors(self): - # 测试两个零张量的欧式距离 - n_value = np.array([0, 0, 0]) - b_value = np.array([0, 0, 0]) - - result, msg = self.euc_distance.apply(n_value, b_value, None) - self.assertEqual(result, 0.0) - self.assertIn("Zero tensors", msg) - - def test_scalars(self): - # 测试当输入是标量时,返回 UNSUPPORTED - n_value = np.array(5) - b_value = np.array(10) - - result, msg = self.euc_distance.apply(n_value, b_value, None) - self.assertEqual(result, CompareConst.UNSUPPORTED) - self.assertIn("input must be a vector, not a scalar", msg) - - def test_large_values(self): - # 测试当张量包含大值时,应该返回大数值溢出的警告 - n_value = np.array([1e11, 1e11, 1e11]) - b_value = np.array([1e10, 1e10, 1e10]) - - result, msg = self.euc_distance.apply(n_value, b_value, None) - self.assertIn("tensors's values are large", msg) - def test_euclidean_distance(self): # 测试计算两个张量之间的欧式距离 n_value = np.array([1, 2, 3]) -- Gitee From 02fa286f4b0216080d35f8222ab2a3ef14e971a1 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 24 Feb 2025 19:58:16 +0800 Subject: [PATCH 07/25] compare add euclidean distance --- debug/accuracy_tools/msprobe/core/compare/highlight.py | 1 - 1 file changed, 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/highlight.py b/debug/accuracy_tools/msprobe/core/compare/highlight.py index d95729c6d85..ead0f6bda87 100644 --- a/debug/accuracy_tools/msprobe/core/compare/highlight.py +++ b/debug/accuracy_tools/msprobe/core/compare/highlight.py @@ -17,7 +17,6 @@ import abc import math import multiprocessing import re -from abc import ABC from collections import namedtuple import numpy as np -- Gitee From da88a94a3195b69aa43d946a764aac8c16a72854 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 24 Feb 2025 20:05:22 +0800 Subject: [PATCH 08/25] compare add euclidean distance --- debug/accuracy_tools/msprobe/core/compare/highlight.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/highlight.py b/debug/accuracy_tools/msprobe/core/compare/highlight.py index ead0f6bda87..1983313249f 100644 --- a/debug/accuracy_tools/msprobe/core/compare/highlight.py +++ b/debug/accuracy_tools/msprobe/core/compare/highlight.py @@ -97,13 +97,6 @@ class CheckCosineSimilarity(HighlightCheck): "compared to the input/parameters's") -class CheckEuclideanDistance(HighlightCheck): - """检查欧式距离""" - - def apply(self, info, color_columns, dump_mode): - pass - - class CheckMaxRelativeDiff(HighlightCheck): """检查最大相对差异""" @@ -157,8 +150,7 @@ class HighlightRules: compare_rules = { "check_order_magnitude": CheckOrderMagnitude(), "check_one_thousand_error": CheckOneThousandErrorRatio(), - "check_cosine_similarity": CheckCosineSimilarity(), - "check_euclidean_distance": CheckEuclideanDistance() + "check_cosine_similarity": CheckCosineSimilarity() } # 统计量数据检查规则 summary_compare_rules = { -- Gitee From b3278e3e41bfd7a38b14e27f1511948979076d47 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Tue, 25 Feb 2025 11:48:50 +0800 Subject: [PATCH 09/25] compare add euclidean distance --- .../msprobe/core/compare/npy_compare.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py index d060b4013f0..cfd0db3f1fe 100644 --- a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py @@ -70,7 +70,7 @@ def get_error_flag_and_msg(n_value, b_value, error_flag=False, error_file=None): error_flag = True return CompareConst.NONE, CompareConst.NONE, error_flag, err_msg if not n_value.shape: # 判断数据是否为0维张量 - err_msg = (f"This is type of 0-d tensor, can not calculate '{CompareConst.COSINE}', " + err_msg = (f"This is type of 0-d tensor, can not calculate '{CompareConst.COSINE}', {CompareConst.EUC_DIST}" f"'{CompareConst.ONE_THOUSANDTH_ERR_RATIO}' and '{CompareConst.FIVE_THOUSANDTHS_ERR_RATIO}'. ") error_flag = False # 0-d tensor 最大绝对误差、最大相对误差仍然支持计算,因此error_flag设置为False,不做统一处理 return n_value, b_value, error_flag, err_msg @@ -170,7 +170,7 @@ class TensorComparisonBasic(abc.ABC): """NPU和bench中npy数据的比较模板""" @abc.abstractmethod - def apply(self, n_value, b_value, relative_err): + def apply(self, n_value, b_value, relative_err, err_msg): raise NotImplementedError @@ -200,9 +200,9 @@ class GetCosineSimilarity(TensorComparisonBasic): return round(float(result), 6) return result - def apply(self, n_value, b_value, relative_err): - if not n_value.shape: - return CompareConst.UNSUPPORTED, "" + def apply(self, n_value, b_value, relative_err, err_msg): + if "This is type of 0-d tensor" in n_value: + return CompareConst.UNSUPPORTED, err_msg with np.errstate(divide="ignore", invalid="ignore"): if len(n_value) == 1: @@ -229,7 +229,7 @@ class GetCosineSimilarity(TensorComparisonBasic): class GetEuclideanDistance(TensorComparisonBasic): """计算欧式距离""" - def apply(self, n_value, b_value, relative_err): + def apply(self, n_value, b_value, relative_err, err_msg): msg = '' distance = np.linalg.norm(n_value - b_value, ord=2) @@ -240,7 +240,7 @@ class GetEuclideanDistance(TensorComparisonBasic): class GetMaxAbsErr(TensorComparisonBasic): """计算最大绝对误差""" - def apply(self, n_value, b_value, relative_err): + def apply(self, n_value, b_value, relative_err, err_msg): temp_res = n_value - b_value max_value = np.max(np.abs(temp_res)) if np.isnan(max_value): @@ -252,7 +252,7 @@ class GetMaxAbsErr(TensorComparisonBasic): class GetMaxRelativeErr(TensorComparisonBasic): """计算最大相对误差""" - def apply(self, n_value, b_value, relative_err): + def apply(self, n_value, b_value, relative_err, err_msg): max_relative_err = np.max(np.abs(relative_err)) if np.isnan(max_relative_err): msg = "Cannot compare by MaxRelativeError, the data contains nan/inf/-inf in dump data." @@ -266,9 +266,9 @@ class GetErrRatio(TensorComparisonBasic): def __init__(self, threshold): self.threshold = threshold - def apply(self, n_value, b_value, relative_err): - if not n_value.shape: - return CompareConst.UNSUPPORTED, "" + def apply(self, n_value, b_value, relative_err, err_msg): + if "This is type of 0-d tensor" in n_value: + return CompareConst.UNSUPPORTED, err_msg if not np.size(relative_err): return CompareConst.NAN, "" @@ -312,7 +312,7 @@ def compare_ops_apply(n_value, b_value, error_flag, err_msg): n_value, b_value = reshape_value(n_value, b_value) for op in CompareOps.compare_ops.values(): - result, msg = op.apply(n_value, b_value, relative_err) + result, msg = op.apply(n_value, b_value, relative_err, err_msg) result_list.append(result) err_msg += msg return result_list, err_msg -- Gitee From fc665daa328e8ef29655485a63a53632b64994d0 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Tue, 25 Feb 2025 14:23:44 +0800 Subject: [PATCH 10/25] compare add euclidean distance --- debug/accuracy_tools/msprobe/core/compare/npy_compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py index cfd0db3f1fe..6728b8b35f4 100644 --- a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py @@ -201,7 +201,7 @@ class GetCosineSimilarity(TensorComparisonBasic): return result def apply(self, n_value, b_value, relative_err, err_msg): - if "This is type of 0-d tensor" in n_value: + if "This is type of 0-d tensor" in err_msg: return CompareConst.UNSUPPORTED, err_msg with np.errstate(divide="ignore", invalid="ignore"): @@ -267,7 +267,7 @@ class GetErrRatio(TensorComparisonBasic): self.threshold = threshold def apply(self, n_value, b_value, relative_err, err_msg): - if "This is type of 0-d tensor" in n_value: + if "This is type of 0-d tensor" in err_msg: return CompareConst.UNSUPPORTED, err_msg if not np.size(relative_err): -- Gitee From 23c9871ea5c66595fdcb81c9e228416bb7a3d12b Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Tue, 25 Feb 2025 14:25:27 +0800 Subject: [PATCH 11/25] compare add euclidean distance --- debug/accuracy_tools/msprobe/core/compare/npy_compare.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py index 6728b8b35f4..68a2e9bea28 100644 --- a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py @@ -230,11 +230,12 @@ class GetEuclideanDistance(TensorComparisonBasic): """计算欧式距离""" def apply(self, n_value, b_value, relative_err, err_msg): - msg = '' + if "This is type of 0-d tensor" in err_msg: + return CompareConst.UNSUPPORTED, err_msg distance = np.linalg.norm(n_value - b_value, ord=2) - return distance, msg + return distance, "" class GetMaxAbsErr(TensorComparisonBasic): -- Gitee From f6ec2cddfbdfa8b0c9cb61791c2b0a430e9e5457 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Tue, 25 Feb 2025 15:13:56 +0800 Subject: [PATCH 12/25] compare add euclidean distance --- .../msprobe/core/compare/npy_compare.py | 2 +- .../compare/test_acc_compare_npy_compare.py | 69 +++++++++++++------ 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py index 68a2e9bea28..4103d361fec 100644 --- a/debug/accuracy_tools/msprobe/core/compare/npy_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/npy_compare.py @@ -70,7 +70,7 @@ def get_error_flag_and_msg(n_value, b_value, error_flag=False, error_file=None): error_flag = True return CompareConst.NONE, CompareConst.NONE, error_flag, err_msg if not n_value.shape: # 判断数据是否为0维张量 - err_msg = (f"This is type of 0-d tensor, can not calculate '{CompareConst.COSINE}', {CompareConst.EUC_DIST}" + err_msg = (f"This is type of 0-d tensor, can not calculate '{CompareConst.COSINE}', '{CompareConst.EUC_DIST}', " f"'{CompareConst.ONE_THOUSANDTH_ERR_RATIO}' and '{CompareConst.FIVE_THOUSANDTHS_ERR_RATIO}'. ") error_flag = False # 0-d tensor 最大绝对误差、最大相对误差仍然支持计算,因此error_flag设置为False,不做统一处理 return n_value, b_value, error_flag, err_msg diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py index c9096694a77..da315b657c8 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_npy_compare.py @@ -113,7 +113,7 @@ class TestUtilsMethods(unittest.TestCase): n_value, b_value, error_flag, err_msg = get_error_flag_and_msg(n_value, b_value, error_flag=error_flag) self.assertFalse(error_flag) - self.assertEqual(err_msg, "This is type of 0-d tensor, can not calculate 'Cosine', " + self.assertEqual(err_msg, "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', " "'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'. ") def test_get_error_flag_and_msg_shape_unmatch(self): @@ -239,15 +239,17 @@ class TestUtilsMethods(unittest.TestCase): b_value_1 = np.array(1) relative_err = get_relative_err(n_value_1, b_value_1) n_value_1, b_value_1 = reshape_value(n_value_1, b_value_1) - result, err_msg = op.apply(n_value_1, b_value_1, relative_err) + err_msg = "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'. " + result, err_msg = op.apply(n_value_1, b_value_1, relative_err, err_msg) self.assertEqual(result, CompareConst.UNSUPPORTED) - self.assertEqual(err_msg, "") + self.assertEqual(err_msg, "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'. ") n_value_2 = np.array([1, 2]) b_value_2 = np.array([1, 2]) relative_err = get_relative_err(n_value_2, b_value_2) n_value_2, b_value_2 = reshape_value(n_value_2, b_value_2) - result, err_msg = op.apply(n_value_2, b_value_2, relative_err) + err_msg = "" + result, err_msg = op.apply(n_value_2, b_value_2, relative_err, err_msg) self.assertEqual(result, 1.0) self.assertEqual(err_msg, "") @@ -255,7 +257,8 @@ class TestUtilsMethods(unittest.TestCase): b_value_3 = np.array([0, 0]) relative_err = get_relative_err(n_value_3, b_value_3) n_value_3, b_value_3 = reshape_value(n_value_3, b_value_3) - result, err_msg = op.apply(n_value_3, b_value_3, relative_err) + err_msg = "" + result, err_msg = op.apply(n_value_3, b_value_3, relative_err, err_msg) self.assertEqual(result, 1.0) self.assertEqual(err_msg, "") @@ -263,7 +266,8 @@ class TestUtilsMethods(unittest.TestCase): b_value_4 = np.array([1, 2]) relative_err = get_relative_err(n_value_4, b_value_4) n_value_4, b_value_4 = reshape_value(n_value_4, b_value_4) - result, err_msg = op.apply(n_value_4, b_value_4, relative_err) + err_msg = "" + result, err_msg = op.apply(n_value_4, b_value_4, relative_err, err_msg) self.assertEqual(result, CompareConst.NAN) self.assertEqual(err_msg, 'Cannot compare by Cosine Similarity, All the data is Zero in npu dump data.') @@ -271,7 +275,8 @@ class TestUtilsMethods(unittest.TestCase): b_value_5 = np.array([0, 0]) relative_err = get_relative_err(n_value_5, b_value_5) n_value_5, b_value_5 = reshape_value(n_value_5, b_value_5) - result, err_msg = op.apply(n_value_5, b_value_5, relative_err) + err_msg = "" + result, err_msg = op.apply(n_value_5, b_value_5, relative_err, err_msg) self.assertEqual(result, CompareConst.NAN) self.assertEqual(err_msg, 'Cannot compare by Cosine Similarity, All the data is Zero in Bench dump data.') @@ -282,7 +287,9 @@ class TestUtilsMethods(unittest.TestCase): b_value_1 = np.array([1]) relative_err = get_relative_err(n_value_1, b_value_1) n_value_1, b_value_1 = reshape_value(n_value_1, b_value_1) - result, err_msg = op.apply(n_value_1, b_value_1, relative_err) + err_msg = "" + + result, err_msg = op.apply(n_value_1, b_value_1, relative_err, err_msg) self.assertEqual(result, CompareConst.UNSUPPORTED) self.assertEqual(err_msg, "This is a 1-d tensor of length 1.") @@ -294,8 +301,9 @@ class TestUtilsMethods(unittest.TestCase): b_value = np.array([1, 1]) relative_err = get_relative_err(n_value, b_value) n_value, b_value = reshape_value(n_value, b_value) + err_msg = "" - result, err_msg = op.apply(n_value, b_value, relative_err) + result, err_msg = op.apply(n_value, b_value, relative_err, err_msg) self.assertEqual(result, CompareConst.NAN) self.assertEqual(err_msg, "Cannot compare by Cosine Similarity, the dump data has NaN.") @@ -319,8 +327,9 @@ class TestUtilsMethods(unittest.TestCase): b_value = np.array([0, 0]) relative_err = get_relative_err(n_value, b_value) n_value, b_value = reshape_value(n_value, b_value) + err_msg = "" - result, err_msg = op.apply(n_value, b_value, relative_err) + result, err_msg = op.apply(n_value, b_value, relative_err, err_msg) self.assertEqual(result, 2.0) self.assertEqual(err_msg, "") @@ -333,8 +342,9 @@ class TestUtilsMethods(unittest.TestCase): b_value = np.array([1, 1]) relative_err = get_relative_err(n_value, b_value) n_value, b_value = reshape_value(n_value, b_value) + err_msg = "" - result, err_msg = op.apply(n_value, b_value, relative_err) + result, err_msg = op.apply(n_value, b_value, relative_err, err_msg) self.assertEqual(result, CompareConst.NAN) self.assertEqual(err_msg, "Cannot compare by MaxAbsError, the data contains nan/inf/-inf in dump data.") @@ -347,8 +357,9 @@ class TestUtilsMethods(unittest.TestCase): b_value = np.array([1, 1]) relative_err = get_relative_err(n_value, b_value) n_value, b_value = reshape_value(n_value, b_value) + err_msg = "" - result, err_msg = op.apply(n_value, b_value, relative_err) + result, err_msg = op.apply(n_value, b_value, relative_err, err_msg) self.assertEqual(result, 1.0) self.assertEqual(err_msg, "") @@ -361,8 +372,9 @@ class TestUtilsMethods(unittest.TestCase): b_value = np.array([1, 1]) relative_err = get_relative_err(n_value, b_value) n_value, b_value = reshape_value(n_value, b_value) + err_msg = "" - result, err_msg = op.apply(n_value, b_value, relative_err) + result, err_msg = op.apply(n_value, b_value, relative_err, err_msg) self.assertEqual(result, CompareConst.NAN) self.assertEqual(err_msg, "Cannot compare by MaxRelativeError, the data contains nan/inf/-inf in dump data.") @@ -375,8 +387,9 @@ class TestUtilsMethods(unittest.TestCase): b_value = np.array([1, 1]) relative_err = get_relative_err(n_value, b_value) n_value, b_value = reshape_value(n_value, b_value) + err_msg = "" - result, err_msg = op.apply(n_value, b_value, relative_err) + result, err_msg = op.apply(n_value, b_value, relative_err, err_msg) self.assertEqual(result, 0.5) self.assertEqual(err_msg, "") @@ -387,11 +400,12 @@ class TestUtilsMethods(unittest.TestCase): n_value = np.array(1) # 标量 b_value = np.array(1) relative_err = np.array(0) + err_msg = "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'. " - result, err_msg = op.apply(n_value, b_value, relative_err) + result, err_msg = op.apply(n_value, b_value, relative_err, err_msg) self.assertEqual(result, CompareConst.UNSUPPORTED) - self.assertEqual(err_msg, "") + self.assertEqual(err_msg, "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'. ") def test_GetThousandErrRatio_not_size(self): op = GetErrRatio(CompareConst.THOUSAND_RATIO_THRESHOLD) @@ -399,8 +413,9 @@ class TestUtilsMethods(unittest.TestCase): n_value = np.array([1, 2]) b_value = np.array([1, 2]) relative_err = np.array([]) # 空数组 + err_msg = "" - result, err_msg = op.apply(n_value, b_value, relative_err) + result, err_msg = op.apply(n_value, b_value, relative_err, err_msg) self.assertEqual(result, CompareConst.NAN) self.assertEqual(err_msg, "") @@ -412,8 +427,9 @@ class TestUtilsMethods(unittest.TestCase): b_value = np.array([1, 1]) relative_err = get_relative_err(n_value, b_value) n_value, b_value = reshape_value(n_value, b_value) + err_msg = "" - result, err_msg = op.apply(n_value, b_value, relative_err) + result, err_msg = op.apply(n_value, b_value, relative_err, err_msg) self.assertEqual(result, 0.5) self.assertEqual(err_msg, "") @@ -480,12 +496,25 @@ class TestGetEuclideanDistance(unittest.TestCase): def setUp(self): self.euc_distance = GetEuclideanDistance() - def test_euclidean_distance(self): + def test_euclidean_distance_normal(self): # 测试计算两个张量之间的欧式距离 n_value = np.array([1, 2, 3]) b_value = np.array([4, 5, 6]) + relative_err = None + err_msg = "" - result, msg = self.euc_distance.apply(n_value, b_value, None) + result, msg = self.euc_distance.apply(n_value, b_value, relative_err, err_msg) expected_distance = np.linalg.norm(n_value - b_value) self.assertEqual(result, expected_distance) self.assertEqual(msg, '') + + def test_euclidean_distance_0d_tensor(self): + # 测试计算两个张量之间的欧式距离 + n_value = np.array(1) + b_value = np.array(1) + relative_err = None + err_msg = "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'. " + + result, msg = self.euc_distance.apply(n_value, b_value, relative_err, err_msg) + self.assertEqual(result, CompareConst.UNSUPPORTED) + self.assertEqual(msg, "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'. ") -- Gitee From 1fc646f16ad906f08dab3a6d12501a004974e101 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Wed, 26 Feb 2025 10:05:25 +0800 Subject: [PATCH 13/25] compare add euclidean distance --- .../docs/10.accuracy_compare_PyTorch.md | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md index e98478de0e1..e1d521d4472 100644 --- a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md @@ -257,11 +257,11 @@ PyTorch 精度比对是以 CPU 或 GPU 的计算结果为标杆,通过计算 统计量有 4 种:最大值(max)、最小值(min)、平均值(mean)和 L2-范数(L2 norm)。 -|dump 数据模式|Cosine (tensor 余弦相似度)|MaxAbsErr (tensor 最大绝对误差)|MaxRelativeErr (tensor 最大相对误差)|One Thousandth Err Ratio (tensor 相对误差小于千分之一的比例)|Five Thousandth Err Ratio (tensor 相对误差小于千分之五的比例)|NPU 和 bench 的统计量绝对误差 (max, min, mean, L2 norm) diff| NPU 和 bench 的统计量相对误差 (max, min, mean, L2 norm) RelativeErr |NPU 和 bench 的统计量 (max, min, mean, L2 norm)|NPU MD5 (NPU 数据 CRC-32 值)|BENCH MD5 (bench 数据 CRC-32 值)|Result (比对结果)|Accuracy Reached or Not (计算精度是否达标)|Err_message (错误信息提示)|NPU_Stack_Info (堆栈信息)|Data_Name (NPU 真实数据名)| -|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| -|真实数据模式|√|√|√|√|√|||√||||√|√|√|√| -|统计数据模式||||||√|√|√|||√||√|√|| -|MD5 模式|||||||||√|√|√|||√|| +|dump 数据模式|Cosine (tensor 余弦相似度)|EucDist (tensor 欧式距离)|MaxAbsErr (tensor 最大绝对误差)|MaxRelativeErr (tensor 最大相对误差)|One Thousandth Err Ratio (tensor 相对误差小于千分之一的比例)|Five Thousandth Err Ratio (tensor 相对误差小于千分之五的比例)|NPU 和 bench 的统计量绝对误差 (max, min, mean, L2 norm) diff| NPU 和 bench 的统计量相对误差 (max, min, mean, L2 norm) RelativeErr |NPU 和 bench 的统计量 (max, min, mean, L2 norm)|NPU MD5 (NPU 数据 CRC-32 值)|BENCH MD5 (bench 数据 CRC-32 值)|Result (比对结果)|Accuracy Reached or Not (计算精度是否达标)|Err_message (错误信息提示)|NPU_Stack_Info (堆栈信息)|Data_Name (NPU 真实数据名)| +|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| +|真实数据模式|√|√|√|√|√|√|||√||||√|√|√|√| +|统计数据模式|||||||√|√|√|||√||√|√|| +|MD5 模式||||||||||√|√|√|||√|| 上表中NPU_Stack_Info字段需要配置-s参数生成。 @@ -320,7 +320,7 @@ MD5 模式: 5. "This is empty data, can not compare.":读取到的数据为空(真实数据模式); 6. "Shape of NPU and bench Tensor do not match. Skipped.":NPU 和 Bench 的数据结构不一致(真实数据模式); 7. "The Position of inf or nan in NPU and bench Tensor do not match.":NPU 和 Bench 的数据有 nan/inf(真实数据模式); -8. "This is type of 0-d tensor, can not calculate 'Cosine', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式); +8. "This is type of 0-d tensor, can not calculate 'Cosine', 'EucDist', 'One Thousandth Err Ratio' and 'Five Thousandths Err Ratio'.":NPU 为0维张量(真实数据模式); 9. "Dtype of NPU and bench Tensor do not match.":NPU 和 Bench 数据的数据类型不同(真实数据模式); 10. "":除以上情况的其余情况(真实数据模式、统计数据模式)。 @@ -330,13 +330,15 @@ MD5 模式: 1. Cosine:通过计算两个向量的余弦值来判断其相似度,数值越接近于 1 说明计算出的两个张量越相似,实际可接受阈值为大于 0.99。在计算中可能会存在 nan,主要由于可能会出现其中一个向量为 0。 -2. MaxAbsErr:当最大绝对误差越接近 0 表示其计算的误差越小,实际可接受阈值为小于 0.001。 +2. EucDist:通过计算两个向量的欧式距离来判断其相似度,定义为多维空间中两个点之间的绝对距离。数值越接近0,张量越相似,数值越大,差异越大。 -3. MaxRelativeErr:当最大相对误差越接近 0 表示其计算的误差越小。 +3. MaxAbsErr:当最大绝对误差越接近 0 表示其计算的误差越小,实际可接受阈值为小于 0.001。 + +4. MaxRelativeErr:当最大相对误差越接近 0 表示其计算的误差越小。 当 dump 数据中存在 0 或 Nan 时,比对结果中最大相对误差则出现 inf 或 Nan 的情况,属于正常现象。 -4. One Thousandth Err Ratio(相对误差小于千分之一的元素比例)、Five Thousandths Err Ratio(相对误差小于千分之五的元素比例)精度指标:是指 NPU 的 Tensor 中的元素逐个与对应的标杆数据对比,相对误差小于千分之一、千分之五的比例占总元素个数的比例。该数据仅作为精度下降趋势的参考,并不参与计算精度是否通过的判定。 +5. One Thousandth Err Ratio(相对误差小于千分之一的元素比例)、Five Thousandths Err Ratio(相对误差小于千分之五的元素比例)精度指标:是指 NPU 的 Tensor 中的元素逐个与对应的标杆数据对比,相对误差小于千分之一、千分之五的比例占总元素个数的比例。该数据仅作为精度下降趋势的参考,并不参与计算精度是否通过的判定。 ## 4 多卡比对结果提取汇总通信算子数据 -- Gitee From a7137d267ad81c72a35fea1f475290038253f9ba Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Wed, 26 Feb 2025 16:26:49 +0800 Subject: [PATCH 14/25] bug_fix_for_matrix_rank --- .../cluster_analyse/analysis/comm_matrix_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py b/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py index a87803438ae..2ad5797cc92 100644 --- a/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py +++ b/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py @@ -100,7 +100,6 @@ class CommMatrixAnalysis(BaseAnalysis): tmp_link[f"{src_rank}-{dst_rank}"] = link_dict return tmp_link - project_local_global_rank_map = dict() default_value = { Constant.TRANSPORT_TYPE: '', Constant.TRANSIT_TIME_MS: 0, @@ -109,6 +108,7 @@ class CommMatrixAnalysis(BaseAnalysis): } for op_name, op_dict in step_dict.items(): link_info = defaultdict(lambda: copy.deepcopy(default_value)) + project_local_global_rank_map = dict() for rank_id, rank_dict in op_dict.items(): process_link_key(rank_id, rank_dict) step_dict[op_name] = convert_local_to_global_rank() -- Gitee From c857b5c0ee82bb3c3656b05b3bcac5786d70935c Mon Sep 17 00:00:00 2001 From: lcw Date: Tue, 25 Feb 2025 19:18:33 +0800 Subject: [PATCH 15/25] =?UTF-8?q?=E3=80=90Bugfix=E3=80=91=E8=B5=84?= =?UTF-8?q?=E6=96=99=E4=BF=AE=E6=94=B9=EF=BC=8C=E9=99=8D=E4=BD=8E=E6=BA=A2?= =?UTF-8?q?=E5=87=BA=E6=A3=80=E6=B5=8B=E5=8A=9F=E8=83=BD=E7=9A=84=E4=BC=98?= =?UTF-8?q?=E5=85=88=E7=BA=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/msprobe/README.md | 38 +++++++++++++------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/debug/accuracy_tools/msprobe/README.md b/debug/accuracy_tools/msprobe/README.md index 0e68d1f8d9b..e31490f01e9 100644 --- a/debug/accuracy_tools/msprobe/README.md +++ b/debug/accuracy_tools/msprobe/README.md @@ -83,21 +83,21 @@ PyTorch 场景的[离线预检](./docs/07.accuracy_checker_PyTorch.md)和[在线 MindSpore 动态图场景的[离线预检](./docs/09.accuracy_checker_MindSpore.md) -### 3 精度比对 +### 3 分级可视化构图比对 -该功能进行 PyTorch 整网 API 粒度的数据 dump、精度比对,进而定位训练场景下的精度问题。 +该功能将msprobe工具dump的精度数据进行解析,还原模型图结构,实现模型各个层级的精度数据比对,方便用户理解模型结构、分析精度问题。 -[PyTorch 场景的精度比对](./docs/10.accuracy_compare_PyTorch.md) +[PyTorch 场景的分级可视化构图比对](./docs/21.visualization_PyTorch.md) -[MindSpore 场景的精度比对](./docs/11.accuracy_compare_MindSpore.md) +[MindSpore 场景的分级可视化构图比对](./docs/22.visualization_MindSpore.md) -### 4 溢出检测与解析 +### 4 精度比对 -溢出检测与解析是在执行精度数据 dump 时,判断是否存在输入正常但输出存在溢出的 API,从而判断是否为正常溢出。对应 config.json 中的 overflow_check。 +该功能进行 PyTorch 整网 API 粒度的数据 dump、精度比对,进而定位训练场景下的精度问题。 -[PyTorch 场景的溢出检测与解析](./docs/12.overflow_check_PyTorch.md) +[PyTorch 场景的精度比对](./docs/10.accuracy_compare_PyTorch.md) -[MindSpore 场景的溢出检测与解析](./docs/13.overflow_check_MindSpore.md) +[MindSpore 场景的精度比对](./docs/11.accuracy_compare_MindSpore.md) ### 5 数据解析 @@ -129,27 +129,27 @@ MindSpore 动态图场景的[离线预检](./docs/09.accuracy_checker_MindSpore. [兼容 PyTorch 和 MindSpore 框架的训练状态监控](./docs/19.monitor.md) -### 10 分级可视化构图比对 - -该功能将msprobe工具dump的精度数据进行解析,还原模型图结构,实现模型各个层级的精度数据比对,方便用户理解模型结构、分析精度问题。 - -[PyTorch 场景的分级可视化构图比对](./docs/21.visualization_PyTorch.md) - -[MindSpore 场景的分级可视化构图比对](./docs/22.visualization_MindSpore.md) - - -### 11 单算子API自动生成脚本 +### 10 单算子API自动生成脚本 该功能将msprobe工具dump的精度数据进行解析,自动生成单API脚本,用于复现整网中出现的算子问题,降低用户复现问题的成本,供开发分析算子问题。 [PyTorch 单算子API自动生成脚本](./docs/23.generate_operator_PyTorch.md) -### 12 数码关联 +### 11 数码关联 该功能只支持 MindSpore 静态图场景,用于将IR图与dump数据进行关联,获取dump数据和代码调用栈的关联关系。 [MindSpore 场景的数码关联](./docs/24.code_mapping_Mindspore.md) +### 12 溢出检测与解析 + +溢出检测与解析是在执行精度数据 dump 时,判断是否存在输入正常但输出存在溢出的 API,从而判断是否为正常溢出。对应 config.json 中的 overflow_check。 +推荐直接使用[数据采集](#1-数据采集)功能采集统计量信息检测溢出问题。 + +[PyTorch 场景的溢出检测与解析](./docs/12.overflow_check_PyTorch.md) + +[MindSpore 场景的溢出检测与解析](./docs/13.overflow_check_MindSpore.md) + ## 📑 补充材料 [无标杆比对功能在 PyTorch 场景的性能基线报告](./docs/S02.report_free_benchmarking_validation_performance_baseline.md) -- Gitee From 44c10ee85be3856cb3d99971f9948746435747b4 Mon Sep 17 00:00:00 2001 From: jiangchao_j Date: Wed, 26 Feb 2025 17:47:06 +0800 Subject: [PATCH 16/25] fix l2norm bug in acl dump --- .../ccsrc/core/AclDumpDataProcessor.cpp | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/debug/accuracy_tools/msprobe/ccsrc/core/AclDumpDataProcessor.cpp b/debug/accuracy_tools/msprobe/ccsrc/core/AclDumpDataProcessor.cpp index 0fe3443fa1f..72178d6486a 100644 --- a/debug/accuracy_tools/msprobe/ccsrc/core/AclDumpDataProcessor.cpp +++ b/debug/accuracy_tools/msprobe/ccsrc/core/AclDumpDataProcessor.cpp @@ -56,23 +56,25 @@ constexpr const char* kStatsHeaderShape = "Shape"; constexpr const char* kStatsHeaderMax = "Max Value"; constexpr const char* kStatsHeaderMin = "Min Value"; constexpr const char* kStatsHeaderAvg = "Avg Value"; -constexpr const char* kStatsHeaderL2Norm = "L2 Norm Value"; +constexpr const char* kStatsHeaderL2Norm = "l2norm"; +constexpr const char* kStatsHeaderL2NormInCsv = "L2Norm Value"; constexpr const char* kStatsHeaderMD5 = "MD5 Value"; constexpr const char* kStatsHeaderNan = "Nan Count"; +constexpr const char* kStatsHeaderNanInCsv = "NaN Count"; constexpr const char* kStatsHeaderNegInf = "Negative Inf Count"; constexpr const char* kStatsHeaderPosInf = "Positive Inf Count"; constexpr const char* kRankId = "RANK_ID"; constexpr const char* kDigitalNumbers = "0123456789"; -static const std::map summaryOptionHeaderStrMap = { - {DebuggerSummaryOption::MAX, kStatsHeaderMax}, - {DebuggerSummaryOption::MIN, kStatsHeaderMin}, - {DebuggerSummaryOption::MEAN, kStatsHeaderAvg}, - {DebuggerSummaryOption::L2NORM, kStatsHeaderL2Norm}, - {DebuggerSummaryOption::NAN_CNT, kStatsHeaderNan}, - {DebuggerSummaryOption::NEG_INF_CNT, kStatsHeaderNegInf}, - {DebuggerSummaryOption::POS_INF_CNT, kStatsHeaderPosInf}, - {DebuggerSummaryOption::MD5, kStatsHeaderMD5}, +static const std::map> summaryOptionHeaderStrMap = { + {DebuggerSummaryOption::MAX, {kStatsHeaderMax, kStatsHeaderMax}}, + {DebuggerSummaryOption::MIN, {kStatsHeaderMin, kStatsHeaderMin}}, + {DebuggerSummaryOption::MEAN, {kStatsHeaderAvg, kStatsHeaderAvg}}, + {DebuggerSummaryOption::L2NORM, {kStatsHeaderL2Norm, kStatsHeaderL2NormInCsv}}, + {DebuggerSummaryOption::NAN_CNT, {kStatsHeaderNan, kStatsHeaderNanInCsv}}, + {DebuggerSummaryOption::NEG_INF_CNT, {kStatsHeaderNegInf, kStatsHeaderNegInf}}, + {DebuggerSummaryOption::POS_INF_CNT, {kStatsHeaderPosInf, kStatsHeaderPosInf}}, + {DebuggerSummaryOption::MD5, {kStatsHeaderMD5, kStatsHeaderMD5}}, }; class AclTensorStats { @@ -170,7 +172,7 @@ static std::map ParseTensorSummaryHeaderOrder(c for (uint32_t pos = 0; pos < segs.size(); ++pos) { const std::string& opt = segs[pos]; for (auto it = summaryOptionHeaderStrMap.begin(); it != summaryOptionHeaderStrMap.end(); ++it) { - if (opt == it->second) { + if (opt == it->second.first) { ret[pos] = it->first; break; } @@ -233,7 +235,7 @@ std::string AclTensorStats::GetCsvHeader() const ret.append("Op Type,Op Name,Task ID,Stream ID,Timestamp,Input/Output,Slot,Data Size,Data Type,Format,Shape"); for (auto it = stats.begin(); it != stats.end(); it++) { ret.append(","); - ret.append(summaryOptionHeaderStrMap.at(it->first)); + ret.append(summaryOptionHeaderStrMap.at(it->first).second); } ret.append("\n"); -- Gitee From 604447ed375723ced665bce776015eb342461fcc Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 15:11:25 +0800 Subject: [PATCH 17/25] compare add euclidean distance --- debug/accuracy_tools/msprobe/visualization/utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/msprobe/visualization/utils.py b/debug/accuracy_tools/msprobe/visualization/utils.py index 20a881e2cdb..acfc5b59124 100644 --- a/debug/accuracy_tools/msprobe/visualization/utils.py +++ b/debug/accuracy_tools/msprobe/visualization/utils.py @@ -181,11 +181,8 @@ class GraphConst: STR_MAX_LEN = 50 SMALL_VALUE = 1e-3 MD5_INDEX_LIST = [CompareConst.RESULT] - REAL_DATA_INDEX_LIST = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, - CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] - SUMMARY_INDEX_LIST = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF, - CompareConst.NORM_DIFF, CompareConst.MAX_RELATIVE_ERR, CompareConst.MIN_RELATIVE_ERR, - CompareConst.MEAN_RELATIVE_ERR, CompareConst.NORM_RELATIVE_ERR] + REAL_DATA_INDEX_LIST = CompareConst.ALL_COMPARE_INDEX + SUMMARY_INDEX_LIST = CompareConst.SUMMARY_COMPARE_INDEX VALUE_INDEX_LIST = [Const.MAX, Const.MIN, Const.MEAN, Const.NORM] APIS_BETWEEN_MODULES = 'Apis_Between_Modules' NULL = 'null' -- Gitee From eeec5f495efa2c011baca0b79fae223bd25bfc91 Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Thu, 27 Feb 2025 15:37:18 +0800 Subject: [PATCH 18/25] cluster_adapt_msprof_text --- .../msprof_step_trace_time_adapter.py | 56 ++++++++++ .../analysis/step_trace_time_analysis.py | 34 +++++- .../cluster_analyse/cluster_analysis.py | 70 ++++++++---- .../msprof_data_preprocessor.py | 98 +++++++++++++++++ .../base_communication_group.py | 6 +- .../communication_json_group.py | 14 ++- .../msprof_communication_matrix_adapter.py | 102 ++++++++++++++++++ .../msprof_communication_time_adapter.py | 38 +++++++ .../msprof_analyze/prof_common/constant.py | 2 + profiler/msprof_analyze/prof_common/utils.py | 7 ++ 10 files changed, 396 insertions(+), 31 deletions(-) create mode 100644 profiler/msprof_analyze/cluster_analyse/analysis/msprof_step_trace_time_adapter.py create mode 100644 profiler/msprof_analyze/cluster_analyse/cluster_data_preprocess/msprof_data_preprocessor.py create mode 100644 profiler/msprof_analyze/cluster_analyse/communication_group/msprof_communication_matrix_adapter.py create mode 100644 profiler/msprof_analyze/cluster_analyse/communication_group/msprof_communication_time_adapter.py diff --git a/profiler/msprof_analyze/cluster_analyse/analysis/msprof_step_trace_time_adapter.py b/profiler/msprof_analyze/cluster_analyse/analysis/msprof_step_trace_time_adapter.py new file mode 100644 index 00000000000..5c34a0fb421 --- /dev/null +++ b/profiler/msprof_analyze/cluster_analyse/analysis/msprof_step_trace_time_adapter.py @@ -0,0 +1,56 @@ +# Copyright (c) 2025, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from msprof_analyze.cluster_analyse.prof_bean.step_trace_time_bean import StepTraceTimeBean +from msprof_analyze.prof_common.utils import convert_to_float +from msprof_analyze.prof_common.file_manager import FileManager + + +class MsprofStepTraceTimeAdapter: + COMPUTE = "Computing" + COMM_NOT_OVERLAP = "Communication(Not Overlapped)" + OVERLAPPED = "Overlapped" + COMMUNICATION = "Communication" + FREE = "Free" + STAGE = "Stage" + BUBBLE = "Bubble" + COMM_NOT_OVERLAP_EXCLUDE_RECEIVE = "Communication(Not Overlapped and Exclude Receive)" + PREPARE = "Preparing" + + def __init__(self, file_path_list): + self.file_path_list = file_path_list + self._data = {self.COMPUTE: 0, self.COMM_NOT_OVERLAP: 0, self.OVERLAPPED: 0, self.COMMUNICATION: 0, + self.FREE: 0, self.STAGE: 0, self.BUBBLE: 0, self.COMM_NOT_OVERLAP_EXCLUDE_RECEIVE: 0, + self.PREPARE: 0} + + def generate_step_trace_time_data(self): + json_str = [] + for file_path in self.file_path_list: + json_str.extend(FileManager.read_json_file(file_path)) + receive_comm = [] + analysis_data = {} + for data in json_str: + event_name = data.get("name", "") + if event_name in {self.COMMUNICATION, self.COMPUTE, self.FREE, self.COMM_NOT_OVERLAP}: + analysis_data.setdefault(event_name, []).append(data) + elif event_name.startswith('hcom_receive'): + receive_comm.append(data) + for event_type, event_list in analysis_data.items(): + self._data[event_type] = sum((convert_to_float(event.get("dur", 0)) for event in event_list)) + self._data[self.BUBBLE] = sum((convert_to_float(event.get("dur", 0)) for event in receive_comm)) + self._data[self.COMM_NOT_OVERLAP_EXCLUDE_RECEIVE] = self._data[self.COMM_NOT_OVERLAP] - self._data[self.BUBBLE] + self._data[self.OVERLAPPED] = self._data[self.COMMUNICATION] - self._data[self.COMM_NOT_OVERLAP] + e2e_time = self._data[self.FREE] + self._data[self.COMPUTE] + self._data[self.COMM_NOT_OVERLAP] + self._data[self.STAGE] = e2e_time - self._data[self.BUBBLE] + return [StepTraceTimeBean(self._data)] diff --git a/profiler/msprof_analyze/cluster_analyse/analysis/step_trace_time_analysis.py b/profiler/msprof_analyze/cluster_analyse/analysis/step_trace_time_analysis.py index 5168f63aef5..44675137922 100644 --- a/profiler/msprof_analyze/cluster_analyse/analysis/step_trace_time_analysis.py +++ b/profiler/msprof_analyze/cluster_analyse/analysis/step_trace_time_analysis.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +import re from msprof_analyze.prof_common.db_manager import DBManager from msprof_analyze.cluster_analyse.common_func.utils import increase_shared_value @@ -21,6 +22,7 @@ from msprof_analyze.cluster_analyse.prof_bean.step_trace_time_bean import StepTr from msprof_analyze.prof_common.constant import Constant from msprof_analyze.prof_common.file_manager import FileManager from msprof_analyze.prof_common.logger import get_logger +from msprof_analyze.cluster_analyse.analysis.msprof_step_trace_time_adapter import MsprofStepTraceTimeAdapter logger = get_logger() @@ -40,6 +42,7 @@ class StepTraceTimeAnalysis: self.step_data_list = [] self.data_type = param.get(Constant.DATA_TYPE) self.distributed_args = None + self.is_msprof = param.get(Constant.IS_MSPROF) @staticmethod def get_max_data_row(data_group_list: list): @@ -50,6 +53,26 @@ class StepTraceTimeAnalysis: ret.append(max(item)) return ret + @staticmethod + def find_msprof_json(path): + msprof_pattern = r'^msprof_\d{14}\.json$' + msprof_slice_pattern = r'^msprof_slice_\d{1}_\d{14}\.json$' + msprof_dict, msprof_slice_dict = {}, {} + for file_name in os.listdir(path): + if re.match(msprof_pattern, file_name): + timestamp = re.search(r"\d{14}", file_name).group() + msprof_dict.setdefault(timestamp, []).append(os.path.join(path, file_name)) + elif re.match(msprof_slice_pattern, file_name): + timestamp = re.search(r"\d{14}", file_name).group() + msprof_slice_dict.setdefault(timestamp, []).append(os.path.join(path, file_name)) + if msprof_dict: + max_timestamp = max(msprof_dict.keys()) + return msprof_dict.get(max_timestamp) + if msprof_slice_dict: + max_timestamp = max(msprof_slice_dict.keys()) + return msprof_slice_dict.get(max_timestamp) + return [] + def run(self, completed_processes, lock): self.load_step_trace_time_data() self.analyze_step_time() @@ -132,9 +155,14 @@ class StepTraceTimeAnalysis: metadata = FileManager.read_json_file(metadata_path) self.distributed_args = metadata.get(Constant.DISTRIBUTED_ARGS, None) if metadata else None if self.data_type == Constant.TEXT: - step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV) - if os.path.exists(step_time_file): - self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean) + if self.is_msprof: + msprof_json = self.find_msprof_json(os.path.join(profiling_dir_path, "mindstudio_profiler_output")) + self.step_time_dict[rank_id] = MsprofStepTraceTimeAdapter( + msprof_json).generate_step_trace_time_data() + else: + step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV) + if os.path.exists(step_time_file): + self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean) else: step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.DB_COMMUNICATION_ANALYZER) diff --git a/profiler/msprof_analyze/cluster_analyse/cluster_analysis.py b/profiler/msprof_analyze/cluster_analyse/cluster_analysis.py index d7d71908506..1e90d0cad11 100644 --- a/profiler/msprof_analyze/cluster_analyse/cluster_analysis.py +++ b/profiler/msprof_analyze/cluster_analyse/cluster_analysis.py @@ -21,6 +21,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath( from msprof_analyze.cluster_analyse.analysis.analysis_facade import AnalysisFacade from msprof_analyze.cluster_analyse.cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor from msprof_analyze.cluster_analyse.cluster_data_preprocess.mindspore_data_preprocessor import MindsporeDataPreprocessor +from msprof_analyze.cluster_analyse.cluster_data_preprocess.msprof_data_preprocessor import MsprofDataPreprocessor from msprof_analyze.cluster_analyse.communication_group.communication_group_generator import CommunicationGroupGenerator from msprof_analyze.prof_common.additional_args_manager import AdditionalArgsManager from msprof_analyze.prof_common.constant import Constant @@ -47,6 +48,7 @@ ALL_FEATURE_LIST = COMM_FEATURE_LIST + get_all_recipes() class Interface: ASCEND_PT = "ascend_pt" ASCEND_MS = "ascend_ms" + PROF = "PROF_" def __init__(self, params: dict): self.collection_path = PathManager.get_realpath(params.get(Constant.PROFILING_PATH)) @@ -70,27 +72,38 @@ class Interface: def allocate_prof_data(self): ascend_pt_dirs = [] ascend_ms_dirs = [] + prof_dirs = [] for root, dirs, _ in os.walk(self.collection_path): for dir_name in dirs: if dir_name.endswith(self.ASCEND_PT): ascend_pt_dirs.append(os.path.join(root, dir_name)) if dir_name.endswith(self.ASCEND_MS): ascend_ms_dirs.append(os.path.join(root, dir_name)) + if dir_name.startswith(self.PROF): + prof_dirs.append(os.path.join(root, dir_name)) pytorch_processor = PytorchDataPreprocessor(ascend_pt_dirs) pt_data_map = pytorch_processor.get_data_map() - data_type = pytorch_processor.get_data_type() + pt_data_type = pytorch_processor.get_data_type() ms_data_map = MindsporeDataPreprocessor(ascend_ms_dirs).get_data_map() if pt_data_map and ms_data_map: logger.error("Can not analyze pytorch and mindspore meantime.") - return [] - return (pt_data_map, data_type) if pt_data_map else (ms_data_map, Constant.TEXT) + return {} + if pt_data_map: + return {Constant.DATA_MAP: pt_data_map, Constant.DATA_TYPE: pt_data_type, Constant.IS_MSPROF: False} + if ms_data_map: + return {Constant.DATA_MAP: ms_data_map, Constant.DATA_TYPE: Constant.TEXT, Constant.IS_MSPROF: False} + msprof_processor = MsprofDataPreprocessor(prof_dirs) + prof_data_map = msprof_processor.get_data_map() + prof_data_type = msprof_processor.get_data_type() + return {Constant.DATA_MAP: prof_data_map, Constant.DATA_TYPE: prof_data_type, Constant.IS_MSPROF: True} def run(self): PathManager.check_input_directory_path(self.collection_path) PathManager.check_input_directory_path(self.cluster_analysis_output_path) PathManager.check_path_owner_consistent([self.collection_path, self.cluster_analysis_output_path]) - data_map, data_type = self.allocate_prof_data() + data_dict = self.allocate_prof_data() + data_map, data_type = data_dict.get(Constant.DATA_MAP), data_dict.get(Constant.DATA_TYPE) if not data_map: logger.warning("Can not get rank info or profiling data.") return @@ -100,32 +113,43 @@ class Interface: params = { Constant.COLLECTION_PATH: self.collection_path, + Constant.ANALYSIS_MODE: self.analysis_mode, Constant.DATA_MAP: data_map, Constant.DATA_TYPE: data_type, - Constant.ANALYSIS_MODE: self.analysis_mode, + Constant.IS_MSPROF: data_dict.get(Constant.IS_MSPROF, False), Constant.CLUSTER_ANALYSIS_OUTPUT_PATH: self.cluster_analysis_output_path, Constant.DATA_SIMPLIFICATION: self.origin_params.get(Constant.DATA_SIMPLIFICATION, False), Constant.FORCE: self.force } - if self.analysis_mode in COMM_FEATURE_LIST: - FileManager.create_output_dir(self.cluster_analysis_output_path) - PathManager.check_path_writeable(self.cluster_analysis_output_path) - logger.info("Begin generate communication data.") - comm_data_dict = CommunicationGroupGenerator(params).generate() - logger.info("Communication data read completed.") - params[Constant.COMM_DATA_DICT] = comm_data_dict - AnalysisFacade(params).cluster_analyze() - logger.info("The cluster analysis result file has been generated: %s", - self.cluster_analysis_output_path) - return - - if data_type != Constant.DB: - logger.error("The current analysis node only supports DB as input data. Please check.") - return - FileManager.create_output_dir(self.cluster_analysis_output_path, is_overwrite=True) - self.origin_params.update(params) - AnalysisFacade(self.origin_params).recipe_analyze() + if data_type == Constant.TEXT: + if self.analysis_mode in COMM_FEATURE_LIST: + FileManager.create_output_dir(self.cluster_analysis_output_path) + PathManager.check_path_writeable(self.cluster_analysis_output_path) + logger.info("Begin generate communication data.") + comm_data_dict = CommunicationGroupGenerator(params).generate() + logger.info("Communication data read completed.") + params[Constant.COMM_DATA_DICT] = comm_data_dict + AnalysisFacade(params).cluster_analyze() + logger.info("The cluster analysis result file has been generated: %s", + self.cluster_analysis_output_path) + else: + logger.error("The current analysis node only supports DB as input data. Please check.") + else: + if self.analysis_mode in COMM_FEATURE_LIST: + FileManager.create_output_dir(self.cluster_analysis_output_path) + PathManager.check_path_writeable(self.cluster_analysis_output_path) + logger.info("Begin generate communication data.") + comm_data_dict = CommunicationGroupGenerator(params).generate() + logger.info("Communication data read completed.") + params[Constant.COMM_DATA_DICT] = comm_data_dict + AnalysisFacade(params).cluster_analyze() + logger.info("The cluster analysis result file has been generated: %s", + self.cluster_analysis_output_path) + else: + FileManager.create_output_dir(self.cluster_analysis_output_path, is_overwrite=True) + self.origin_params.update(params) + AnalysisFacade(self.origin_params).recipe_analyze() def cluster_analysis_main(): diff --git a/profiler/msprof_analyze/cluster_analyse/cluster_data_preprocess/msprof_data_preprocessor.py b/profiler/msprof_analyze/cluster_analyse/cluster_data_preprocess/msprof_data_preprocessor.py new file mode 100644 index 00000000000..04953b0edfc --- /dev/null +++ b/profiler/msprof_analyze/cluster_analyse/cluster_data_preprocess/msprof_data_preprocessor.py @@ -0,0 +1,98 @@ +# Copyright (c) 2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import re +from collections import defaultdict + +from msprof_analyze.cluster_analyse.cluster_data_preprocess.data_preprocessor import DataPreprocessor +from msprof_analyze.prof_common.constant import Constant +from msprof_analyze.prof_common.logger import get_logger +from msprof_analyze.prof_common.file_manager import FileManager + +logger = get_logger() + + +class MsprofDataPreprocessor(DataPreprocessor): + DEVICE_HEAD = "device_" + INFO_JSON_PATTERN = r"^info\.json\.\d{1,2}$" + DB_PATTERN = r"^msprof_\d{1,20}\.db$" + + def __init__(self, path_list: list): + super().__init__(path_list) + self.data_type = set() + + def get_data_map(self) -> dict: + prof_data_uid = defaultdict(list) + prof_data_rank = defaultdict(list) + for dir_name in self.path_list: + info_json_file = self._find_info_json_file(dir_name) + if not info_json_file: + logger.error(f"Profiling data in not completed, please check the info.json file in the path {dir_name}") + continue + + if self._check_db_type(dir_name): + self.data_type.add(Constant.DB) + elif os.path.exists(os.path.join(dir_name, "mindstudio_profiler_output")): + if os.path.exists(os.path.join(dir_name, "analyze")): + self.data_type.add(Constant.TEXT) + else: + logger.error(f"The profiling data has not been fully parsed. You can parse it by executing " + f"the following command: msprof --analyze=on --output={dir_name}") + continue + else: + logger.error(f"The profiling data has not been fully parsed. You can parse it by executing " + f"the following command: msprof --export=on --output={dir_name}; " + f"msprof --analyze=on --output={dir_name}") + continue + info_json = FileManager.read_json_file(info_json_file) + rank_id = info_json.get("rank_id") + if rank_id != Constant.INVALID_RETURN: + prof_data_rank[rank_id].append(dir_name) + continue + host_id = info_json.get("hostUid") + device_id = int(os.path.basename(info_json_file).split(".")[-1]) + prof_data_uid[(host_id, device_id)].append(dir_name) + + if prof_data_rank: + for rank_id, dir_list in prof_data_rank.items(): + dir_list.sort(key=lambda x: x.split('_')[-2]) + self.data_map[rank_id] = dir_list[0] + else: + ordered_keys = sorted(prof_data_uid.keys(), key=lambda x: (x[0], x[1])) + rank_id = 0 + for key in ordered_keys: + dir_list = prof_data_uid[key] + dir_list.sort(key=lambda x: x.split('_')[-2]) + self.data_map[rank_id] = dir_list[0] + rank_id += 1 + return self.data_map + + def get_data_type(self): + if len(self.data_type) == 1: + return self.data_type.pop() + return Constant.INVALID + + def _find_info_json_file(self, dir_name): + for file_name in os.listdir(dir_name): + for device_file in os.listdir(os.path.join(dir_name, file_name)): + if re.match(self.INFO_JSON_PATTERN, device_file): + return os.path.join(dir_name, file_name, device_file) + return None + + def _check_db_type(self, dir_name): + for file_name in os.listdir(dir_name): + if re.match(self.DB_PATTERN, file_name): + return True + return False diff --git a/profiler/msprof_analyze/cluster_analyse/communication_group/base_communication_group.py b/profiler/msprof_analyze/cluster_analyse/communication_group/base_communication_group.py index 2c02bfdbf1b..0df5214eb49 100644 --- a/profiler/msprof_analyze/cluster_analyse/communication_group/base_communication_group.py +++ b/profiler/msprof_analyze/cluster_analyse/communication_group/base_communication_group.py @@ -39,6 +39,7 @@ class BaseCommunicationGroup: self.data_map = params.get(Constant.DATA_MAP) self.data_type = params.get(Constant.DATA_TYPE) self.analysis_mode = params.get(Constant.ANALYSIS_MODE) + self.is_msprof = params.get(Constant.IS_MSPROF) self.rank_comm_dir_dict = {} self.p2p_link = [] self.collective_group_dict = defaultdict(set) @@ -54,8 +55,9 @@ class BaseCommunicationGroup: comm_op_dirs = [] for rank_id, profiling_dir_path in self.data_map.items(): if self.data_type == Constant.TEXT: - comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_JSON) - matrix_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_MATRIX_JSON) + output_dir = "analyze" if self.is_msprof else Constant.SINGLE_OUTPUT + comm_dir = os.path.join(profiling_dir_path, output_dir, Constant.COMM_JSON) + matrix_dir = os.path.join(profiling_dir_path, output_dir, Constant.COMM_MATRIX_JSON) else: comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.DB_COMMUNICATION_ANALYZER) matrix_dir = comm_dir diff --git a/profiler/msprof_analyze/cluster_analyse/communication_group/communication_json_group.py b/profiler/msprof_analyze/cluster_analyse/communication_group/communication_json_group.py index 2975050da07..e6fd3b41eea 100644 --- a/profiler/msprof_analyze/cluster_analyse/communication_group/communication_json_group.py +++ b/profiler/msprof_analyze/cluster_analyse/communication_group/communication_json_group.py @@ -15,9 +15,13 @@ import os from copy import deepcopy - + from msprof_analyze.cluster_analyse.communication_group.base_communication_group import BaseCommunicationGroup from msprof_analyze.prof_common.file_manager import FileManager +from msprof_analyze.cluster_analyse.communication_group.msprof_communication_matrix_adapter import \ + MsprofCommunicationMatrixAdapter +from msprof_analyze.cluster_analyse.communication_group.msprof_communication_time_adapter import \ + MsprofCommunicationTimeAdapter class CommunicationJsonGroup(BaseCommunicationGroup): @@ -42,7 +46,11 @@ class CommunicationJsonGroup(BaseCommunicationGroup): comm_data = {} matrix_data = {} if os.path.exists(comm_json_path) and self.analysis_mode in ["all", "communication_time"]: - comm_data = FileManager.read_json_file(comm_json_path) + comm_data = MsprofCommunicationTimeAdapter( + comm_json_path).generate_comm_time_data() if self.is_msprof else FileManager.read_json_file( + comm_json_path) if os.path.exists(matrix_json_path) and self.analysis_mode in ["all", "communication_matrix"]: - matrix_data = FileManager.read_json_file(matrix_json_path) + matrix_data = MsprofCommunicationMatrixAdapter( + matrix_json_path).generate_comm_matrix_data() if self.is_msprof else FileManager.read_json_file( + matrix_json_path) return rank_id, comm_data, matrix_data diff --git a/profiler/msprof_analyze/cluster_analyse/communication_group/msprof_communication_matrix_adapter.py b/profiler/msprof_analyze/cluster_analyse/communication_group/msprof_communication_matrix_adapter.py new file mode 100644 index 00000000000..7f1aef80b96 --- /dev/null +++ b/profiler/msprof_analyze/cluster_analyse/communication_group/msprof_communication_matrix_adapter.py @@ -0,0 +1,102 @@ +# Copyright (c) 2025, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +from collections import defaultdict + +from msprof_analyze.prof_common.file_manager import FileManager +from msprof_analyze.prof_common.constant import Constant +from msprof_analyze.prof_common.logger import get_logger + +from msprof_analyze.prof_common.utils import compute_ratio + +logger = get_logger() + + +class MsprofCommunicationMatrixAdapter: + P2P_HCOM = ["hcom_send", "hcom_receive", "hcom_batchsendrecv"] + HCCL_PATTERN = r"send|reduce|invalid|broadcast|allreduce|" \ + r"receive|allgather|reducescatter|scatter|alltoall|alltoallv|alltoallvc|batchsendrecv" + BANDWIDTH_GB_S = "Bandwidth(GB/s)" + TRANSPORT_TYPE = "Transport Type" + TRANSIT_SIZE_MB = "Transit Size(MB)" + TRANSIT_TIME_MS = "Transit Time(ms)" + + def __init__(self, file_path): + self.file_path = file_path + + def generate_comm_matrix_data(self): + output_comm_matrix = {"step": {Constant.P2P: {}, Constant.COLLECTIVE: {}}} + comm_matrix_data = FileManager.read_json_file(self.file_path) + split_comm_dict = {Constant.P2P: {}, Constant.COLLECTIVE: {}} + for communication_op, comm_matrix_info in comm_matrix_data.items(): + lower_op_name = communication_op.lower() + if any(lower_op_name.startswith(start_str) for start_str in self.P2P_HCOM): + split_comm_dict[Constant.P2P][communication_op] = comm_matrix_info + elif lower_op_name.startswith(Constant.TOTAL): + continue + else: + split_comm_dict[Constant.COLLECTIVE][communication_op] = comm_matrix_info + output_comm_matrix["step"][Constant.P2P] = self.integrate_matrix_data( + self.get_comm_type(split_comm_dict[Constant.P2P])) + output_comm_matrix["step"][Constant.COLLECTIVE] = self.integrate_matrix_data( + self.get_comm_type(split_comm_dict[Constant.COLLECTIVE])) + return output_comm_matrix + + def get_comm_type(self, op_data: dict) -> dict: + new_comm_op_dict = defaultdict(list) + for communication_op, communication_info in op_data.items(): + match_obj = re.compile(self.HCCL_PATTERN).search((communication_op.lower())) + if match_obj: + comm_op_type = match_obj.group() + else: + comm_op_type = communication_op.split("__")[0] + logger.warning(f"Unknown communication op type: {comm_op_type}") + for link, data in communication_info.items(): + new_comm_op_name = (comm_op_type, communication_op.split("@")[-1], link) + data['Op Name'] = communication_op.split("@")[0] + new_comm_op_dict[new_comm_op_name].append(data) + return new_comm_op_dict + + def integrate_matrix_data(self, new_comm_op_dict: dict): + """integrate the matrix data""" + comm_op_dict = defaultdict(dict) + for new_comm_op_name, data in new_comm_op_dict.items(): + data.sort(key=lambda x: x[self.BANDWIDTH_GB_S], reverse=True) + t_type = data[0].get(self.TRANSPORT_TYPE, '') + t_size = sum(x.get(self.TRANSIT_SIZE_MB, 0) for x in data) + t_time = sum(x.get(self.TRANSIT_TIME_MS, 0) for x in data) + bandwidth = compute_ratio(t_size, t_time) + + link = new_comm_op_name[2] + new_comm_op_name_top1 = f'{new_comm_op_name[0]}-top1@{new_comm_op_name[1]}' + new_comm_op_name_middle = f'{new_comm_op_name[0]}-middle@{new_comm_op_name[1]}' + new_comm_op_name_bottom1 = f'{new_comm_op_name[0]}-bottom1@{new_comm_op_name[1]}' + new_comm_op_name_bottom2 = f'{new_comm_op_name[0]}-bottom2@{new_comm_op_name[1]}' + new_comm_op_name_bottom3 = f'{new_comm_op_name[0]}-bottom3@{new_comm_op_name[1]}' + new_comm_op_name_total = f'{new_comm_op_name[0]}-total@{new_comm_op_name[1]}' + comm_op_dict[new_comm_op_name_top1].update({link: data[0]}) + comm_op_dict[new_comm_op_name_middle].update({link: data[len(data) // 2]}) + comm_op_dict[new_comm_op_name_bottom1].update({link: data[-1]}) + comm_op_dict[new_comm_op_name_total].update({link: { + self.TRANSPORT_TYPE: t_type, + self.TRANSIT_SIZE_MB: t_size, + self.TRANSIT_TIME_MS: t_time, + self.BANDWIDTH_GB_S: bandwidth + }}) + if len(data) >= 2: + comm_op_dict[new_comm_op_name_bottom2].update({link: data[-2]}) + if len(data) >= 3: + comm_op_dict[new_comm_op_name_bottom3].update({link: data[-3]}) + return comm_op_dict diff --git a/profiler/msprof_analyze/cluster_analyse/communication_group/msprof_communication_time_adapter.py b/profiler/msprof_analyze/cluster_analyse/communication_group/msprof_communication_time_adapter.py new file mode 100644 index 00000000000..7b63b700f5c --- /dev/null +++ b/profiler/msprof_analyze/cluster_analyse/communication_group/msprof_communication_time_adapter.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from msprof_analyze.prof_common.file_manager import FileManager +from msprof_analyze.prof_common.constant import Constant + + +class MsprofCommunicationTimeAdapter: + P2P_HCOM = ["hcom_send", "hcom_receive", "hcom_batchsendrecv"] + TOTAL = "total" + + def __init__(self, file_path): + self.file_path = file_path + + def generate_comm_time_data(self): + output_communication = {"step": {Constant.P2P: {}, Constant.COLLECTIVE: {}}} + communication_data = FileManager.read_json_file(self.file_path) + for communication_op, communication_info in communication_data.items(): + lower_op_name = communication_op.lower() + if any(lower_op_name.startswith(start_str) for start_str in self.P2P_HCOM): + output_communication["step"][Constant.P2P][communication_op] = communication_info + elif lower_op_name.startswith(self.TOTAL): + continue + else: + output_communication["step"][Constant.COLLECTIVE][communication_op] = communication_info + + return output_communication diff --git a/profiler/msprof_analyze/prof_common/constant.py b/profiler/msprof_analyze/prof_common/constant.py index 5353fc6d40f..f34aeade895 100644 --- a/profiler/msprof_analyze/prof_common/constant.py +++ b/profiler/msprof_analyze/prof_common/constant.py @@ -61,6 +61,7 @@ class Constant(object): # communication P2P = "p2p" COLLECTIVE = "collective" + TOTAL = "total" STEP_ID = "step_id" RANK_ID = "rank_id" GROUP_NAME = "group_name" @@ -97,6 +98,7 @@ class Constant(object): TRANSPORT_TYPE = "Transport Type" COMM_DATA_DICT = "comm_data_dict" DATA_TYPE = "data_type" + IS_MSPROF = "is_prof" # step time RANK = "rank" diff --git a/profiler/msprof_analyze/prof_common/utils.py b/profiler/msprof_analyze/prof_common/utils.py index 005d8505c9c..5c083256633 100644 --- a/profiler/msprof_analyze/prof_common/utils.py +++ b/profiler/msprof_analyze/prof_common/utils.py @@ -91,3 +91,10 @@ def convert_to_int(num): except (ValueError, NameError): logger.error(f"Can not convert %s to int", num) return 0 + + +def compute_ratio(dividend: float, divisor: float): + if abs(divisor) < 1e-15: + return 0 + else: + return round(dividend / divisor, 4) -- Gitee From 572b0b6fa96a8df74c44990b62ad2e0abacdcf9a Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Thu, 27 Feb 2025 17:20:03 +0800 Subject: [PATCH 19/25] mstx_sum support range --- .../cluster_analyse/common_func/context.py | 7 +- .../recipes/mstx_sum/mstx_sum.py | 109 +++++++++++------- ...tx_mark_export.py => mstx_event_export.py} | 50 +++++++- 3 files changed, 119 insertions(+), 47 deletions(-) rename profiler/msprof_analyze/prof_exports/{mstx_mark_export.py => mstx_event_export.py} (58%) diff --git a/profiler/msprof_analyze/cluster_analyse/common_func/context.py b/profiler/msprof_analyze/cluster_analyse/common_func/context.py index b41972c0d21..cde351508c0 100644 --- a/profiler/msprof_analyze/cluster_analyse/common_func/context.py +++ b/profiler/msprof_analyze/cluster_analyse/common_func/context.py @@ -84,7 +84,12 @@ class ConcurrentContext(Context): def map(self, func, *iterables, **kwargs): partial_func = partial(func, **kwargs) - return list(self._executor.map(partial_func, *iterables)) + try: + res = list(self._executor.map(partial_func, *iterables)) + except Exception as err: + logger.error(err) + return [] + return res def wait(self, waitable): return waitable diff --git a/profiler/msprof_analyze/cluster_analyse/recipes/mstx_sum/mstx_sum.py b/profiler/msprof_analyze/cluster_analyse/recipes/mstx_sum/mstx_sum.py index bfbcc6ffb49..db6aae0de86 100644 --- a/profiler/msprof_analyze/cluster_analyse/recipes/mstx_sum/mstx_sum.py +++ b/profiler/msprof_analyze/cluster_analyse/recipes/mstx_sum/mstx_sum.py @@ -21,7 +21,7 @@ from msprof_analyze.cluster_analyse.common_func.utils import describe_duration from msprof_analyze.cluster_analyse.recipes.base_recipe_analysis import BaseRecipeAnalysis from msprof_analyze.prof_common.constant import Constant from msprof_analyze.prof_common.logger import get_logger -from msprof_analyze.prof_exports.mstx_mark_export import MstxMarkExport +from msprof_analyze.prof_exports.mstx_event_export import MstxMarkExport, MstxRangeExport from msprof_analyze.prof_exports.mstx_step_export import MstxStepExport logger = get_logger() @@ -43,16 +43,28 @@ def format_mark_info(df: pd.DataFrame, start_idx, stop_idx, name) -> MarkInfo: ) -def rename_mark_msg_name(mark_stats_df: pd.DataFrame): +def format_range_info(df: pd.DataFrame, idx, name) -> MarkInfo: + range_series = df.iloc[idx] + return MarkInfo( + name=name, + framework_duration=float(0), + cann_duration=float(range_series["cann_end_ts"] - range_series["cann_start_ts"]), + device_duration=float(range_series["device_end_ts"] - range_series["device_start_ts"]), + tid=range_series["tid"], + start_ns=range_series["cann_start_ts"] + ) + + +def rename_mark_msg_name(mstx_stats_df: pd.DataFrame): msg_idx_counter = {} - for idx, mark_info in enumerate(mark_stats_df.itertuples(index=False)): + for idx, mark_info in enumerate(mstx_stats_df.itertuples(index=False)): msg_idx_counter.setdefault(mark_info.step_id, {}).setdefault(mark_info.name, []).append(idx) for msg_dict in msg_idx_counter.values(): for msg, idx_list in msg_dict.items(): if len(idx_list) <= 1: continue for i, idx in enumerate(idx_list): - mark_stats_df.loc[idx, 'name'] = f"{msg}_{i}" + mstx_stats_df.loc[idx, 'name'] = f"{msg}_{i}" def compute_step_id(mark_stat, step_stats_df: pd.DataFrame): @@ -80,6 +92,45 @@ def format_columns(df: pd.DataFrame): return formatted_df[cols] +def handle_mark_data(mark_df: pd.DataFrame, rank_id: int) -> list: + res = [] + mark_df["framework_ts"] = mark_df["framework_ts"].astype("int64") + mark_info = {} + mismatch_msg = [] + for idx, row in enumerate(mark_df.itertuples(index=False)): + if row.msg.endswith(MstxSum.START_SUFFIX): + msg = row.msg[:-len(MstxSum.START_SUFFIX)] + mark_info.setdefault(row.tid, {}).setdefault(msg, []).append(idx) + elif row.msg.endswith(MstxSum.STOP_SUFFIX): + msg = row.msg[:-len(MstxSum.STOP_SUFFIX)] + idx_list = mark_info.get(row.tid, {}).get(msg, []) + if not idx_list: + mismatch_msg.append((row.msg, idx)) + continue + start_idx = idx_list.pop() + res.append(format_mark_info(mark_df, start_idx, idx, msg)) + + # 统计未匹配上的mark信息 + for msg_info in mark_info.values(): + for msg, idx_list in msg_info.items(): + if not idx_list: + continue + mismatch_msg.extend((msg + MstxSum.START_SUFFIX, idx) for idx in idx_list) + if mismatch_msg: + mismatch_msg.sort(key=lambda msg: msg[1]) + logger.warning(f"The following mark messages do not match anyone in " + f"rank {rank_id}: {','.join(msg[0] for msg in mismatch_msg)}.") + + return res + + +def handle_range_data(range_df: pd.DataFrame) -> list: + res = [] + for idx, row in enumerate(range_df.itertuples(index=False)): + res.append(format_range_info(range_df, idx, row.msg)) + return res + + class MstxSum(BaseRecipeAnalysis): TABLE_FRAMEWORK_STATS = "MSTXAllFrameworkStats" TABLE_CANN_STATS = "MSTXAllCannStats" @@ -159,40 +210,18 @@ class MstxSum(BaseRecipeAnalysis): if step_df is None or step_df.empty: step_df = pd.DataFrame({"start_ns": [0], "end_ns": [float("inf")], "step_id": [0]}) mark_df = MstxMarkExport(profiler_db_path, analysis_class, step_range).read_export_db() - if mark_df is None or mark_df.empty: - logger.warning(f"There is no mark data in {profiler_db_path}.") + range_df = MstxRangeExport(profiler_db_path, analysis_class, step_range).read_export_db() + mstx_res = [] + if not mark_df.empty: + mstx_res += handle_mark_data(mark_df, rank_id) + if not range_df.empty: + mstx_res += handle_range_data(range_df) + if not mstx_res: + logger.warning(f"There is no mstx data in {profiler_db_path}.") return None - mark_df["framework_ts"] = mark_df["framework_ts"].astype("int64") - - mark_info = {} - mark_res = [] - mismatch_msg = [] - for idx, row in enumerate(mark_df.itertuples(index=False)): - if row.msg.endswith(MstxSum.START_SUFFIX): - msg = row.msg[:-len(MstxSum.START_SUFFIX)] - mark_info.setdefault(row.tid, {}).setdefault(msg, []).append(idx) - elif row.msg.endswith(MstxSum.STOP_SUFFIX): - msg = row.msg[:-len(MstxSum.STOP_SUFFIX)] - idx_list = mark_info.get(row.tid, {}).get(msg, []) - if not idx_list: - mismatch_msg.append((row.msg, idx)) - continue - start_idx = idx_list.pop() - mark_res.append(format_mark_info(mark_df, start_idx, idx, msg)) - - # 统计未匹配上的mark信息 - for msg_info in mark_info.values(): - for msg, idx_list in msg_info.items(): - if not idx_list: - continue - mismatch_msg.extend((msg + MstxSum.START_SUFFIX, idx) for idx in idx_list) - if mismatch_msg: - mismatch_msg.sort(key=lambda msg: msg[1]) - logger.warning(f"The following mark messages do not match anyone in " - f"rank {rank_id}: {','.join(msg[0] for msg in mismatch_msg)}.") - - mark_stats_df = pd.DataFrame(mark_res).assign(Rank=rank_id) - mark_stats_df["step_id"] = mark_stats_df.apply(compute_step_id, axis=1, step_stats_df=step_df) - rename_mark_msg_name(mark_stats_df) - mark_stats_df = format_columns(mark_stats_df).set_index("Name", drop=True) - return mark_stats_df + + mstx_stats_df = pd.DataFrame(mstx_res).assign(Rank=rank_id) + mstx_stats_df["step_id"] = mstx_stats_df.apply(compute_step_id, axis=1, step_stats_df=step_df) + rename_mark_msg_name(mstx_stats_df) + mstx_stats_df = format_columns(mstx_stats_df).set_index("Name", drop=True) + return mstx_stats_df diff --git a/profiler/msprof_analyze/prof_exports/mstx_mark_export.py b/profiler/msprof_analyze/prof_exports/mstx_event_export.py similarity index 58% rename from profiler/msprof_analyze/prof_exports/mstx_mark_export.py rename to profiler/msprof_analyze/prof_exports/mstx_event_export.py index 6a7f8d0c6d2..97c3813b7eb 100644 --- a/profiler/msprof_analyze/prof_exports/mstx_mark_export.py +++ b/profiler/msprof_analyze/prof_exports/mstx_event_export.py @@ -16,7 +16,7 @@ from msprof_analyze.prof_exports.base_stats_export import BaseStatsExport from msprof_analyze.prof_common.constant import Constant -QUERY = """ +MARK_QUERY = """ WITH FRAMEWORK_API AS ( SELECT @@ -46,7 +46,8 @@ LEFT JOIN LEFT JOIN STRING_IDS AS MSG_IDS ON MSTX_EVENTS.message == MSG_IDS.id -{} +WHERE + MSTX_EVENTS.eventType == 3 {} ORDER BY MSTX_EVENTS.startNs """ @@ -61,9 +62,46 @@ class MstxMarkExport(BaseStatsExport): def get_query_statement(self): if self._step_range: filter_statement_1 = f"WHERE PYTORCH_API.startNs >= {self._step_range.get(Constant.START_NS)} " \ - f"and PYTORCH_API.startNs <= {self._step_range.get(Constant.END_NS)}" - filter_statement_2 = f"WHERE MSTX_EVENTS.startNs >= {self._step_range.get(Constant.START_NS)} " \ - f"and MSTX_EVENTS.startNs <= {self._step_range.get(Constant.END_NS)}" + f"AND PYTORCH_API.startNs <= {self._step_range.get(Constant.END_NS)}" + filter_statement_2 = f"AND MSTX_EVENTS.startNs >= {self._step_range.get(Constant.START_NS)} " \ + f"AND MSTX_EVENTS.startNs <= {self._step_range.get(Constant.END_NS)}" else: filter_statement_1, filter_statement_2 = "", "" - return QUERY.format(filter_statement_1, filter_statement_2) + return MARK_QUERY.format(filter_statement_1, filter_statement_2) + + +RANGE_QUERY = ''' +SELECT + MSG_IDS.value AS "msg", + MSTX_EVENTS.startNs AS "cann_start_ts", + MSTX_EVENTS.endNs AS "cann_end_ts", + TASK.startNs AS "device_start_ts", + TASK.endNs AS "device_end_ts", + MSTX_EVENTS.globalTid AS "tid" +FROM + MSTX_EVENTS +LEFT JOIN + TASK + ON MSTX_EVENTS.connectionId == TASK.connectionId +LEFT JOIN + STRING_IDS AS MSG_IDS + ON MSTX_EVENTS.message == MSG_IDS.id +WHERE + MSTX_EVENTS.eventType == 2 {} +AND + MSTX_EVENTS.connectionId != 4294967295 +ORDER BY + MSTX_EVENTS.startNs + ''' + + +class MstxRangeExport(BaseStatsExport): + + def __init__(self, db_path, recipe_name, step_range): + super().__init__(db_path, recipe_name, step_range) + self._query = self.get_query_statement() + + def get_query_statement(self): + filter_statement = f"AND MSTX_EVENTS.startNs >= {self._step_range.get(Constant.START_NS)} AND " \ + f"MSTX_EVENTS.startNs <= {self._step_range.get(Constant.END_NS)}" if self._step_range else "" + return RANGE_QUERY.format(filter_statement) -- Gitee From 3944c9dea10f64b9e24c5a3967b566afff16c789 Mon Sep 17 00:00:00 2001 From: curry3 <485078529@qq.com> Date: Wed, 26 Feb 2025 11:19:57 +0800 Subject: [PATCH 20/25] =?UTF-8?q?=E3=80=90feature=E3=80=91ms=E5=92=8Cpt?= =?UTF-8?q?=E8=A1=A5=E5=85=85API=E6=94=AF=E6=8C=81=E5=88=97=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data_processor/pytorch_processor.py | 8 +- .../dump/hook_cell/support_wrap_ops.yaml | 50 ++-- .../pytorch/hook_module/support_wrap_ops.yaml | 242 ++++++++++++------ 3 files changed, 197 insertions(+), 103 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 64253aa4260..2cd98b12568 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -78,14 +78,16 @@ class PytorchDataProcessor(BaseDataProcessor): def analyze_device_in_kwargs(element): single_arg = {} single_arg.update({'type': "torch.device"}) - if not isinstance(element, str): + if isinstance(element, (int, str)): + single_arg.update({"value": element}) + elif isinstance(element, torch.device): if hasattr(element, "index"): device_value = element.type + ":" + str(element.index) else: device_value = element.type single_arg.update({"value": device_value}) else: - single_arg.update({"value": element}) + logger.debug(f"Device type {type(element)} is not supported.") return single_arg @staticmethod @@ -311,7 +313,7 @@ class TensorDataProcessor(PytorchDataProcessor): saved_tensor = tensor.clone().contiguous().detach() save_pt(saved_tensor, file_path) return single_arg - + def _analyze_numpy(self, ndarray, suffix): dump_data_name, file_path = self.get_save_file_path(suffix) save_pt(torch.tensor(ndarray), file_path) diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml b/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml index 723b0cbc93f..364062b4647 100644 --- a/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +++ b/debug/accuracy_tools/msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml @@ -564,15 +564,15 @@ tensor: - all - amax - amin + - angle - any - arccos - arccosh - - argmax - - angle - arcsin - arcsinh - arctan - arctanh + - argmax - argmin - argsort - asin @@ -582,19 +582,23 @@ tensor: - atanh - baddbmm - bernoulli + - bfloat16 - bincount - bitwise_and - bitwise_or - bitwise_xor - bmm - bool + - bool astype - broadcast_to + - byte - ceil - - cholesky_solve - cholesky + - cholesky_solve - clamp - clip - conj + - copy - copysign - cos - cosh @@ -606,11 +610,13 @@ tensor: - deg2rad - diag - diagflat + - diagonal - diff - digamma - div - div_ - divide + - double - equal - erf - erfc @@ -618,13 +624,16 @@ tensor: - exp - expand_as - expm1 + - flatten - flip - fliplr - flipud + - float - float_power - floor - fmod - frac + - from_numpy - gather_elements - ge - geqrf @@ -648,12 +657,12 @@ tensor: - inner - int - inverse + - is_complex + - is_signed - isclose - isfinite - isinf - isnan - - is_complex - - is_signed - isneginf - isposinf - isreal @@ -704,28 +713,27 @@ tensor: - new_ones - new_zeros - nextafter - - norm - nonzero + - norm - not_equal - ormqr - permute - pow - prod - qr + - rad2deg - ravel - real - reciprocal - remainder - renorm - - rad2deg - - tile - repeat_interleave - reshape - reshape - - round + - resize - rot90 + - round - rsqrt - - sum_to_size - scatter - sgn - short @@ -745,7 +753,8 @@ tensor: - sub - sub_ - subtract - - subtract + - sum + - sum_to_size - svd - swapaxes - swapdims @@ -753,13 +762,13 @@ tensor: - take - tan - tanh - - trace - - swapaxes + - tensor_split - tile + - to - topk - - tril - - tensor_split + - trace - transpose + - tril - true_divide - trunc - unbind @@ -769,17 +778,6 @@ tensor: - view - where - xlogy - - from_numpy - - std - - take - - var - - all - - any - - copy - - diagonal - - flatten - - resize - - sum mint.ops: - abs diff --git a/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml index 4bc22f51ceb..91eb016284a 100644 --- a/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml +++ b/debug/accuracy_tools/msprobe/pytorch/hook_module/support_wrap_ops.yaml @@ -149,9 +149,10 @@ tensor: - __bool__ - __div__ - __eq__ + - __floordiv__ - __ge__ - - __gt__ - __getitem__ + - __gt__ - __iadd__ - __iand__ - __idiv__ @@ -160,23 +161,33 @@ tensor: - __imod__ - __imul__ - __ior__ + - __ipow__ - __irshift__ - __isub__ - __ixor__ + - __le__ - __lshift__ + - __lt__ - __matmul__ - __mod__ - __mul__ + - __ne__ - __nonzero__ - __or__ + - __pow__ - __radd__ + - __rdiv__ + - __rmod__ - __rmul__ + - __ror__ + - __rpow__ - __rshift__ + - __rsub__ + - __rxor__ - __setitem__ - __sub__ - __truediv__ - __xor__ - - __pow__ - abs - abs_ - absolute @@ -199,12 +210,14 @@ tensor: - addmv_ - addr - addr_ + - adjoint - align_as - align_to - all - allclose - amax - amin + - aminmax - angle - any - arccos @@ -216,12 +229,15 @@ tensor: - arcsinh - arcsinh_ - arctan + - arctan2 + - arctan2_ - arctan_ - arctanh - arctanh_ - argmax - argmin - argsort + - argwhere - asin - asin_ - asinh @@ -236,39 +252,51 @@ tensor: - baddbmm_ - bernoulli - bernoulli_ + - bfloat16 - bincount - bitwise_and - bitwise_and_ + - bitwise_left_shift + - bitwise_left_shift_ - bitwise_not - bitwise_not_ - bitwise_or - bitwise_or_ + - bitwise_right_shift + - bitwise_right_shift_ - bitwise_xor - bitwise_xor_ - bmm + - bool - broadcast_to + - byte - cauchy_ - ceil - ceil_ + - cfloat + - char - cholesky + - cholesky_inverse + - cholesky_solve - chunk - clamp - - cholesky_solve - - cholesky_inverse - clamp_ - clamp_max - clamp_max_ - - clip - clamp_min - clamp_min_ + - clip - clip_ + - conj_physical - copysign - copysign_ + - corrcoef - cos - cos_ - cosh - cosh_ - count_nonzero + - cov - cummax - cummin - cumprod @@ -282,20 +310,23 @@ tensor: - diag_embed - diagflat - diagonal + - diagonal_scatter - diff - - dist - digamma - digamma_ + - dist - div - div_ - divide - divide_ - dot + - double + - dsplit - eig - eq - eq_ - - erf - equal + - erf - erf_ - erfc - erfc_ @@ -304,18 +335,21 @@ tensor: - exp - exp2 - exp2_ - - expm1 - exp_ + - expand + - expand_as + - expm1 - expm1_ - exponential_ - fill_ - - fix - fill_diagonal_ + - fix - fix_ + - flatten - flip - fliplr - - flatten - flipud + - float - float_power - float_power_ - floor @@ -328,6 +362,7 @@ tensor: - fmod_ - frac - frac_ + - frexp - gather - gcd - gcd_ @@ -338,31 +373,37 @@ tensor: - ger - greater - greater_ - - gt - - gt_ - greater_equal - greater_equal_ + - gt + - gt_ + - half - hardshrink - heaviside - heaviside_ - histc + - histogram + - hsplit - hypot - hypot_ + - i0 + - i0_ - igamma - igamma_ - igammac - igammac_ - index_add - index_add_ - - inverse - index_copy - index_copy_ - index_fill - index_fill_ - index_put - index_put_ - - inner - index_select + - inner + - int + - inverse - isclose - isfinite - isinf @@ -380,7 +421,6 @@ tensor: - le_ - lerp - lerp_ - - where - less - less_ - less_equal @@ -397,43 +437,47 @@ tensor: - log_ - log_normal_ - log_softmax - - logcumsumexp - - logdet - logaddexp - logaddexp2 + - logcumsumexp + - logdet - logical_and - logical_and_ - logical_not - - logit - logical_not_ - logical_or - logical_or_ - logical_xor - logical_xor_ + - logit - logit_ - logsumexp + - long - lstsq - lt - lt_ + - lu - lu_solve - map2_ - map_ - masked_fill - - matmul - masked_fill_ - masked_scatter - masked_scatter_ - masked_select + - matmul - matrix_exp + - matrix_power - max - maximum - mean - - matrix_power - median - min - minimum - mm - mode + - moveaxis + - movedim - msort - mul - mul_ @@ -443,6 +487,11 @@ tensor: - mv - mvlgamma - mvlgamma_ + - nan_to_num + - nan_to_num_ + - nanmean + - nanmedian + - nanquantile - nansum - narrow - narrow_copy @@ -452,20 +501,29 @@ tensor: - neg_ - negative - negative_ + - nextafter + - nextafter_ - nonzero - norm - normal_ - not_equal - not_equal_ + - numpy + - orgqr + - ormqr + - outer - permute - pinverse - polygamma + - polygamma_ - pow - pow_ - - polygamma_ - prelu - prod - put_ + - q_zero_point + - qr + - quantile - rad2deg - rad2deg_ - ravel @@ -474,15 +532,16 @@ tensor: - relu - relu_ - remainder - - repeat_interleave - - reshape - remainder_ - renorm - renorm_ - repeat + - repeat_interleave + - reshape - reshape_as - resize_ - resize_as_ + - resolve_neg - roll - rot90 - round @@ -496,6 +555,7 @@ tensor: - select - sgn - sgn_ + - short - sigmoid - sigmoid_ - sign @@ -507,11 +567,13 @@ tensor: - sinc_ - sinh - sinh_ + - slice_scatter - slogdet - smm - softmax - solve - sort + - split - split_with_sizes - sqrt - sqrt_ @@ -521,21 +583,29 @@ tensor: - squeeze_ - sspaddmm - std + - stft + - stride - sub - sub_ + - subtract - sum - sum_to_size - svd + - swapaxes + - swapdims + - swapdims_ - symeig - t - t_ - take + - take_along_dim - tan - tan_ - tanh - tanh_ - tensor_split - tile + - to - topk - transpose - transpose_ @@ -543,8 +613,8 @@ tensor: - tril - tril_ - triu - - true_divide - triu_ + - true_divide - true_divide_ - trunc - trunc_ @@ -552,37 +622,20 @@ tensor: - unbind - unflatten - unfold + - unique + - unique_consecutive - unsafe_chunk - - unsqueeze - unsafe_split - unsafe_split_with_sizes + - unsqueeze + - unsqueeze_ - var - vdot - - unsqueeze_ - view_as + - vsplit + - where - xlogy - xlogy_ - - split - - stft - - nan_to_num - - dsplit - - orgqr - - bitwise_left_shift_ - - arctan2 - - histogram - - q_zero_point - - adjoint - - ormqr - - bitwise_right_shift_ - - nanquantile - - lu - - quantile - - arctan2_ - - qr - - diagonal_scatter - - corrcoef - - vsplit - - aminmax torch: - linalg.norm @@ -642,13 +695,14 @@ torch: - addmv - addmv_ - addr - - amax - affine_grid_generator - align_tensors - all - alpha_dropout - - amin - alpha_dropout_ + - amax + - amin + - aminmax - angle - any - arange @@ -661,12 +715,14 @@ torch: - arcsinh - arcsinh_ - arctan + - arctan2 - arctan_ - arctanh - arctanh_ - argmax - argmin - argsort + - argwhere - asin - asin_ - asinh @@ -687,13 +743,13 @@ torch: - batch_norm_elemt - batch_norm_gather_stats - batch_norm_gather_stats_with_counts - - bernoulli - batch_norm_stats - batch_norm_update_stats + - bernoulli - bilinear + - binary_cross_entropy_with_logits - bincount - binomial - - binary_cross_entropy_with_logits - bitwise_and - bitwise_not - bitwise_or @@ -739,9 +795,9 @@ torch: - conv_transpose1d - conv_transpose2d - conv_transpose3d - - cos - convolution - copysign + - cos - cos_ - cosh - cosh_ @@ -755,14 +811,16 @@ torch: - cummin - cumprod - cumsum + - cumulative_trapezoid - deg2rad - deg2rad_ - det - diag - diag_embed - - diff - diagflat - diagonal + - diagonal_scatter + - diff - digamma - dist - div @@ -771,12 +829,15 @@ torch: - dropout - dropout_ - dsmm + - dsplit - dstack - eig - einsum - embedding - embedding_bag - embedding_renorm_ + - empty + - empty_like - eq - equal - erf @@ -791,12 +852,12 @@ torch: - expm1 - expm1_ - eye - - feature_dropout - feature_alpha_dropout - feature_alpha_dropout_ + - feature_dropout - feature_dropout_ - - fix - fill_ + - fix - fix_ - flatten - flip @@ -811,8 +872,9 @@ torch: - fmod - frac - frac_ - - full + - frexp - frobenius_norm + - full - full_like - gather - gcd @@ -824,8 +886,8 @@ torch: - greater_equal - grid_sampler - grid_sampler_2d - - group_norm - grid_sampler_3d + - group_norm - gru - gru_cell - gt @@ -835,23 +897,29 @@ torch: - heaviside - hinge_embedding_loss - histc + - histogram + - histogramdd - hsmm + - hsplit - hspmm - hstack - hypot + - i0 + - i0_ - igamma - igammac - index_add - index_copy - - inner - index_fill - index_put - index_put_ - index_select + - inner - instance_norm - inverse - isclose - isfinite + - isin - isinf - isnan - isneginf @@ -879,8 +947,8 @@ torch: - log1p_ - log2 - log2_ - - log_softmax - log_ + - log_softmax - logaddexp - logaddexp2 - logcumsumexp @@ -899,18 +967,18 @@ torch: - lt - lu_solve - lu_unpack - - masked_fill - margin_ranking_loss + - masked_fill - masked_scatter - masked_select - - matrix_exp - matmul + - matrix_exp - matrix_power - matrix_rank - max - max_pool1d - - max_pool2d - max_pool1d_with_indices + - max_pool2d - max_pool3d - maximum - mean @@ -929,18 +997,20 @@ torch: - mvlgamma - nan_to_num - nan_to_num_ + - nanmean - nanmedian + - nanquantile - nansum - narrow + - narrow_copy - native_batch_norm - native_group_norm - - narrow_copy - native_layer_norm - native_norm - ne - neg - - negative - neg_ + - negative - negative_ - nextafter - nonzero @@ -972,30 +1042,31 @@ torch: - ravel - real - reciprocal - - relu - reciprocal_ + - relu - relu_ - remainder - renorm - repeat_interleave - reshape - resize_as_ + - resolve_neg - roll - rot90 - round - round_ + - row_stack - rrelu - rrelu_ - rsqrt - - row_stack - rsqrt_ - rsub - saddmm - scalar_tensor - scatter - - select - scatter_add - searchsorted + - select - selu - selu_ - sgn @@ -1015,12 +1086,12 @@ torch: - solve - sort - sparse_coo_tensor - - square - split - split_with_sizes - spmm - sqrt - sqrt_ + - square - square_ - squeeze - sspaddmm @@ -1042,8 +1113,8 @@ torch: - tan_ - tanh - tanh_ - - tensordot - tensor_split + - tensordot - threshold - threshold_ - tile @@ -1059,19 +1130,21 @@ torch: - true_divide - trunc - trunc_ - - unique_consecutive - - xlogy - unbind + - unflatten + - unique_consecutive - unsafe_chunk - unsafe_split - - vander - - var - - vdot - unsafe_split_with_sizes - unsqueeze + - vander + - var - var_mean + - vdot + - vsplit - vstack - where + - xlogy - xlogy_ _VF: @@ -1165,6 +1238,27 @@ torch_npu: - npu_moe_finalize_routing - npu_moe_gating_top_k_softmax - npu_trans_quant_param + - npu_gelu + - npu_ffn + - npu_quant_matmul + - npu_format_cast_ + - npu_dynamic_quant + - npu_moe_compute_expert_tokens + - npu_weight_quant_batchmatmul + - npu_dynamic_quant_asymmetric + - npu_grouped_matmul + - npu_quant_scatter_ + - npu_group_quant + - npu_fused_infer_attention_score + - npu_quantize + - npu_fast_gelu + - npu_weight_quant_batchmatmul + - scatter_update + - scatter_update_ + - npu_moe_init_routing + - npu_scatter_nd_update_ + - npu_scatter_nd_update + - npu_prefetch aten: - signbit -- Gitee From 45afc5e983f6c0d3b19026b16d74c8ff5dac9762 Mon Sep 17 00:00:00 2001 From: qianggee Date: Fri, 28 Feb 2025 02:27:54 +0000 Subject: [PATCH 21/25] fix grad sync bug --- debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py index eea2bdbc2d2..286ec298ba2 100644 --- a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py +++ b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py @@ -1052,7 +1052,7 @@ class TrainerMon: self.enable_megatron = True logger.info("megatron version is > core_r0.8.0 <= core_r0.9.0") except ImportError: - self.enable_megatron = False + self.enable_megatron = False | self.enable_megatron if not self.enable_megatron: self._hook_weights() -- Gitee From ef229d0fedd79b0750d160019398d7bcdb323fe3 Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Fri, 28 Feb 2025 10:32:22 +0800 Subject: [PATCH 22/25] update_msprof_analyze_whl --- profiler/msprof_analyze/README.md | 1 + profiler/msprof_analyze/version.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/profiler/msprof_analyze/README.md b/profiler/msprof_analyze/README.md index 7e2267a5559..d39aea89a52 100644 --- a/profiler/msprof_analyze/README.md +++ b/profiler/msprof_analyze/README.md @@ -117,6 +117,7 @@ Successfully installed msprof-analyze-{version} | profiler版本 | 发布日期 | 下载链接 | 校验码 | |------------|------------|-------------------------------------------------------------------------------------------------------------------------------------------| ------------------------------------------------------------ | +| 2.0.1 | 2025-02-28 | [msprof_analyze-2.0.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/2.0.1/msprof_analyze-2.0.1-py3-none-any.whl) | 82dfe2c779dbab9015f61d36ea0c32d832b6d182454b3f7db68e6c0ed49c0423 | | 2.0.0 | 2025-02-08 | [msprof_analyze-2.0.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/2.0.0/msprof_analyze-2.0.0-py3-none-any.whl) | 8e44e5f3e7681c377bb2657a600ad9841d3bed11061ddd7844c30e8a97242101 | | 1.3.4 | 2025-01-20 | [msprof_analyze-1.3.4-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.3.4/msprof_analyze-1.3.4-py3-none-any.whl) | 8de92188d1a97105fb14cadcb0875ccd5f66629ee3bb25f37178da1906f4cce2 | | 1.3.3 | 2024-12-26 | [msprof_analyze-1.3.3-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.3.3/msprof_analyze-1.3.3-py3-none-any.whl) | 27676f2eee636bd0c65243f81e292c7f9d30d7f985c772ac9cbaf10b54d3584e | diff --git a/profiler/msprof_analyze/version.txt b/profiler/msprof_analyze/version.txt index 359a5b952d4..10bf840ed53 100644 --- a/profiler/msprof_analyze/version.txt +++ b/profiler/msprof_analyze/version.txt @@ -1 +1 @@ -2.0.0 \ No newline at end of file +2.0.1 \ No newline at end of file -- Gitee From 1291b93f40d6e4c143116f4d63166c8a91920730 Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Fri, 28 Feb 2025 10:36:27 +0800 Subject: [PATCH 23/25] base_recipe_analysis_adapt_msprof --- .../recipes/base_recipe_analysis.py | 47 +++++++++++++++---- .../msprof_analyze/prof_common/constant.py | 1 + .../prof_exports/base_stats_export.py | 3 ++ 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/profiler/msprof_analyze/cluster_analyse/recipes/base_recipe_analysis.py b/profiler/msprof_analyze/cluster_analyse/recipes/base_recipe_analysis.py index a8b50359253..ed60873a1ef 100644 --- a/profiler/msprof_analyze/cluster_analyse/recipes/base_recipe_analysis.py +++ b/profiler/msprof_analyze/cluster_analyse/recipes/base_recipe_analysis.py @@ -14,6 +14,7 @@ # limitations under the License. import argparse import os +import re import shutil import sys import traceback @@ -42,6 +43,7 @@ class BaseRecipeAnalysis(ABC): self._recipe_name = params.get(Constant.RECIPE_NAME, "") self._parallel_mode = params.get(Constant.PARALLEL_MODE, "") self._export_type = params.get(Constant.EXPORT_TYPE, "") + self._is_msprof = params.get(Constant.IS_MSPROF) self._cluster_analysis_output_path = os.path.join( params.get(Constant.CLUSTER_ANALYSIS_OUTPUT_PATH, self._collection_dir), Constant.CLUSTER_ANALYSIS_OUTPUT) self._output_path = self._cluster_analysis_output_path if self._export_type == "db" else os.path.join( @@ -158,16 +160,40 @@ class BaseRecipeAnalysis(ABC): db_paths = [] for rank_id in rank_ids: rank_path = self._data_map[rank_id] - db_path = os.path.join(rank_path, Constant.SINGLE_OUTPUT, f"ascend_pytorch_profiler_{rank_id}.db") - if os.path.exists(db_path): - db_paths.append({Constant.RANK_ID: rank_id, Constant.PROFILER_DB_PATH: db_path, - Constant.STEP_RANGE: self._get_step_range(db_path)}) + db_path_dict = {Constant.RANK_ID: rank_id, Constant.PROFILER_DB_PATH: "", Constant.ANALYSIS_DB_PATH: "", + Constant.STEP_RANGE: {}} + profiler_db_path = self._get_profiler_db_path(rank_id, rank_path) + analysis_db_path = os.path.join(rank_path, "analyze", "communication_analyzer.db") if self._is_msprof \ + else os.path.join(rank_path, Constant.SINGLE_OUTPUT, f"analysis.db") + if os.path.exists(profiler_db_path): + db_path_dict[Constant.PROFILER_DB_PATH] = profiler_db_path + db_path_dict[Constant.STEP_RANGE] = self._get_step_range(profiler_db_path) else: - logger.warning(f"DB file not found, rank id: {rank_id}, db path: {db_path}.") + logger.warning(f"Profiler DB file not found, rank id: {rank_id}, db path: {profiler_db_path}.") + + if os.path.exists(analysis_db_path): + db_path_dict[Constant.ANALYSIS_DB_PATH] = analysis_db_path + else: + logger.warning(f"Analysis DB file not found, rank id: {rank_id}, db path: {analysis_db_path}.") + if db_path_dict.get(Constant.PROFILER_DB_PATH): + db_paths.append(db_path_dict) if invalid_rank_id: logger.warning(f"Invalid Rank id: [{','.join(invalid_rank_id)}].") return db_paths + def _get_profiler_db_path(self, rank_id, data_path): + if self._is_msprof: + msprof_db_pattern = r"^msprof_\d{14}\.db$" + msprof_db_list = [] + for file_name in os.listdir(data_path): + if re.match(msprof_db_pattern, file_name): + msprof_db_list.append(file_name) + if msprof_db_list: + msprof_db_list.sort(key=lambda x: x.split(".")[0].split("_")[-1]) + return os.path.join(data_path, msprof_db_list[-1]) + return os.path.join(data_path, "msprof_xx.db") + return os.path.join(data_path, Constant.SINGLE_OUTPUT, f"ascend_pytorch_profiler_{rank_id}.db") + def _get_step_range(self, db_path): step_range = {} if self._step_id == Constant.VOID_STEP: @@ -204,9 +230,14 @@ class BaseRecipeAnalysis(ABC): Extract the profiling data required for cluster analysis from each device, and then aggregate the results from each device to be processed by a reduce function. Params: - data_map: eg. {"RANK_ID": 1, - "profiler_db_path": "xxxx/ascend_pytorch_profiler_1.db", - "step_range": {"id": 2, "startNs": 12345, "endNs": 12443]} + data_map: eg1. {"RANK_ID": 1, + "profiler_db_path": "xxx/ASCEND_PROFILER_OUTPUT/ascend_pytorch_profiler_1.db", + "analysis_db_path": "xxx/ASCEND_PROFILER_OUTPUT/analysis.db", + "step_range": {"id": 2, "startNs": 12345, "endNs": 12443]} + eg2. {"RANK_ID": 1, + "profiler_db_path": "xxx/msprof_20250227145123.db", + "analysis_db_path": "xxx/analyze/communication_analyzer.db", + "step_range": {"id": 2, "startNs": 12345, "endNs": 12443]} analysis_class: hccl_sum, compute_op_sum, cann_api_sum, mstx_sum…… """ pass diff --git a/profiler/msprof_analyze/prof_common/constant.py b/profiler/msprof_analyze/prof_common/constant.py index f34aeade895..c04e429321d 100644 --- a/profiler/msprof_analyze/prof_common/constant.py +++ b/profiler/msprof_analyze/prof_common/constant.py @@ -423,6 +423,7 @@ class Constant(object): CONCURRENT_MODE = "concurrent" PROFILER_DB_PATH = "profiler_db_path" + ANALYSIS_DB_PATH = "analysis_db_path" RANK_LIST = "rank_list" EXPORT_TYPE = "export_type" EXTRA_ARGS = "args" diff --git a/profiler/msprof_analyze/prof_exports/base_stats_export.py b/profiler/msprof_analyze/prof_exports/base_stats_export.py index 65ccd69ecde..6e0ff5e211e 100644 --- a/profiler/msprof_analyze/prof_exports/base_stats_export.py +++ b/profiler/msprof_analyze/prof_exports/base_stats_export.py @@ -35,6 +35,9 @@ class BaseStatsExport: def read_export_db(self): try: + if not self._db_path: + logger.error("db path is None.") + return None query = self.get_query() if query is None: logger.error("query is None.") -- Gitee From d1cf36f94b72be63d241cdb0e9afeddbf011c3b2 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Thu, 27 Feb 2025 17:48:01 +0800 Subject: [PATCH 24/25] fix dyno param --- dynolog_npu/README.md | 121 ++++++++++++++---- .../dynolog_npu/cli/src/commands/nputrace.rs | 5 + dynolog_npu/dynolog_npu/cli/src/main.rs | 5 + 3 files changed, 103 insertions(+), 28 deletions(-) diff --git a/dynolog_npu/README.md b/dynolog_npu/README.md index 9cc015e66c6..d6ebd6f7ff0 100644 --- a/dynolog_npu/README.md +++ b/dynolog_npu/README.md @@ -85,32 +85,67 @@ nputrace子命令支持的参数选项 | 子命令 | 参数类型 | 说明 | |-------|-------|-------| -| record_shapes | action | 是否采集算子的InputShapes和InputTypes,设置参数采集,默认不采集 | -| profile_memory | action | 是否采集算子内存信息,设置参数采集,默认不采集 | -| with_stack | action | 是否采集Python调用栈,设置参数采集,默认不采集 | -| with_flops | action | 是否采集算子flops,设置参数采集,默认不采集 | -| with_modules | action | 是否采集modules层级的Python调用栈,设置参数采集,默认不采集 | +| job-id | u64 | 采集任务的job id,默认值0,dynolog原生参数 | +| pids | String | 采集任务的pid列表,多个pid用逗号分隔,默认值0,dynolog原生参数 | +| process-limit | u64 | 最大采集进程的数量,默认值3,dynolog原生参数 | +| profile-start-time | u64 | 用于同步采集的Unix时间戳,单位毫秒,默认值0,dynolog原生参数 | +| duration-ms | u64 | 采集的周期,单位毫秒,默认值500,dynolog原生参数 | +| iterations | i64 | 采集总迭代数,默认值-1,dynolog原生参数 | +| log-file | String | 采集落盘的路径,必选值 | +| start-step | u64 | 开始采集的迭代数,默认值0 | +| record-shapes | action | 是否采集算子的InputShapes和InputTypes,设置参数采集,默认不采集 | +| profile-memory | action | 是否采集算子内存信息,设置参数采集,默认不采集 | +| with-stack | action | 是否采集Python调用栈,设置参数采集,默认不采集 | +| with-flops | action | 是否采集算子flops,设置参数采集,默认不采集 | +| with-modules | action | 是否采集modules层级的Python调用栈,设置参数采集,默认不采集 | | analyse | action | 采集后是否自动解析,设置参数解析,默认不解析 | -| l2_cache | action | 是否采集L2 Cache数据,设置参数采集,默认不采集 | -| op_attr | action | 是否采集算子属性信息,设置参数采集,默认不采集 | -| data_simplification | String | 解析完成后是否数据精简,可选值范围[`true`, `false`],默认值`true` | +| l2-cache | action | 是否采集L2 Cache数据,设置参数采集,默认不采集 | +| op-attr | action | 是否采集算子属性信息,设置参数采集,默认不采集 | +| msprof-tx | action | 是否使能MSTX,设置参数采集,默认使能 | +| data-simplification | String | 解析完成后是否数据精简,可选值范围[`true`, `false`],默认值`true` | | activities | String | 控制CPU、NPU事件采集范围,可选值范围[`CPU,NPU`, `NPU,CPU`, `CPU`, `NPU`],默认值`CPU,NPU` | -| profiler_level | String | 控制profiler的采集等级,可选值范围[`Level_none`, `Level0`, `Level1`, `Level2`],默认值`Level0`| -| aic_metrics | String | AI Core的性能指标采集项,可选值范围[`AiCoreNone`, `PipeUtilization`, `ArithmeticUtilization`, `Memory`, `MemoryL0`, `ResourceConflictRatio`, `MemoryUB`, `L2Cache`, `MemoryAccess`],默认值`AiCoreNone`| -| export_type | String | profiler解析导出数据的类型,可选值范围[`Text`, `Db`],默认值`Text`| -| gc_detect_threshold | Option | GC检测阈值,单位ms,只采集超过阈值的GC事件。该参数为可选参数,默认不设置时不开启GC检测 | +| profiler-level | String | 控制profiler的采集等级,可选值范围[`Level_none`, `Level0`, `Level1`, `Level2`],默认值`Level0`| +| aic-metrics | String | AI Core的性能指标采集项,可选值范围[`AiCoreNone`, `PipeUtilization`, `ArithmeticUtilization`, `Memory`, `MemoryL0`, `ResourceConflictRatio`, `MemoryUB`, `L2Cache`, `MemoryAccess`],默认值`AiCoreNone`| +| export-type | String | profiler解析导出数据的类型,可选值范围[`Text`, `Db`],默认值`Text`| +| gc-detect-threshold | Option | GC检测阈值,单位ms,只采集超过阈值的GC事件。该参数为可选参数,默认不设置时不开启GC检测 | -- nputrace示例命令 +- nputrace使用方法 + +Step1: 拉起dynolog daemon进程 +```bash +# 方法1:使用systemd拉起service +# 修改配置文件/etc/dynolog.gflags, 使能ipc_monitor +echo "--enable_ipc_monitor" | sudo tee -a /etc/dynolog.gflags +sudo systemctl start dynolog + +# 方法2:命令行执行 +dynolog --enable-ipc-monitor + +#dynolog daemon的日志路径为:/var/log/dynolog.log +``` + +Step 2:使能dynolog trace dump环境变量 +```bash +export KINETO_USE_DAEMON=1 +``` + +Step 3: 拉起训练任务 +```bash +# 训练任务中需要使用pytorch的优化器/继承原生优化器 +bash train.sh +``` + +Step 4:使用dyno CLI动态触发trace dump ```bash -# 示例1:采集框架、CANN和device数据,同时采集完后自动解析以及解析完成不做数据精简,落盘路径为/tmp/profile_data -dyno nputrace --activities CPU,NPU --analyse --data_simplification false --log-file /tmp/profile_data +# 示例1:从第10个step开始采集,采集2个step,采集框架、CANN和device数据,同时采集完后自动解析以及解析完成不做数据精简,落盘路径为/tmp/profile_data +dyno nputrace --start-step 10 --iterations 2 --activities CPU,NPU --analyse --data-simplification false --log-file /tmp/profile_data -# 示例2:只采集CANN和device数据,同时采集完后自动解析以及解析完成后开启数据精简,落盘路径为/tmp/profile_data -dyno nputrace --activities NPU --analyse --data_simplification true --log-file /tmp/profile_data +# 示例2:从第10个step开始采集,采集2个step,只采集CANN和device数据,同时采集完后自动解析以及解析完成后开启数据精简,落盘路径为/tmp/profile_data +dyno nputrace --start-step 10 --iterations 2 --activities NPU --analyse --data-simplification true --log-file /tmp/profile_data -# 示例3:只采集CANN和device数据,只采集不解析,落盘路径为/tmp/profile_data -dyno nputrace --activities NPU --log-file /tmp/profile_data +# 示例3:从第10个step开始采集,采集2个step,只采集CANN和device数据,只采集不解析,落盘路径为/tmp/profile_data +dyno nputrace --start-step 10 --iterations 2 --activities NPU --log-file /tmp/profile_data ``` ### NPU Monitor功能 @@ -129,20 +164,50 @@ dyno npu-monitor [SUBCOMMANDS] npu-monitor子命令支持的参数选项 | 子命令 | 参数类型 | 说明 | |-------|-------|-------| -| npu_monitor_start | action | 开启性能监控,设置参数开启,默认不采集 | -| npu_monitor_stop | action | 停止性能监控,设置参数开启,默认不采集 | -| report_interval_s | int | 性能监控数据上报周期,单位s,需要在启动时设置。默认值60 | -| mspti_activity_kind | String | 性能监控数据上报数据类型,可以设置单个或多个,多个类型以逗号分隔,需要在启动时设置。可选值范围[`Marker`, `Kernel`, `API`, `Hccl`, `Memory`, `MemSet`, `MemCpy`] , 默认值`Marker`| +| npu-monitor-start | action | 开启性能监控,设置参数开启,默认不采集 | +| npu-monitor-stop | action | 停止性能监控,设置参数开启,默认不采集 | +| report-interval-s | int | 性能监控数据上报周期,单位s,需要在启动时设置。默认值60 | +| mspti-activity-kind | String | 性能监控数据上报数据类型,可以设置单个或多个,多个类型以逗号分隔,需要在启动时设置。可选值范围[`Marker`, `Kernel`, `API`, `Hccl`, `Memory`, `MemSet`, `MemCpy`] , 默认值`Marker`| -- npu-monitor示例命令 +- npu-monitor使用方法 +Step1: 拉起dynolog daemon进程 +```bash +# 方法1:使用systemd拉起service +# 修改配置文件/etc/dynolog.gflags, 使能ipc_monitor +echo "--enable_ipc_monitor" | sudo tee -a /etc/dynolog.gflags +sudo systemctl start dynolog + +# 方法2:命令行执行 +dynolog --enable-ipc-monitor + +#dynolog daemon的日志路径为:/var/log/dynolog.log +``` + +Step 2:使能dynolog trace dump环境变量 +```bash +export KINETO_USE_DAEMON=1 +``` + +Step 3: 拉起训练任务 +```bash +# 训练任务中需要使用pytorch的优化器/继承原生优化器 +bash train.sh +``` + +Step 4:使用dyno CLI使能npu-monitor ```bash # 示例1:开启性能监控,使用默认配置 -dyno npu-monitor --npu_monitor_start +dyno npu-monitor --npu-monitor-start # 示例2:暂停性能监控 -dyno npu-monitor --npu_monitor_stop +dyno npu-monitor --npu-monitor-stop + +# 示例3:性能监控过程中修改配置 +# 上报周期30s, 上报数据类型Marker和Kernel +dyno npu-monitor --report-interval-s 30 --mspti-activity-kind Marker,Kernel -# 示例3:开启性能监控,上报周期30s, 上报数据类型Marker和Kernel -dyno npu-monitor --npu_monitor_start 30 --mspti_activity_kind Marker,Kernel +# 示例4:性能监控开启时修改配置 +# 上报周期30s, 上报数据类型Marker和Kernel +dyno npu-monitor --npu-monitor-start --report-interval-s 30 --mspti-activity-kind Marker,Kernel ``` \ No newline at end of file diff --git a/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs b/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs index 4bf7132de33..f70923bca4c 100644 --- a/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs +++ b/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs @@ -55,6 +55,7 @@ pub struct NpuTraceOptions { pub aic_metrics: String, pub l2_cache: bool, pub op_attr: bool, + pub msprof_tx: bool, pub gc_detect_threshold: Option, pub data_simplification: String, pub export_type: String, @@ -75,6 +76,7 @@ PROFILE_PROFILER_LEVEL={} PROFILE_AIC_METRICS={} PROFILE_L2_CACHE={} PROFILE_OP_ATTR={} +PROFILE_MSPROF_TX={} PROFILE_GC_DETECT_THRESHOLD={} PROFILE_DATA_SIMPLIFICATION={} PROFILE_EXPORT_TYPE={}"#, @@ -89,6 +91,7 @@ PROFILE_EXPORT_TYPE={}"#, self.aic_metrics, self.l2_cache, self.op_attr, + self.msprof_tx, self.gc_detect_threshold.map_or("None".to_string(), |v| v.to_string()), self.data_simplification, self.export_type @@ -213,6 +216,7 @@ ACTIVITIES_ITERATIONS=1000"# aic_metrics: "AiCoreNone".to_string(), l2_cache: true, op_attr: true, + msprof_tx: true, gc_detect_threshold: 0.1, data_simplification: "true", export_type: "Text".to_string(), @@ -234,6 +238,7 @@ PROFILE_PROFILER_LEVEL=Level0 PROFILE_AIC_METRICS=AiCoreNone PROFILE_L2_CACHE=true PROFILE_OP_ATTR=true +PROFILE_MSPROF_TX=true PROFILE_GC_DETECT_THRESHOLD=0.1 PROFILE_DATA_SIMPLIFICATION=true PROFILE_EXPORT_TYPE=Text"# diff --git a/dynolog_npu/dynolog_npu/cli/src/main.rs b/dynolog_npu/dynolog_npu/cli/src/main.rs index 8bc4a2af0e2..9fdea3d1254 100644 --- a/dynolog_npu/dynolog_npu/cli/src/main.rs +++ b/dynolog_npu/dynolog_npu/cli/src/main.rs @@ -172,6 +172,9 @@ enum Command { /// Whether to collect op attributes. #[clap(long, action)] op_attr: bool, + /// Whether to enable MSTX. + #[clap(long, action)] + msprof_tx: bool, /// GC detect threshold. #[clap(long)] gc_detect_threshold: Option, @@ -290,6 +293,7 @@ fn main() -> Result<()> { aic_metrics, l2_cache, op_attr, + msprof_tx, gc_detect_threshold, data_simplification, export_type, @@ -318,6 +322,7 @@ fn main() -> Result<()> { aic_metrics, l2_cache, op_attr, + msprof_tx, gc_detect_threshold, data_simplification, export_type, -- Gitee From f54ae7594d04d3a42af973ec69fc8e793a2fe566 Mon Sep 17 00:00:00 2001 From: yangxinxian <947098055@qq.com> Date: Mon, 3 Mar 2025 10:15:32 +0800 Subject: [PATCH 25/25] Update PULL_REQUEST_TEMPLATE.zh-CN.md --- .gitee/PULL_REQUEST_TEMPLATE.zh-CN.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitee/PULL_REQUEST_TEMPLATE.zh-CN.md b/.gitee/PULL_REQUEST_TEMPLATE.zh-CN.md index fc9e09f3503..e9cc1deb82f 100644 --- a/.gitee/PULL_REQUEST_TEMPLATE.zh-CN.md +++ b/.gitee/PULL_REQUEST_TEMPLATE.zh-CN.md @@ -17,6 +17,11 @@ --- +## 3. 分支合并要求 +- [ ] **代码合并**(请确保将 master 分支的最新代码同步合并至 poc 分支及 pre-research 分支,同时保证 poc 分支的代码也已正确合并到 pre-research 分支。) + +--- + ## 3. 代码检视 - **要求:** - 合入代码超过 200 行,需三人以上会议检视。 -- Gitee