From 7b5764f6aa42141b67b6a7b7cf938bff0ce48512 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Mon, 7 Aug 2023 09:19:52 +0800 Subject: [PATCH 01/17] compare show detailed result --- .../api_accuracy_checker/compare/compare.py | 79 ++++++++++++------- 1 file changed, 49 insertions(+), 30 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index ed3c50a0cd3..c3b86b07ee3 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -1,12 +1,15 @@ # 进行比对及结果展示 -import os +import os from prettytable import PrettyTable -from api_accuracy_checker.compare.algorithm import compare_core, cosine_sim, cosine_standard +from api_accuracy_checker.compare.algorithm import compare_core, cosine_sim, cosine_standard, get_max_rel_err, \ + compare_builtin_type from api_accuracy_checker.common.utils import get_json_contents, print_error_log, print_info_log, write_csv from api_accuracy_checker.compare.compare_utils import CompareConst + class Comparator: TEST_FILE_NAME = "pretest_result.csv" + DETAIL_TEST_FILE_NAME = "pretest_details.csv" # consts for result csv COLUMN_API_NAME = "API name" COLUMN_FORWARD_SUCCESS = "Forward Test Success" @@ -15,13 +18,16 @@ class Comparator: def __init__(self, result_save_path, stack_info_json_path=None): self.save_path = os.path.join(result_save_path, self.TEST_FILE_NAME) + self.detail_save_path = os.path.join(result_save_path, self.TEST_FILE_NAME) if stack_info_json_path: self.stack_info = get_json_contents(stack_info_json_path) else: - self.stack_info = None + self.stack_info = None self.compare_alg = {} - self.compare_alg_names = [] - self.register_compare_algorithm("Cosine Similarity", cosine_sim, cosine_standard) + self.compare_alg_names = [] + self.register_compare_algorithm("Cosine Similarity", cosine_sim, cosine_standard) + self.register_compare_algorithm("Max Relative Error", get_max_rel_err, None) + self.register_compare_algorithm("Default: Isequal", compare_builtin_type, None) self.test_results = [] self.test_result_cnt = {"forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, "success_num": 0} @@ -32,7 +38,7 @@ class Comparator: "backward_not_pass": self.test_result_cnt['backward_fail_num'], "forward_and_backward_not_pass": self.test_result_cnt['forward_and_backward_fail_num'], "pass": self.test_result_cnt['success_num'] - } + } tb = PrettyTable() tb.add_column("Category", list(res_dict.keys())) tb.add_column("statistics", list(res_dict.values())) @@ -40,14 +46,15 @@ class Comparator: print_info_log(info_tb) def write_compare_csv(self): - self.write_summary_csv() + self.write_summary_csv() + self.write_detail_csv() def write_summary_csv(self): test_rows = [[self.COLUMN_API_NAME, self.COLUMN_FORWARD_SUCCESS, self.COLUMN_BACKWARD_SUCCESS]] if self.stack_info: test_rows[0].append(self.COLUMN_STACK_INFO) for result in self.test_results: - name = result[0] + name = result[0] df_row = list(result[:3]) if self.stack_info: stack_info = "\n".join(self.stack_info[name]) @@ -55,6 +62,27 @@ class Comparator: test_rows.append(df_row) write_csv(test_rows, self.save_path) + def write_detail_csv(self): + test_rows = [["Subject", "Cosine Similarity", "Cosine Similarity Pass", "Cosine Similarity Message", + "Max Rel Err Similarity", "Max Rel Err Pass", "Max Rel Err Message", + "Default Isequal", "Default Isequal Pass", + "Default Isequal Message"]] # "Max Absolute Error", "Max Relative Error" + for test_result in self.test_results: + subject_prefix = test_result[0] + fwd_result = test_result[3] + bwd_result = test_result[4] + print(fwd_result, bwd_result) + if isinstance(fwd_result, list): + for i, test_subject in enumerate(fwd_result): + subject = subject_prefix + ".forward.output" + str(i) + test_rows.append([subject] + list(test_subject)) + if isinstance(bwd_result, list): + for i, test_subject in enumerate(bwd_result): + subject = subject_prefix + ".backward.output" + str(i) + test_rows.append([subject] + list(test_subject)) + + write_csv(test_rows, self.detail_save_path) + def record_results(self, *args): self.test_results.append(args) @@ -63,17 +91,11 @@ class Comparator: self.compare_alg_names.append(name) def compare_output(self, api_name, bench_out, npu_out, bench_grad=None, npu_grad=None): - if "dropout" in api_name: - is_fwd_success, fwd_compare_alg_results = self._compare_dropout(bench_out, npu_out) - else: - is_fwd_success, fwd_compare_alg_results = self._compare_core_wrapper(bench_out, npu_out) + is_fwd_success, fwd_compare_alg_results = self._compare_core_wrapper(bench_out, npu_out) if bench_grad and npu_grad: - if "dropout" in api_name: - is_bwd_success, bwd_compare_alg_results = self._compare_dropout(bench_grad[0], npu_grad[0]) - else: - is_bwd_success, bwd_compare_alg_results = self._compare_core_wrapper(bench_grad, npu_grad) + is_bwd_success, bwd_compare_alg_results = self._compare_core_wrapper(bench_grad, npu_grad) else: - is_bwd_success, bwd_compare_alg_results = CompareConst.NA, None + is_bwd_success, bwd_compare_alg_results = CompareConst.NA, None self.record_results(api_name, is_fwd_success, is_bwd_success, fwd_compare_alg_results, bwd_compare_alg_results) if is_fwd_success and is_bwd_success: self.test_result_cnt['success_num'] += 1 @@ -85,17 +107,14 @@ class Comparator: self.test_result_cnt['backward_fail_num'] += 1 def _compare_core_wrapper(self, bench_out, npu_out): - name = self.compare_alg_names[0] - detailed_result, test_success = compare_core(bench_out, npu_out, self.compare_alg[name][0]) - return test_success, detailed_result - - @staticmethod - def _compare_dropout(bench_out, npu_out): - tensor_num = bench_out.numel() - if tensor_num >= 100: - if abs((bench_out == 0).sum() - (npu_out == 0).cpu().sum()) / tensor_num < 0.1: - return True, 1 + detailed_result_total = [] + test_success_total = True + for name in self.compare_alg_names: + alg = self.compare_alg[name][0] + detailed_result, test_success = compare_core(bench_out, npu_out, alg) + if detailed_result_total: + for i in range(len(detailed_result_total)): + detailed_result_total[i] += detailed_result[i] else: - return False, 0 - else: - return True, 1 + detailed_result_total = detailed_result + return test_success_total, detailed_result_total -- Gitee From f5b41120c496481ce5cde819d418884baf076ca3 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Mon, 7 Aug 2023 15:04:26 +0800 Subject: [PATCH 02/17] compare algorithm --- .../api_accuracy_checker/compare/algorithm.py | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 7dcb1ff74e5..c237d218ab1 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -7,6 +7,8 @@ from api_accuracy_checker.common.utils import print_warn_log, Const def compare_torch_tensor(cpu_output, npu_output, compare_alg): if cpu_output.dtype == torch.bool: + if npu_output.dtype != torch.bool: + return CompareConst.NAN, False, f"Bench out dtype is torch.bool but npu output dtype is {npu_output.dtype}, cannot compare." return compare_bool_tensor(cpu_output, npu_output) return compare_alg(cpu_output, npu_output) @@ -16,39 +18,46 @@ def compare_bool_tensor(cpu_output, npu_output): cpu_shape = cpu_output.shape npu_shape = npu_output.shape if cpu_shape != npu_shape: - return error_rate, False + return error_rate, False, "" npu_data = npu_output.cpu().detach().numpy() bench_data = cpu_output.detach().numpy() data_size = bench_data.size error_nums = (bench_data != npu_data).sum() error_rate = float(error_nums / data_size) - return error_rate, error_rate < 0.001 + return error_rate, error_rate < 0.001, "" def get_max_rel_err(n_value, b_value): + msg = "" if not isinstance(n_value, np.ndarray) or not isinstance(b_value, np.ndarray): - print_warn_log("Max rel err only support numpy array!") - raise ValueError("Max rel err only support numpy array!") + msg = f"Max rel err only support numpy array! The actual type is {type(n_value)}, {type(b_value)}." + return CompareConst.NAN, False, msg + if n_value.shape != b_value.shape: + msg = f"Shape of npu and bench outputs don't match. NPU: {n_value.shape}, bench: {b_value.shape}." + return CompareConst.NAN, False, msg if n_value.dtype != b_value.dtype: - return CompareConst.NA, False - if n_value.dtype in Const.FLOAT_TYPE: - rel_err = np.abs((n_value - b_value) / (b_value + np.finfo(b_value.dtype).eps)).max() - return rel_err, rel_err < 0.001 - if np.all(n_value == b_value): - return 0, True - return 1, False + msg = f"Dtype of npu and bench outputs don't match. NPU: {n_value.dtype}, bench: {b_value.dtype}." + rel_err = np.abs((n_value - b_value) / (b_value + np.finfo(b_value.dtype).eps)).max() + bool_result = rel_err < 0.001 + + return reL_err, bool_result, msg + +def max_rel_err_standard(max_rel_errs): + bool_result = np.array(max_rel_errs) < 0.001 + return np.all(bool_result), bool_result def cosine_standard(compare_result): bool_result = np.array(compare_result) > 0.99 return np.all(bool_result), bool_result - def cosine_sim(cpu_output, npu_output): - n_value = npu_output.cpu().detach().numpy().reshape(-1) - b_value = cpu_output.detach().numpy().reshape(-1) + msg = "" + n_value = npu_output.reshape(-1) + b_value = cpu_output.reshape(-1) cos = CompareConst.NA np.seterr(divide="ignore", invalid="ignore") + if len(n_value) == 1: print_warn_log("All the data in npu dump data is scalar. Compare by relative error.") return get_max_rel_err(n_value, b_value) -- Gitee From feca79474c771bb4239aaded7b90485375ce3e4e Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Mon, 7 Aug 2023 16:06:20 +0800 Subject: [PATCH 03/17] fix typo --- .../api_accuracy_checker/compare/algorithm.py | 69 ++++++++++--------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 634a1c2dadb..629dadc082d 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -6,9 +6,9 @@ from api_accuracy_checker.compare.compare_utils import CompareConst from api_accuracy_checker.common.utils import print_warn_log, Const def compare_torch_tensor(cpu_output, npu_output, compare_alg): - if cpu_output.dtype == torch.bool: - if npu_output.dtype != torch.bool: - return CompareConst.NAN, False, f"Bench out dtype is torch.bool but npu output dtype is {npu_output.dtype}, cannot compare." + if cpu_output.dtype == torch.bool or cpu_output.dtype == torch.uint8: + if npu_output.dtype != cpu_output.dtype: + return CompareConst.NAN, False, f"Bench out dtype is {cpu_output.dtype} but npu output dtype is {npu_output.dtype}, cannot compare." return compare_bool_tensor(cpu_output, npu_output) return compare_alg(cpu_output, npu_output) @@ -57,29 +57,29 @@ def cosine_sim(cpu_output, npu_output): b_value = cpu_output.reshape(-1) cos = CompareConst.NA np.seterr(divide="ignore", invalid="ignore") - + if n_value.shape != b_value.shape: + msg = f"Shape of npu and bench outputs don't match. NPU: {n_value.shape}, bench: {b_value.shape}." + return -1, False, msg if len(n_value) == 1: - print_warn_log("All the data in npu dump data is scalar. Compare by relative error.") - return get_max_rel_err(n_value, b_value) - if n_value.dtype == np.uint8: - return compare_uint8_data(n_value, b_value) - n_value = n_value / (np.max(np.abs(n_value)) + np.finfo(n_value.dtype).eps) - b_value = b_value / (np.max(np.abs(b_value)) + np.finfo(b_value.dtype).eps) - num = n_value.dot(b_value) - a_norm = np.linalg.norm(n_value) - b_norm = np.linalg.norm(b_value) - if a_norm <= np.finfo(float).eps and b_norm <= np.finfo(float).eps: - return cos, True - elif a_norm <= np.finfo(float).eps: - print_warn_log("All the data is Zero in npu dump data. Compare by relative error.") - return get_max_rel_err(n_value, b_value) - elif b_norm <= np.finfo(float).eps: - print_warn_log("All the data is Zero in bench dump data. Compare by relative error.") + msg = "All the data in npu dump data is scalar. Please refer to other compare algorithms." + return cos, True, msg + n_value_max = np.max(np.abs(n_value)) + b_value_max = np.max(np.abs(b_value)) + if n_value_max <= np.finfo(float).eps and b_value_max <= np.finfo(float).eps: + return cos, True, msg + elif n_value_max <= np.finfo(float).eps: + msg = "All the data is zero in npu dump data." + return CompareConst.NAN, False, msg + elif b_value_max <= np.finfo(float).eps: + msg = "All the data is zero in bench dump data." + return CompareConst.NAN, False, msg else: - cos = num / (a_norm * b_norm) + n_value /= n_value_max + b_value /= b_value_max + cos = np.dot(n_value, b_value) / (np.linalg.norm(n_value) * np.linalg.norm(b_value)) if np.isnan(cos): - print_warn_log("Dump data has NaN when comparing with Cosine Similarity.") - return cos, cos > 0.99 + msg = "Dump data has NaN when comparing with Cosine Similarity." + return cos, cos > 0.99, msg def compare_uint8_data(n_value, b_value): @@ -90,9 +90,11 @@ def compare_uint8_data(n_value, b_value): def compare_builtin_type(bench_out, npu_out): + if not isinstance(bench_out, (bool, int, float, str)): + return CompareConst.NA, True, f"The data is not builtin type: {type(bench_out)}" if bench_out != npu_out: - return CompareConst.NAN, False - return 1.0, True + return CompareConst.NAN, False, "" + return True, True, "" def flatten_compare_result(result): @@ -106,12 +108,13 @@ def flatten_compare_result(result): def compare_core(bench_out, npu_out, alg): + msg = "" if type(bench_out) != type(npu_out): - raise ValueError("bench and npu output type is different") + compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output type is different." if isinstance(bench_out, (list, tuple)): compare_result, test_success = [], True if len(bench_out) != len(npu_out): - raise ValueError("bench and npu output structure is different") + compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output structure is different" for b_out_i, n_out_i in zip(bench_out, npu_out): compare_result_i, test_success_i = compare_core(b_out_i, n_out_i, alg) compare_result.append(compare_result_i) @@ -119,18 +122,20 @@ def compare_core(bench_out, npu_out, alg): elif isinstance(bench_out, dict): b_keys, n_keys = set(bench_out.keys()), set(npu_out.keys()) if b_keys != n_keys: - raise ValueError("bench and npu output dictionary keys are different") + compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output dictionary keys are different" compare_result, test_success = compare_core(list(bench_out.values()), list(npu_out.values())) elif isinstance(bench_out, torch.Tensor): - compare_result, test_success = compare_torch_tensor(bench_out, npu_out, alg) + compare_result, test_success, msg = compare_torch_tensor(bench_out, npu_out, alg) elif isinstance(bench_out, (bool, int, float, str)): - compare_result, test_success = compare_builtin_type(bench_out, npu_out) + compare_result, test_success, msg = compare_builtin_type(bench_out, npu_out) elif bench_out is None: - return 1.0, True + compare_result, test_success, msg = CompareConst.NA, True, "output is None" else: - raise NotImplementedError("Unexpected output type in compare_core: {}".format(type(bench_out))) + compare_result, test_success, msg = CompareConst.NA, True, "Unexpected output type in compare_core: {}".format(type(bench_out)) if isinstance(compare_result, list): compare_result = flatten_compare_result(compare_result) + else: + compare_result = [(compare_result, test_success, msg)] return compare_result, test_success -- Gitee From 78f0c4dfa62075eb3e555086c80a0eee506cc3fc Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Mon, 7 Aug 2023 16:11:50 +0800 Subject: [PATCH 04/17] fix codecheck --- .../api_accuracy_checker/compare/algorithm.py | 11 +++++++---- .../api_accuracy_checker/compare/compare.py | 6 +++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 629dadc082d..3744cd746fb 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -3,7 +3,6 @@ import torch import numpy as np from api_accuracy_checker.compare.compare_utils import CompareConst -from api_accuracy_checker.common.utils import print_warn_log, Const def compare_torch_tensor(cpu_output, npu_output, compare_alg): if cpu_output.dtype == torch.bool or cpu_output.dtype == torch.uint8: @@ -41,16 +40,19 @@ def get_max_rel_err(n_value, b_value): rel_err = np.abs((n_value - b_value) / (b_value + np.finfo(b_value.dtype).eps)).max() bool_result = rel_err < 0.001 - return reL_err, bool_result, msg + return rel_err, bool_result, msg + def max_rel_err_standard(max_rel_errs): bool_result = np.array(max_rel_errs) < 0.001 return np.all(bool_result), bool_result + def cosine_standard(compare_result): bool_result = np.array(compare_result) > 0.99 return np.all(bool_result), bool_result + def cosine_sim(cpu_output, npu_output): msg = "" n_value = npu_output.reshape(-1) @@ -122,7 +124,7 @@ def compare_core(bench_out, npu_out, alg): elif isinstance(bench_out, dict): b_keys, n_keys = set(bench_out.keys()), set(npu_out.keys()) if b_keys != n_keys: - compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output dictionary keys are different" + compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output dict keys are different" compare_result, test_success = compare_core(list(bench_out.values()), list(npu_out.values())) elif isinstance(bench_out, torch.Tensor): compare_result, test_success, msg = compare_torch_tensor(bench_out, npu_out, alg) @@ -131,7 +133,8 @@ def compare_core(bench_out, npu_out, alg): elif bench_out is None: compare_result, test_success, msg = CompareConst.NA, True, "output is None" else: - compare_result, test_success, msg = CompareConst.NA, True, "Unexpected output type in compare_core: {}".format(type(bench_out)) + compare_result, test_success, msg = CompareConst.NA, True, "Unexpected output type \ + in compare_core: {}".format(type(bench_out)) if isinstance(compare_result, list): compare_result = flatten_compare_result(compare_result) else: diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index c3b86b07ee3..2c75e685684 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -27,7 +27,7 @@ class Comparator: self.compare_alg_names = [] self.register_compare_algorithm("Cosine Similarity", cosine_sim, cosine_standard) self.register_compare_algorithm("Max Relative Error", get_max_rel_err, None) - self.register_compare_algorithm("Default: Isequal", compare_builtin_type, None) + self.register_compare_algorithm("Default: isEqual", compare_builtin_type, None) self.test_results = [] self.test_result_cnt = {"forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, "success_num": 0} @@ -65,8 +65,8 @@ class Comparator: def write_detail_csv(self): test_rows = [["Subject", "Cosine Similarity", "Cosine Similarity Pass", "Cosine Similarity Message", "Max Rel Err Similarity", "Max Rel Err Pass", "Max Rel Err Message", - "Default Isequal", "Default Isequal Pass", - "Default Isequal Message"]] # "Max Absolute Error", "Max Relative Error" + "Default isEqual", "Default isEqual Pass", + "Default isEqual Message"]] # "Max Absolute Error", "Max Relative Error" for test_result in self.test_results: subject_prefix = test_result[0] fwd_result = test_result[3] -- Gitee From e3eb649144aa57c624678d6aaba399d8f126efe8 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Mon, 7 Aug 2023 16:29:37 +0800 Subject: [PATCH 05/17] fix typo --- debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py | 2 +- debug/accuracy_tools/api_accuracy_checker/compare/compare.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 3744cd746fb..e2875b05804 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -9,7 +9,7 @@ def compare_torch_tensor(cpu_output, npu_output, compare_alg): if npu_output.dtype != cpu_output.dtype: return CompareConst.NAN, False, f"Bench out dtype is {cpu_output.dtype} but npu output dtype is {npu_output.dtype}, cannot compare." return compare_bool_tensor(cpu_output, npu_output) - return compare_alg(cpu_output, npu_output) + return compare_alg(cpu_output.detach().numpy(), npu_output.detach().numpy()) def compare_bool_tensor(cpu_output, npu_output): diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index 2c75e685684..4506c454cd1 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -18,7 +18,7 @@ class Comparator: def __init__(self, result_save_path, stack_info_json_path=None): self.save_path = os.path.join(result_save_path, self.TEST_FILE_NAME) - self.detail_save_path = os.path.join(result_save_path, self.TEST_FILE_NAME) + self.detail_save_path = os.path.join(result_save_path, self.DETAIL_TEST_FILE_NAME) if stack_info_json_path: self.stack_info = get_json_contents(stack_info_json_path) else: @@ -112,6 +112,7 @@ class Comparator: for name in self.compare_alg_names: alg = self.compare_alg[name][0] detailed_result, test_success = compare_core(bench_out, npu_out, alg) + test_success_total = test_success_total and test_success if detailed_result_total: for i in range(len(detailed_result_total)): detailed_result_total[i] += detailed_result[i] -- Gitee From d55612f4b33fbe507adc44adb24cf1f5e0023c35 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 10:52:11 +0800 Subject: [PATCH 06/17] =?UTF-8?q?readme=20=E6=9B=B4=E6=96=B0=EF=BC=8Ccompa?= =?UTF-8?q?re=20=E8=AF=A6=E7=BB=86=E7=BB=93=E6=9E=9C=E5=B1=95=E7=A4=BAbug?= =?UTF-8?q?=20fix=EF=BC=8Capi=20info=E9=BB=98=E8=AE=A4=E5=AD=98=E7=9B=98?= =?UTF-8?q?=E8=B7=AF=E5=BE=84=E6=94=B9=E4=B8=BA./?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...77\347\224\250\346\226\271\346\263\225.md" | 20 ++++++++++++++++--- .../api_accuracy_checker/compare/algorithm.py | 18 ++++++++++++++--- .../api_accuracy_checker/compare/compare.py | 2 +- .../api_accuracy_checker/config.yaml | 2 +- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git "a/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" "b/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" index 486a319b6b4..0d6a67a4a2e 100644 --- "a/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" +++ "b/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" @@ -1,8 +1,15 @@ # Ascend模型精度预检工具 +模型精度预检工具会提取模型中所有的API前反向的信息,构造相应的API单元测试,将NPU输出与标杆比对,从而检测出精度有问题的API。 + +## 工具优势 +1. 落盘数据小 +2. 不依赖标杆侧GPU训练资源,本地即可完成预检 +3. 支持随机生成模式和真实数据模式 +4. 单API测试,排除整网中的累计误差问题 ## 使用方式 -1. 安装遇见工具 +1. 安装预检工具 将att仓代码下载到本地,并配置环境变量。假设att仓本地路径为 {att_root},环境变量应配置为 @@ -17,7 +24,13 @@ set_dump_switch("ON") ``` -​ dump信息默认会存盘到./api_info/路径下,后缀的数字代表进程pid +​ dump信息默认会存盘到./路径下,包括前向API信息forward_info_{pid}.json, 反向API信息backward_info_{pid}.json, 调用栈信息stack_info_{pid}.json。真实数据模式下还有forward_real_data和backward_real_data文件夹,里面有每个api输入的具体数值。forward_info与stack_info中的key值一一对应,用户可根据forward_info中API的key在stack_info中查询到其调用栈及代码行位置。 + + 有需要的话,用户可以通过msCheckerConfig.update_config来配置dump路径以及启用真实数据模式(默认为关)。注意启用真实数据模式会存盘较多数据,可能对磁盘空间有较大冲击。 + ``` + from api_accuracy_checker.dump import msCheckerConfig + msCheckerConfig.update_config(dump_path="my/dump/path", real_data=True) # my/dump/path需配置为用户想要的api信息存盘路径,并且需要提前创建好 + ``` 3. 将上述信息输入给run_ut模块运行精度检测并比对 @@ -26,5 +39,6 @@ python run_ut.py --forward ./api_info/forward_info_0.json --backward ./api_info/backward_info_0.json ``` - forward和backward两个命令行参数根据实际情况配置。比对结果存盘位置会打屏显示,默认是'./',可以在运行run_ut.py时通过 --out_path命令行参数配置。 + forward和backward两个命令行参数根据实际情况配置。比对结果存盘位置会打屏显示,默认是'./',可以在运行run_ut.py时通过 --out_path命令行参数配置。结果包括pretest_result.csv和pretest_details.csv两个文件。前者是api粒度的,标明每个api是否通过测试。建议用户先查看前者,对于其中没有通过测试的或者特定感兴趣的api,根据其API name字段在pretest_details.csv中查询其各个输出的达标情况。 + diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index e2875b05804..3e4f3349f1f 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -37,7 +37,19 @@ def get_max_rel_err(n_value, b_value): if n_value.dtype != b_value.dtype: msg = f"Dtype of npu and bench outputs don't match. NPU: {n_value.dtype}, bench: {b_value.dtype}." - rel_err = np.abs((n_value - b_value) / (b_value + np.finfo(b_value.dtype).eps)).max() + if b_value.dtype in Const.FLOAT_TYPE: + zero_mask = (b_value == 0) + # 给0的地方加上eps防止除0 + b_value[zero_mask] += np.finfo(b_value.dtype).eps + # 根据b_value为0的位置给n_value也加上eps,否则两者都是0的情况下相对误差会是1 + n_value[zero_mask] += np.finfo(b_value.dtype).eps + else: + # int type + float eps 会报错,所以这里要强转 + n_value, b_value = n_value.astype(float), b_value.astype(float) + zero_mask = (b_value == 0) + b_value[zero_mask] += np.finfo(float).eps + n_value[zero_mask] += np.finfo(float).eps + rel_err = np.abs((n_value - b_value) / b_value).max() bool_result = rel_err < 0.001 return rel_err, bool_result, msg @@ -76,8 +88,8 @@ def cosine_sim(cpu_output, npu_output): msg = "All the data is zero in bench dump data." return CompareConst.NAN, False, msg else: - n_value /= n_value_max - b_value /= b_value_max + n_value = n_value_max.astype(float) / n_value_max + b_value = b_value_max.astype(float) / b_value_max cos = np.dot(n_value, b_value) / (np.linalg.norm(n_value) * np.linalg.norm(b_value)) if np.isnan(cos): msg = "Dump data has NaN when comparing with Cosine Similarity." diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index 4506c454cd1..2565cbbeed9 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -64,7 +64,7 @@ class Comparator: def write_detail_csv(self): test_rows = [["Subject", "Cosine Similarity", "Cosine Similarity Pass", "Cosine Similarity Message", - "Max Rel Err Similarity", "Max Rel Err Pass", "Max Rel Err Message", + "Max Rel Error", "Max Rel Err Pass", "Max Rel Err Message", "Default isEqual", "Default isEqual Pass", "Default isEqual Message"]] # "Max Absolute Error", "Max Relative Error" for test_result in self.test_results: diff --git a/debug/accuracy_tools/api_accuracy_checker/config.yaml b/debug/accuracy_tools/api_accuracy_checker/config.yaml index 38a1a3c47b1..7e2cd46fe24 100644 --- a/debug/accuracy_tools/api_accuracy_checker/config.yaml +++ b/debug/accuracy_tools/api_accuracy_checker/config.yaml @@ -1,4 +1,4 @@ -dump_path: './api_info' +dump_path: './' jit_compile: True compile_option: -O3 compare_algorithm: cosine_similarity -- Gitee From 4abbd29528e5c9b03a03424dc5e579ac4ea4a34b Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 11:13:13 +0800 Subject: [PATCH 07/17] remove print, convert bool to string when save csv, fix import error --- .../api_accuracy_checker/compare/algorithm.py | 7 ++++--- .../accuracy_tools/api_accuracy_checker/compare/compare.py | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 3e4f3349f1f..5531fa7d1c1 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -3,13 +3,14 @@ import torch import numpy as np from api_accuracy_checker.compare.compare_utils import CompareConst +from api_accuracy_checker.common.utils import Const def compare_torch_tensor(cpu_output, npu_output, compare_alg): if cpu_output.dtype == torch.bool or cpu_output.dtype == torch.uint8: if npu_output.dtype != cpu_output.dtype: return CompareConst.NAN, False, f"Bench out dtype is {cpu_output.dtype} but npu output dtype is {npu_output.dtype}, cannot compare." return compare_bool_tensor(cpu_output, npu_output) - return compare_alg(cpu_output.detach().numpy(), npu_output.detach().numpy()) + return compare_alg(cpu_output.detach().numpy(), npu_output.detach().cpu().numpy()) def compare_bool_tensor(cpu_output, npu_output): @@ -120,7 +121,7 @@ def flatten_compare_result(result): flatten_result.append(result_i) return flatten_result - +# 本函数 def compare_core(bench_out, npu_out, alg): msg = "" if type(bench_out) != type(npu_out): @@ -150,7 +151,7 @@ def compare_core(bench_out, npu_out, alg): if isinstance(compare_result, list): compare_result = flatten_compare_result(compare_result) else: - compare_result = [(compare_result, test_success, msg)] + compare_result = [(compare_result, str(test_success), msg)] return compare_result, test_success diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index 2565cbbeed9..3c7b264c5bf 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -71,14 +71,13 @@ class Comparator: subject_prefix = test_result[0] fwd_result = test_result[3] bwd_result = test_result[4] - print(fwd_result, bwd_result) if isinstance(fwd_result, list): for i, test_subject in enumerate(fwd_result): - subject = subject_prefix + ".forward.output" + str(i) + subject = subject_prefix + ".forward.output." + str(i) test_rows.append([subject] + list(test_subject)) if isinstance(bwd_result, list): for i, test_subject in enumerate(bwd_result): - subject = subject_prefix + ".backward.output" + str(i) + subject = subject_prefix + ".backward.output." + str(i) test_rows.append([subject] + list(test_subject)) write_csv(test_rows, self.detail_save_path) -- Gitee From 0a7ca4caab43f69379031583881888cdaf77feb6 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 11:31:27 +0800 Subject: [PATCH 08/17] fix codecheck issues --- .../api_accuracy_checker/compare/algorithm.py | 11 ++++++----- .../api_accuracy_checker/compare/compare.py | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 5531fa7d1c1..688565f42e1 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -8,7 +8,8 @@ from api_accuracy_checker.common.utils import Const def compare_torch_tensor(cpu_output, npu_output, compare_alg): if cpu_output.dtype == torch.bool or cpu_output.dtype == torch.uint8: if npu_output.dtype != cpu_output.dtype: - return CompareConst.NAN, False, f"Bench out dtype is {cpu_output.dtype} but npu output dtype is {npu_output.dtype}, cannot compare." + return CompareConst.NAN, False, f"Bench out dtype is {cpu_output.dtype} but\ + npu output dtype is {npu_output.dtype}, cannot compare." return compare_bool_tensor(cpu_output, npu_output) return compare_alg(cpu_output.detach().numpy(), npu_output.detach().cpu().numpy()) @@ -39,7 +40,7 @@ def get_max_rel_err(n_value, b_value): msg = f"Dtype of npu and bench outputs don't match. NPU: {n_value.dtype}, bench: {b_value.dtype}." if b_value.dtype in Const.FLOAT_TYPE: - zero_mask = (b_value == 0) + zero_mask = (b_value == 0) # 给0的地方加上eps防止除0 b_value[zero_mask] += np.finfo(b_value.dtype).eps # 根据b_value为0的位置给n_value也加上eps,否则两者都是0的情况下相对误差会是1 @@ -47,7 +48,7 @@ def get_max_rel_err(n_value, b_value): else: # int type + float eps 会报错,所以这里要强转 n_value, b_value = n_value.astype(float), b_value.astype(float) - zero_mask = (b_value == 0) + zero_mask = (b_value == 0) b_value[zero_mask] += np.finfo(float).eps n_value[zero_mask] += np.finfo(float).eps rel_err = np.abs((n_value - b_value) / b_value).max() @@ -74,7 +75,7 @@ def cosine_sim(cpu_output, npu_output): np.seterr(divide="ignore", invalid="ignore") if n_value.shape != b_value.shape: msg = f"Shape of npu and bench outputs don't match. NPU: {n_value.shape}, bench: {b_value.shape}." - return -1, False, msg + return -1, False, msg if len(n_value) == 1: msg = "All the data in npu dump data is scalar. Please refer to other compare algorithms." return cos, True, msg @@ -124,7 +125,7 @@ def flatten_compare_result(result): # 本函数 def compare_core(bench_out, npu_out, alg): msg = "" - if type(bench_out) != type(npu_out): + if not isinstance(bench_out, type(npu_out)): compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output type is different." if isinstance(bench_out, (list, tuple)): compare_result, test_success = [], True diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index 3c7b264c5bf..88aac980cb1 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -24,13 +24,13 @@ class Comparator: else: self.stack_info = None self.compare_alg = {} - self.compare_alg_names = [] self.register_compare_algorithm("Cosine Similarity", cosine_sim, cosine_standard) self.register_compare_algorithm("Max Relative Error", get_max_rel_err, None) self.register_compare_algorithm("Default: isEqual", compare_builtin_type, None) self.test_results = [] - self.test_result_cnt = {"forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, - "success_num": 0} + self.test_result_cnt = { + "forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, "success_num": 0 + } def print_pretest_result(self): res_dict = { @@ -63,10 +63,12 @@ class Comparator: write_csv(test_rows, self.save_path) def write_detail_csv(self): - test_rows = [["Subject", "Cosine Similarity", "Cosine Similarity Pass", "Cosine Similarity Message", - "Max Rel Error", "Max Rel Err Pass", "Max Rel Err Message", - "Default isEqual", "Default isEqual Pass", - "Default isEqual Message"]] # "Max Absolute Error", "Max Relative Error" + test_rows = [[ + "Subject", "Cosine Similarity", "Cosine Similarity Pass", "Cosine Similarity Message", + "Max Rel Error", "Max Rel Err Pass", "Max Rel Err Message", + "Default isEqual", "Default isEqual Pass", + "Default isEqual Message" + ]] for test_result in self.test_results: subject_prefix = test_result[0] fwd_result = test_result[3] @@ -87,7 +89,6 @@ class Comparator: def register_compare_algorithm(self, name, compare_func, standard): self.compare_alg.update({name: (compare_func, standard)}) - self.compare_alg_names.append(name) def compare_output(self, api_name, bench_out, npu_out, bench_grad=None, npu_grad=None): is_fwd_success, fwd_compare_alg_results = self._compare_core_wrapper(bench_out, npu_out) @@ -108,7 +109,7 @@ class Comparator: def _compare_core_wrapper(self, bench_out, npu_out): detailed_result_total = [] test_success_total = True - for name in self.compare_alg_names: + for name in self.compare_alg.keys(): alg = self.compare_alg[name][0] detailed_result, test_success = compare_core(bench_out, npu_out, alg) test_success_total = test_success_total and test_success -- Gitee From a210277c284afc06ad6a13b3437ca393a69a99da Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 11:52:36 +0800 Subject: [PATCH 09/17] =?UTF-8?q?=E6=8F=90=E5=8F=96dtype=E6=A0=A1=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/common/utils.py | 2 ++ .../api_accuracy_checker/compare/algorithm.py | 16 ++++++++-------- .../compare/compare_utils.py | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 988bcaa8f04..8e1ea2e03b6 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -63,6 +63,8 @@ class Const: BACKWARD = 'backward' FORWARD = 'forward' FLOAT_TYPE = [np.half, np.single, np.double, np.float64, np.longdouble] + BOOL_TYPE = [np.bool, np.uint8] + INT_TYPE = [np.int32, np.int64] # dump mode ALL = "all" diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 688565f42e1..e950f9b5f8f 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -2,16 +2,16 @@ import torch import numpy as np -from api_accuracy_checker.compare.compare_utils import CompareConst +from api_accuracy_checker.compare.compare_utils import CompareConst, check_dtype from api_accuracy_checker.common.utils import Const def compare_torch_tensor(cpu_output, npu_output, compare_alg): - if cpu_output.dtype == torch.bool or cpu_output.dtype == torch.uint8: - if npu_output.dtype != cpu_output.dtype: - return CompareConst.NAN, False, f"Bench out dtype is {cpu_output.dtype} but\ + if not check_dtype(cpu_output, npu_output): + return CompareConst.NAN, False, f"Bench out dtype is {cpu_output.dtype} but\ npu output dtype is {npu_output.dtype}, cannot compare." + if cpu_output.dtype == np.bool or cpu_output.dtype == np.uint8: return compare_bool_tensor(cpu_output, npu_output) - return compare_alg(cpu_output.detach().numpy(), npu_output.detach().cpu().numpy()) + return compare_alg(cpu_output, npu_output) def compare_bool_tensor(cpu_output, npu_output): @@ -20,8 +20,8 @@ def compare_bool_tensor(cpu_output, npu_output): npu_shape = npu_output.shape if cpu_shape != npu_shape: return error_rate, False, "" - npu_data = npu_output.cpu().detach().numpy() - bench_data = cpu_output.detach().numpy() + npu_data = npu_output + bench_data = cpu_output data_size = bench_data.size error_nums = (bench_data != npu_data).sum() error_rate = float(error_nums / data_size) @@ -141,7 +141,7 @@ def compare_core(bench_out, npu_out, alg): compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output dict keys are different" compare_result, test_success = compare_core(list(bench_out.values()), list(npu_out.values())) elif isinstance(bench_out, torch.Tensor): - compare_result, test_success, msg = compare_torch_tensor(bench_out, npu_out, alg) + compare_result, test_success, msg = compare_torch_tensor(bench_out.detach().numpy(), npu_out.detach().cpu().numpy(), alg) elif isinstance(bench_out, (bool, int, float, str)): compare_result, test_success, msg = compare_builtin_type(bench_out, npu_out) elif bench_out is None: diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py index 2f4d7eb38cc..30e65491ed3 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py @@ -1,4 +1,21 @@ +from api_accuracy_checker.common.utils import Const, print_warn_log import numpy as np class CompareConst: NAN = np.nan NA = "N/A" + +def check_dtype_comparable(x, y): + if x.dtype in Const.FLOAT_TYPE: + if y.dtype in Const.FLOAT_TYPE: + return True + return False + if x.dtype in Const.BOOL_TYPE: + if y.dtype in Const.BOOL_TYPE: + return True + return False + if x.dtype in Const.INT_TYPE: + if y.dtype in CONST.INT_TYPE: + return True + return False + print_warn_log(f"Compare: Unexpected dtype {x.dtype}, {y.dtype}") + return False \ No newline at end of file -- Gitee From 9ead018cdd8a356a129e04b06a555800499ad946 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 11:54:42 +0800 Subject: [PATCH 10/17] =?UTF-8?q?=E6=8F=90=E5=8F=96dtype=E6=A0=A1=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/compare/compare_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py index 30e65491ed3..59b7030111d 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py @@ -1,9 +1,12 @@ from api_accuracy_checker.common.utils import Const, print_warn_log import numpy as np + + class CompareConst: NAN = np.nan NA = "N/A" + def check_dtype_comparable(x, y): if x.dtype in Const.FLOAT_TYPE: if y.dtype in Const.FLOAT_TYPE: -- Gitee From 2ac836e8181817cd3075ef56b68b61982aef1586 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 11:56:01 +0800 Subject: [PATCH 11/17] =?UTF-8?q?=E6=8F=90=E5=8F=96dtype=E6=A0=A1=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../accuracy_tools/api_accuracy_checker/compare/algorithm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index e950f9b5f8f..3bff4246714 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -2,11 +2,12 @@ import torch import numpy as np -from api_accuracy_checker.compare.compare_utils import CompareConst, check_dtype +from api_accuracy_checker.compare.compare_utils import CompareConst, check_dtype_comparable from api_accuracy_checker.common.utils import Const + def compare_torch_tensor(cpu_output, npu_output, compare_alg): - if not check_dtype(cpu_output, npu_output): + if not check_dtype_comparable(cpu_output, npu_output): return CompareConst.NAN, False, f"Bench out dtype is {cpu_output.dtype} but\ npu output dtype is {npu_output.dtype}, cannot compare." if cpu_output.dtype == np.bool or cpu_output.dtype == np.uint8: -- Gitee From 441b3fed424aa90eda1ed11e845a15c15c5e3512 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 11:58:18 +0800 Subject: [PATCH 12/17] =?UTF-8?q?=E6=8F=90=E5=8F=96dtype=E6=A0=A1=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/common/utils.py | 4 ++-- .../api_accuracy_checker/compare/compare.py | 12 ++++++------ .../api_accuracy_checker/compare/compare_utils.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 8e1ea2e03b6..79c1aa368b9 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -62,8 +62,8 @@ class Const: OFF = 'OFF' BACKWARD = 'backward' FORWARD = 'forward' - FLOAT_TYPE = [np.half, np.single, np.double, np.float64, np.longdouble] - BOOL_TYPE = [np.bool, np.uint8] + FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble] + BOOL_TYPE = [bool, np.uint8] INT_TYPE = [np.int32, np.int64] # dump mode diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index 88aac980cb1..fba0df189e8 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -30,7 +30,7 @@ class Comparator: self.test_results = [] self.test_result_cnt = { "forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, "success_num": 0 - } + } def print_pretest_result(self): res_dict = { @@ -64,11 +64,11 @@ class Comparator: def write_detail_csv(self): test_rows = [[ - "Subject", "Cosine Similarity", "Cosine Similarity Pass", "Cosine Similarity Message", - "Max Rel Error", "Max Rel Err Pass", "Max Rel Err Message", - "Default isEqual", "Default isEqual Pass", - "Default isEqual Message" - ]] + "Subject", "Cosine Similarity", "Cosine Similarity Pass", "Cosine Similarity Message", + "Max Rel Error", "Max Rel Err Pass", "Max Rel Err Message", + "Default isEqual", "Default isEqual Pass", + "Default isEqual Message" + ]] for test_result in self.test_results: subject_prefix = test_result[0] fwd_result = test_result[3] diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py index 59b7030111d..62044f58521 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py @@ -17,7 +17,7 @@ def check_dtype_comparable(x, y): return True return False if x.dtype in Const.INT_TYPE: - if y.dtype in CONST.INT_TYPE: + if y.dtype in Const.INT_TYPE: return True return False print_warn_log(f"Compare: Unexpected dtype {x.dtype}, {y.dtype}") -- Gitee From 2aeab1149ba17e4f515ae5a13c5131d534cd81a5 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 12:01:37 +0800 Subject: [PATCH 13/17] =?UTF-8?q?=E6=8F=90=E5=8F=96dtype=E6=A0=A1=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 3bff4246714..d5836fb25e1 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -123,7 +123,7 @@ def flatten_compare_result(result): flatten_result.append(result_i) return flatten_result -# 本函数 +# 本函数用alg比对bench_out 和npu_out,返回详细比对结果compare_result和标志比对是否通过的布尔变量test_success def compare_core(bench_out, npu_out, alg): msg = "" if not isinstance(bench_out, type(npu_out)): -- Gitee From 5e745d4348767d37708b80ac76b66b484d42ae43 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 14:27:16 +0800 Subject: [PATCH 14/17] edit readme --- ...\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git "a/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" "b/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" index 0d6a67a4a2e..832e6a32d89 100644 --- "a/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" +++ "b/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" @@ -1,7 +1,7 @@ # Ascend模型精度预检工具 模型精度预检工具会提取模型中所有的API前反向的信息,构造相应的API单元测试,将NPU输出与标杆比对,从而检测出精度有问题的API。 -## 工具优势 +## 工具特性 1. 落盘数据小 2. 不依赖标杆侧GPU训练资源,本地即可完成预检 3. 支持随机生成模式和真实数据模式 -- Gitee From 1693e6043da867df93e1d6e8a1b9039ffc3f91c3 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 16:43:03 +0800 Subject: [PATCH 15/17] =?UTF-8?q?compare=20=E6=A0=B9=E6=8D=AE=E6=A3=80?= =?UTF-8?q?=E8=A7=86=E6=84=8F=E8=A7=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...77\347\224\250\346\226\271\346\263\225.md" | 20 +++++++++++++------ .../api_accuracy_checker/compare/algorithm.py | 6 +++--- .../api_accuracy_checker/compare/compare.py | 19 ++++++++++++++++-- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git "a/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" "b/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" index 832e6a32d89..74e6ff59ac1 100644 --- "a/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" +++ "b/debug/accuracy_tools/api_accuracy_checker/Ascend\346\250\241\345\236\213\347\262\276\345\272\246\351\242\204\346\243\200\345\267\245\345\205\267\344\275\277\347\224\250\346\226\271\346\263\225.md" @@ -17,28 +17,36 @@ export PYTHONPATH=$PYTHONPATH:{att_root}/debug/accuracy_tools/ ``` -2. 使用工具dump模块抓取网络所有API信息 +2. 在工具中加入以下代码使用工具dump模块,启动训练抓取网络所有API信息,目前工具仅支持抓取训练的第一个迭代并且在第一个迭代后会退出训练进程。 ``` from api_accuracy_checker.dump import set_dump_switch - set_dump_switch("ON") ``` ​ dump信息默认会存盘到./路径下,包括前向API信息forward_info_{pid}.json, 反向API信息backward_info_{pid}.json, 调用栈信息stack_info_{pid}.json。真实数据模式下还有forward_real_data和backward_real_data文件夹,里面有每个api输入的具体数值。forward_info与stack_info中的key值一一对应,用户可根据forward_info中API的key在stack_info中查询到其调用栈及代码行位置。 - 有需要的话,用户可以通过msCheckerConfig.update_config来配置dump路径以及启用真实数据模式(默认为关)。注意启用真实数据模式会存盘较多数据,可能对磁盘空间有较大冲击。 + 有需要的话,用户可以通过msCheckerConfig.update_config来配置dump路径以及启用真实数据模式(默认为关)。注意启用真实数据模式目前仅支持单卡,且会存盘较多数据,可能对磁盘空间有较大冲击。 ``` from api_accuracy_checker.dump import msCheckerConfig msCheckerConfig.update_config(dump_path="my/dump/path", real_data=True) # my/dump/path需配置为用户想要的api信息存盘路径,并且需要提前创建好 ``` -3. 将上述信息输入给run_ut模块运行精度检测并比对 +3. 将上述信息输入给run_ut模块运行精度检测并比对,运行如下命令: ``` cd run_ut - python run_ut.py --forward ./api_info/forward_info_0.json --backward ./api_info/backward_info_0.json + python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json ``` - forward和backward两个命令行参数根据实际情况配置。比对结果存盘位置会打屏显示,默认是'./',可以在运行run_ut.py时通过 --out_path命令行参数配置。结果包括pretest_result.csv和pretest_details.csv两个文件。前者是api粒度的,标明每个api是否通过测试。建议用户先查看前者,对于其中没有通过测试的或者特定感兴趣的api,根据其API name字段在pretest_details.csv中查询其各个输出的达标情况。 + forward和backward两个命令行参数根据实际存盘的json文件名配置。比对结果存盘路径默认是'./',可以在运行run_ut.py时通过 --out_path命令行参数配置。结果包括pretest_result.csv和pretest_details.csv两个文件。前者是api粒度的,标明每个api是否通过测试。建议用户先查看前者,对于其中没有通过测试的或者特定感兴趣的api,根据其API name字段在pretest_details.csv中查询其各个输出的达标情况以及比较指标。 + + 注意:目前API通过测试的标准是每个输出与标杆比对的余弦相似度大于0.99,pretest_details.csv中的相对误差供用户分析时使用。 + + + + + + + diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index d5836fb25e1..58e05cf749a 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -108,7 +108,7 @@ def compare_uint8_data(n_value, b_value): def compare_builtin_type(bench_out, npu_out): if not isinstance(bench_out, (bool, int, float, str)): - return CompareConst.NA, True, f"The data is not builtin type: {type(bench_out)}" + return CompareConst.NA, True, "" if bench_out != npu_out: return CompareConst.NAN, False, "" return True, True, "" @@ -127,11 +127,11 @@ def flatten_compare_result(result): def compare_core(bench_out, npu_out, alg): msg = "" if not isinstance(bench_out, type(npu_out)): - compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output type is different." + return CompareConst.NAN, False, "bench and npu output type is different." if isinstance(bench_out, (list, tuple)): compare_result, test_success = [], True if len(bench_out) != len(npu_out): - compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output structure is different" + return CompareConst.NAN, False, "bench and npu output structure is different" for b_out_i, n_out_i in zip(bench_out, npu_out): compare_result_i, test_success_i = compare_core(b_out_i, n_out_i, alg) compare_result.append(compare_result_i) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index fba0df189e8..58352d592ef 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -91,7 +91,10 @@ class Comparator: self.compare_alg.update({name: (compare_func, standard)}) def compare_output(self, api_name, bench_out, npu_out, bench_grad=None, npu_grad=None): - is_fwd_success, fwd_compare_alg_results = self._compare_core_wrapper(bench_out, npu_out) + if "dropout" in api_name: + is_fwd_success, fwd_compare_alg_results = self._compare_dropout(bench_out, npu_out) + else: + is_fwd_success, fwd_compare_alg_results = self._compare_core_wrapper(bench_out, npu_out) if bench_grad and npu_grad: is_bwd_success, bwd_compare_alg_results = self._compare_core_wrapper(bench_grad, npu_grad) else: @@ -112,10 +115,22 @@ class Comparator: for name in self.compare_alg.keys(): alg = self.compare_alg[name][0] detailed_result, test_success = compare_core(bench_out, npu_out, alg) - test_success_total = test_success_total and test_success + if name != "Max Relative Error": + test_success_total = test_success_total and test_success if detailed_result_total: for i in range(len(detailed_result_total)): detailed_result_total[i] += detailed_result[i] else: detailed_result_total = detailed_result return test_success_total, detailed_result_total + + @staticmethod + def _compare_dropout(bench_out, npu_out): + tensor_num = bench_out.numel() + if tensor_num >= 100: + if abs((bench_out == 0).sum() - (npu_out == 0).cpu().sum()) / tensor_num < 0.1: + return True, 1 + else: + return False, 0 + else: + return True, 1 -- Gitee From 1091d3717902e9a42478adb04af74fe1668183c8 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 16:57:22 +0800 Subject: [PATCH 16/17] change compare alg name --- debug/accuracy_tools/api_accuracy_checker/compare/compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index 58352d592ef..4e4abf7040c 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -66,8 +66,8 @@ class Comparator: test_rows = [[ "Subject", "Cosine Similarity", "Cosine Similarity Pass", "Cosine Similarity Message", "Max Rel Error", "Max Rel Err Pass", "Max Rel Err Message", - "Default isEqual", "Default isEqual Pass", - "Default isEqual Message" + "Compare Builtin Type", "Builtin Type Pass", + "Builtin Type Message" ]] for test_result in self.test_results: subject_prefix = test_result[0] -- Gitee From c6f791269f44292ee87d74a2aafd8cc2bbf667d0 Mon Sep 17 00:00:00 2001 From: litian_drinksnow Date: Tue, 8 Aug 2023 17:46:33 +0800 Subject: [PATCH 17/17] =?UTF-8?q?add=20dropout=20=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/compare/compare.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index 4e4abf7040c..7a1c069e2ef 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -96,7 +96,10 @@ class Comparator: else: is_fwd_success, fwd_compare_alg_results = self._compare_core_wrapper(bench_out, npu_out) if bench_grad and npu_grad: - is_bwd_success, bwd_compare_alg_results = self._compare_core_wrapper(bench_grad, npu_grad) + if "dropout" in api_name: + is_bwd_success, bwd_compare_alg_results = self._compare_dropout(bench_grad[0], npu_grad[0]) + else: + is_bwd_success, bwd_compare_alg_results = self._compare_core_wrapper(bench_grad, npu_grad) else: is_bwd_success, bwd_compare_alg_results = CompareConst.NA, None self.record_results(api_name, is_fwd_success, is_bwd_success, fwd_compare_alg_results, bwd_compare_alg_results) -- Gitee