From 2b1078cb50e1c0a1b9e2328105ae4e2cf0d1a62d Mon Sep 17 00:00:00 2001 From: louyujing Date: Wed, 30 Aug 2023 09:23:01 +0000 Subject: [PATCH 1/5] update debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py. Signed-off-by: louyujing --- debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 813d0cb586..4513832f67 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -74,11 +74,12 @@ def run_ut(forward_file, backward_file, out_path, save_error_data): for api_full_name, api_info_dict in tqdm(forward_content.items()): try: data_info = run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict) + new_data_info = copy.deepcopy(data_info) is_fwd_success, is_bwd_success = compare.compare_output(api_full_name, data_info.bench_out, data_info.npu_out, data_info.bench_grad_out, data_info.npu_grad_out) if save_error_data: - do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) + do_save_error_data(api_full_name, new_data_info, is_fwd_success, is_bwd_success) except Exception as err: [_, api_name, _] = api_full_name.split("*") if "not implemented for 'Half'" in str(err): -- Gitee From 65dc4285b8711e4a92c35f5827cac0dbafe71a7d Mon Sep 17 00:00:00 2001 From: louyujing Date: Thu, 31 Aug 2023 07:41:54 +0000 Subject: [PATCH 2/5] update debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py. Signed-off-by: louyujing --- debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 1 + 1 file changed, 1 insertion(+) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 4513832f67..58102426b6 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -1,5 +1,6 @@ import argparse import os +import copy import sys import torch_npu import yaml -- Gitee From 8f5f1fe2bd73aa2dbbee0b8bae32d253da0607fb Mon Sep 17 00:00:00 2001 From: louyujing Date: Mon, 4 Sep 2023 12:46:30 +0000 Subject: [PATCH 3/5] update debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py. Signed-off-by: louyujing --- .../accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 58102426b6..efa01c8e60 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -75,7 +75,14 @@ def run_ut(forward_file, backward_file, out_path, save_error_data): for api_full_name, api_info_dict in tqdm(forward_content.items()): try: data_info = run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict) - new_data_info = copy.deepcopy(data_info) + new_data_info = UtDataInfo( + bench_grad_out=None if data_info.bench_grad_out is None else data_info.bench_grad_out.clone(), + npu_grad_out=None if data_info.npu_grad_out is None else data_info.npu_grad_out.clone(), + npu_out=None if data_info.npu_out is None else data_info.npu_out.clone(), + bench_out=None if data_info.bench_out is None else data_info.bench_out.clone(), + grad_in=data_info.grad_in, + in_fwd_data_list=data_info.in_fwd_data_list + ) is_fwd_success, is_bwd_success = compare.compare_output(api_full_name, data_info.bench_out, data_info.npu_out, data_info.bench_grad_out, data_info.npu_grad_out) -- Gitee From d040d615813b1aca837fd1192adc8cdf0a6e5051 Mon Sep 17 00:00:00 2001 From: louyujing Date: Mon, 4 Sep 2023 12:53:20 +0000 Subject: [PATCH 4/5] update debug/accuracy_tools/api_accuracy_checker/README.md. Signed-off-by: louyujing --- debug/accuracy_tools/api_accuracy_checker/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/debug/accuracy_tools/api_accuracy_checker/README.md b/debug/accuracy_tools/api_accuracy_checker/README.md index bfd1922a63..a5bc92c093 100644 --- a/debug/accuracy_tools/api_accuracy_checker/README.md +++ b/debug/accuracy_tools/api_accuracy_checker/README.md @@ -53,6 +53,12 @@ Ascend模型精度预检工具能在昇腾NPU上扫描用户训练模型中所 注意:目前API通过测试的标准是每个输出与标杆比对的余弦相似度大于0.99,并且float16数据要通过双千分之一标准,float32数据要通过双万分之一标准,pretest_details.csv中的相对误差供用户分析时使用。 +4. 如果需要保存比对不达标的输入和输出数据,可以在run_ut执行命令结尾添加-save_error_data,例如: + + ``` + python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -save_error_data + ``` + 数据默认会存盘到'./ut_error_data'路径下(相对于启动run_ut的路径),有需要的话,用户可以通过msCheckerConfig.update_config来配置保存路径,参数为error_data_path ## FAQ 1. 多卡训练dump结果只有一组json,这正确吗? -- Gitee From ae1d5614e215b7dfe2bd9a0f0f5552ccf14b5e27 Mon Sep 17 00:00:00 2001 From: louyujing Date: Tue, 5 Sep 2023 12:01:32 +0000 Subject: [PATCH 5/5] update debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py. Signed-off-by: louyujing --- .../api_accuracy_checker/run_ut/run_ut.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index efa01c8e60..0d84ae54c7 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -75,19 +75,14 @@ def run_ut(forward_file, backward_file, out_path, save_error_data): for api_full_name, api_info_dict in tqdm(forward_content.items()): try: data_info = run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict) - new_data_info = UtDataInfo( - bench_grad_out=None if data_info.bench_grad_out is None else data_info.bench_grad_out.clone(), - npu_grad_out=None if data_info.npu_grad_out is None else data_info.npu_grad_out.clone(), - npu_out=None if data_info.npu_out is None else data_info.npu_out.clone(), - bench_out=None if data_info.bench_out is None else data_info.bench_out.clone(), - grad_in=data_info.grad_in, - in_fwd_data_list=data_info.in_fwd_data_list - ) - is_fwd_success, is_bwd_success = compare.compare_output(api_full_name, data_info.bench_out, - data_info.npu_out, data_info.bench_grad_out, - data_info.npu_grad_out) + is_fwd_success, is_bwd_success = \ + compare.compare_output(api_full_name, + None if data_info.bench_out is None else data_info.bench_out.clone(), + None if data_info.npu_out is None else data_info.npu_out.clone(), + None if data_info.bench_grad_out is None else data_info.bench_grad_out.clone(), + None if data_info.npu_grad_out is None else data_info.npu_grad_out.clone()) if save_error_data: - do_save_error_data(api_full_name, new_data_info, is_fwd_success, is_bwd_success) + do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) except Exception as err: [_, api_name, _] = api_full_name.split("*") if "not implemented for 'Half'" in str(err): -- Gitee