diff --git a/debug/accuracy_tools/api_accuracy_checker/README.md b/debug/accuracy_tools/api_accuracy_checker/README.md
index ddf89654f2bea0509fd81cb80fdee4c964d01ba7..f6af27106bd7362827fd38231ef78c0baf94872d 100644
--- a/debug/accuracy_tools/api_accuracy_checker/README.md
+++ b/debug/accuracy_tools/api_accuracy_checker/README.md
@@ -92,15 +92,23 @@ Ascend模型精度预检工具能在昇腾NPU上扫描用户训练模型中所
   | -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 |
   | -j或--jit_compile | 开启jit编译。 | 否 |
   | -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0。 | 否 |
+  | -csv_path或--result_csv_path | 指定run_ut运行中断时生成的accuracy_checking_result_{timestamp}.csv文件路径;若需要从中断处继续执行,配置此参数即可。 | 否 |

-   run_ut执行结果包括accuracy_checking_result.csv和accuracy_checking_details.csv两个文件。accuracy_checking_result.csv是API粒度的,标明每个API是否通过测试。建议用户先查看accuracy_checking_result.csv文件,对于其中没有通过测试的或者特定感兴趣的API,根据其API name字段在accuracy_checking_details.csv中查询其各个输出的达标情况以及比较指标。API达标情况介绍请参考“**API预检指标**”。
+   run_ut执行结果包括accuracy_checking_result_{timestamp}.csv和accuracy_checking_details_{timestamp}.csv两个文件。accuracy_checking_result_{timestamp}.csv是API粒度的,标明每个API是否通过测试。建议用户先查看accuracy_checking_result_{timestamp}.csv文件,对于其中没有通过测试的或者特定感兴趣的API,根据其API name字段在accuracy_checking_details_{timestamp}.csv中查询其各个输出的达标情况以及比较指标。API达标情况介绍请参考“**API预检指标**”。

 4. 如果需要保存比对不达标的输入和输出数据,可以在run_ut执行命令结尾添加-save_error_data,例如:

    ```bash
    python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -save_error_data
    ```

-   数据默认会存盘到'./ut_error_data'路径下(相对于启动run_ut的路径),有需要的话,用户可以通过msCheckerConfig.update_config来配置保存路径,参数为error_data_path
+   数据默认会存盘到'./ut_error_data{timestamp}'路径下(相对于启动run_ut的路径);有需要的话,用户可以通过msCheckerConfig.update_config来配置保存路径,参数为error_data_path。
+
+5. 如果本次run_ut运行中断,需要从中断处继续执行,可以在run_ut执行命令结尾配置-csv_path,例如:
+
+   ```bash
+   python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -csv_path ./accuracy_checking_result_20231203211324.csv
+   ```
+
+   run_ut将会从中断处继续执行,执行结果将追加写入到-csv_path配置的accuracy_checking_result_20231203211324.csv以及相同时间戳后缀的accuracy_checking_details_20231203211324.csv中;若配置了-save_error_data,error_data将会保存到相同时间戳后缀的ut_error_data20231203211324文件夹中。

 ## API预检白名单

@@ -110,7 +118,7 @@ support_wrap_ops.yaml文件当前记录所有PyTorch API名称,可以直接编

 ## API预检指标

-API预检通过测试,则在accuracy_checking_details.csv文件中的“pass”列标记“pass”,否则标记“error”或“warning”,详细规则如下:
+API预检通过测试,则在accuracy_checking_details_{timestamp}.csv文件中的“pass”列标记“pass”,否则标记“error”或“warning”,详细规则如下:

 1. 余弦相似度 > 0.99:≤ 0.99为不达标,标记“error”,> 0.99达标,进行下一步;
 2. 最大绝对误差 < 0.001:< 0.001达标,标记“pass”,≥ 0.001为不达标,进行下一步;
@@ -118,7 +126,7 @@
 - 对于float16和bfloat16数据:双百指标不通过,标记“error”;双百指标通过,双千指标不通过,标记“warning”;双百、双千指标均通过,标记“pass”。
 - 对于float32和float64数据:双千指标不通过,标记“error”;双千指标通过,双万指标不通过,标记“warning”;双千、双万指标均通过,标记“pass”。

-4. 在accuracy_checking_result.csv中以“Forward Test Success”和“Backward Test Success”字段统计该算子前向反向输出的测试结果,对于标记“pass”的算子,则在accuracy_checking_result.csv中标记“TRUE”表示测试通过,对于标记“error”或“warning”的算子,则在accuracy_checking_result.csv中标记“FALSE”表示测试不通过。由于一个算子可能有多个前向或反向的输入或输出,那么该类算子的输入或输出中必须全为“pass”,才能在accuracy_checking_result.csv中标记“TRUE”,只要有一个输入或输出标记“error”或“warning”,那么在accuracy_checking_result.csv中标记“FALSE”。
+4. 在accuracy_checking_result_{timestamp}.csv中以“Forward Test Success”和“Backward Test Success”字段统计该算子前向反向输出的测试结果,对于标记“pass”的算子,则在accuracy_checking_result_{timestamp}.csv中标记“TRUE”表示测试通过,对于标记“error”或“warning”的算子,则在accuracy_checking_result_{timestamp}.csv中标记“FALSE”表示测试不通过。由于一个算子可能有多个前向或反向的输入或输出,那么该类算子的输入或输出中必须全为“pass”,才能在accuracy_checking_result_{timestamp}.csv中标记“TRUE”,只要有一个输入或输出标记“error”或“warning”,那么在accuracy_checking_result_{timestamp}.csv中标记“FALSE”。

 双百、双千、双万精度指标是指NPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差大于百分之一、千分之一、万分之一的比例占总元素个数的比例小于百分之一、千分之一、万分之一。
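A note for readers of the metric definition above: the double-percent / double-thousandth / double-ten-thousandth criterion is a ratio of ratios. The sketch below only illustrates that definition with NumPy; the function name `error_ratio` is made up here, and the checker's real comparison logic lives in `api_accuracy_checker/compare/algorithm.py` (`compare_core`), not in this snippet.

```python
# Illustrative sketch only (assumes NumPy); not the tool's implementation.
import numpy as np

def error_ratio(device_out: np.ndarray, bench_out: np.ndarray, rel_err_bound: float) -> float:
    """Fraction of elements whose relative error against the benchmark exceeds rel_err_bound."""
    bench = bench_out.astype(np.float64)
    device = device_out.astype(np.float64)
    # Avoid dividing by zero where a benchmark element is 0.
    denom = np.maximum(np.abs(bench), np.finfo(np.float64).eps)
    rel_err = np.abs(device - bench) / denom
    return float(np.mean(rel_err > rel_err_bound))

# "双千" (double-thousandth): fewer than 1/1000 of the elements may have a
# relative error larger than 1/1000.
device = np.array([1.0005, 2.0, 3.0], dtype=np.float32)
bench = np.array([1.0, 2.0, 3.0], dtype=np.float32)
print(error_ratio(device, bench, 1e-3) < 1e-3)  # True
```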
diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py
index dcbb24b1a16dae6c3095bca5a7e380310e2eeb36..b95ec8fa3e47327e24d1ecd532ad0a13b46586ee 100644
--- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py
+++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py
@@ -1,6 +1,5 @@
 # 进行比对及结果展示
 import os
-import time
 from rich.table import Table
 from rich.console import Console
 from api_accuracy_checker.compare.algorithm import compare_core
@@ -10,22 +9,22 @@ from api_accuracy_checker.common.config import msCheckerConfig
 
 
 class Comparator:
-    TEST_FILE_NAME = "accuracy_checking_result_" + time.strftime("%Y%m%d%H%M%S") + ".csv"
-    DETAIL_TEST_FILE_NAME = "accuracy_checking_details_" + time.strftime("%Y%m%d%H%M%S") + ".csv"
-
     # consts for result csv
     COLUMN_API_NAME = "API name"
     COLUMN_FORWARD_SUCCESS = "Forward Test Success"
     COLUMN_BACKWARD_SUCCESS = "Backward Test Success"
     COLUMN_STACK_INFO = "Traceback callstack info"
 
-    def __init__(self, result_save_path, stack_info_json_path=None):
-        self.save_path = os.path.join(result_save_path, self.TEST_FILE_NAME)
-        if os.path.exists(self.save_path):
-            raise ValueError(f"file {self.save_path} already exists, please remove it first or use a new dump path")
-        self.detail_save_path = os.path.join(result_save_path, self.DETAIL_TEST_FILE_NAME)
-        if os.path.exists(self.detail_save_path):
-            raise ValueError(f"file {self.detail_save_path} already exists, please remove it first or use a new dump path")
+    def __init__(self, result_csv_path, details_csv_path, is_continue_run_ut, test_result_cnt=None, stack_info_json_path=None):
+        self.save_path = result_csv_path
+        self.detail_save_path = details_csv_path
+        if not is_continue_run_ut:
+            if os.path.exists(self.save_path):
+                raise ValueError(f"file {self.save_path} already exists, please remove it first or use a new dump path")
+            if os.path.exists(self.detail_save_path):
+                raise ValueError(
+                    f"file {self.detail_save_path} already exists, please remove it first or use a new dump path")
+            self.write_csv_title()
         if stack_info_json_path:
             self.stack_info = get_json_contents(stack_info_json_path)
         else:
@@ -34,9 +33,7 @@ class Comparator:
         self.test_result_cnt = {
             "forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, "success_num": 0,
             "total_num": 0, "forward_or_backward_fail_num": 0
-        }
-        self.result_save_path = result_save_path
-        self.write_csv_title()
+        } if not test_result_cnt else test_result_cnt
 
     def print_pretest_result(self):
         if self.test_result_cnt.get("total_num") != 0:
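For reviewers of the constructor change above, a minimal sketch of the two ways the reworked `Comparator` is expected to be built by run_ut.py: a fresh run versus a resumed run. The CSV paths and counter values are illustrative, and the import path is assumed from the package layout.

```python
from api_accuracy_checker.compare.compare import Comparator  # import path assumed

# Fresh run: neither CSV may exist yet; the title rows are written immediately.
fresh = Comparator("./accuracy_checking_result_20231203211324.csv",
                   "./accuracy_checking_details_20231203211324.csv",
                   is_continue_run_ut=False)

# Resumed run: both CSVs already exist, so the existence check and title rows are
# skipped, and the counters recovered from the old result CSV are carried over.
resumed = Comparator("./accuracy_checking_result_20231203211324.csv",
                     "./accuracy_checking_details_20231203211324.csv",
                     is_continue_run_ut=True,
                     test_result_cnt={"forward_fail_num": 0, "backward_fail_num": 0,
                                      "forward_and_backward_fail_num": 0, "success_num": 12,
                                      "total_num": 12, "forward_or_backward_fail_num": 0})
```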
diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
index 0c0f3305c7104e87f64d6002996ed63c342c2eb9..fb885168d662006b3fa639c2322113d6f1f6b386 100644
--- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
+++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
@@ -1,9 +1,10 @@
 import argparse
 import os
-import copy
+import csv
+import re
 import sys
 import time
-
+from collections import namedtuple
 try:
     import torch_npu
 except ImportError:
@@ -12,7 +13,6 @@ except ImportError:
 else:
     is_gpu = False
     current_device = "npu"
-
 import yaml
 import torch
 from tqdm import tqdm
@@ -25,11 +25,17 @@ from api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplat
 from api_accuracy_checker.hook_module.wrap_torch import TorchOPTemplate
 from api_accuracy_checker.run_ut.ut_api_info import UtAPIInfo
 from api_accuracy_checker.common.config import msCheckerConfig
+from api_accuracy_checker.compare.compare_utils import CompareConst
 from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileOpen, FileCheckConst, FileChecker, \
     change_mode, check_file_suffix, check_link
 
-ut_error_data_dir = 'ut_error_data'
+current_time = time.strftime("%Y%m%d%H%M%S")
+UT_ERROR_DATA_DIR = 'ut_error_data' + current_time
+RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + ".csv"
+DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + ".csv"
+RunUTConfig = namedtuple('RunUTConfig', ['forward_content', 'backward_content', 'result_csv_path', 'details_csv_path',
+                                         'save_error_data', 'is_continue_run_ut', 'test_result_cnt'])
 
 
 def exec_api(api_type, api_name, args, kwargs):
@@ -104,25 +110,30 @@ def generate_cpu_params(input_args, input_kwargs, need_backward):
     return cpu_args, cpu_kwargs
 
 
-def run_ut(forward_file, backward_file, out_path, save_error_data):
+def run_ut(config):
     print_info_log("start UT test")
-    forward_content = get_json_contents(forward_file)
-    backward_content = get_json_contents(backward_file)
     api_setting_dict = get_json_contents("torch_ut_setting.json")
-    compare = Comparator(out_path)
-    for api_full_name, api_info_dict in tqdm(forward_content.items()):
+    compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut,
+                         config.test_result_cnt)
+    with FileOpen(config.result_csv_path, 'r') as file:
+        csv_reader = csv.reader(file)
+        next(csv_reader)
+        api_name_set = {row[0] for row in csv_reader}
+    for i, (api_full_name, api_info_dict) in enumerate(tqdm(config.forward_content.items())):
+        if api_full_name in api_name_set:
+            continue
         try:
             if msCheckerConfig.white_list:
                 [_, api_name, _] = api_full_name.split("*")
                 if api_name not in set(msCheckerConfig.white_list):
                     continue
-            data_info = run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict)
+            data_info = run_torch_api(api_full_name, api_setting_dict, config.backward_content, api_info_dict)
             is_fwd_success, is_bwd_success = compare.compare_output(api_full_name,
                                                                     data_info.bench_out,
                                                                     data_info.device_out,
                                                                     data_info.bench_grad_out,
                                                                     data_info.device_grad_out)
-            if save_error_data:
+            if config.save_error_data:
                 do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success)
         except Exception as err:
             [_, api_name, _] = api_full_name.split("*")
@@ -141,12 +152,12 @@ def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success)
     if not is_fwd_success or not is_bwd_success:
         api_full_name = api_full_name.replace("*", ".")
         for element in data_info.in_fwd_data_list:
-            UtAPIInfo(api_full_name + '.forward.input', element, ut_error_data_dir)
-        UtAPIInfo(api_full_name + '.forward.output.bench', data_info.bench_out, ut_error_data_dir)
-        UtAPIInfo(api_full_name + '.forward.output.device', data_info.device_out, ut_error_data_dir)
-        UtAPIInfo(api_full_name + '.backward.input', data_info.grad_in, ut_error_data_dir)
-        UtAPIInfo(api_full_name + '.backward.output.bench', data_info.bench_grad_out, ut_error_data_dir)
-        UtAPIInfo(api_full_name + '.backward.output.device', data_info.device_grad_out, ut_error_data_dir)
+            UtAPIInfo(api_full_name + '.forward.input', element, UT_ERROR_DATA_DIR)
+        UtAPIInfo(api_full_name + '.forward.output.bench', data_info.bench_out, UT_ERROR_DATA_DIR)
+        UtAPIInfo(api_full_name + '.forward.output.device', data_info.device_out, UT_ERROR_DATA_DIR)
+        UtAPIInfo(api_full_name + '.backward.input', data_info.grad_in, UT_ERROR_DATA_DIR)
+        UtAPIInfo(api_full_name + '.backward.output.bench', data_info.bench_grad_out, UT_ERROR_DATA_DIR)
+        UtAPIInfo(api_full_name + '.backward.output.device', data_info.device_grad_out, UT_ERROR_DATA_DIR)
 
 
 def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict):
@@ -221,9 +232,60 @@ def initialize_save_error_data():
     error_data_path_checker = FileChecker(msCheckerConfig.error_data_path, FileCheckConst.DIR,
                                           ability=FileCheckConst.WRITE_ABLE)
     error_data_path = error_data_path_checker.common_check()
-    global ut_error_data_dir
-    ut_error_data_dir = 'ut_error_data' + time.strftime("%Y%m%d%H%M%S")
-    initialize_save_path(error_data_path, ut_error_data_dir)
+    initialize_save_path(error_data_path, UT_ERROR_DATA_DIR)
+
+
+def get_validated_result_csv_path(result_csv_path):
+    result_csv_path_checker = FileChecker(result_csv_path, FileCheckConst.FILE, ability=FileCheckConst.READ_WRITE_ABLE,
+                                          file_type=FileCheckConst.CSV_SUFFIX)
+    validated_result_csv_path = result_csv_path_checker.common_check()
+    result_csv_name = os.path.basename(validated_result_csv_path)
+    pattern = r"^accuracy_checking_result_\d{14}\.csv$"
+    if not re.match(pattern, result_csv_name):
+        raise ValueError("When continue run ut, please do not modify the result csv name.")
+    return validated_result_csv_path
+
+
+def get_validated_details_csv_path(validated_result_csv_path):
+    result_csv_name = os.path.basename(validated_result_csv_path)
+    details_csv_name = result_csv_name.replace('result', 'details')
+    details_csv_path = os.path.join(os.path.dirname(validated_result_csv_path), details_csv_name)
+    details_csv_path_checker = FileChecker(details_csv_path, FileCheckConst.FILE,
+                                           ability=FileCheckConst.READ_WRITE_ABLE, file_type=FileCheckConst.CSV_SUFFIX)
+    validated_details_csv_path = details_csv_path_checker.common_check()
+    return validated_details_csv_path
+
+
+def get_statistics_from_result_csv(validated_result_csv_path):
+    test_result_cnt = {
+        "forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, "success_num": 0,
+        "total_num": 0, "forward_or_backward_fail_num": 0
+    }
+    with FileOpen(validated_result_csv_path, 'r') as file:
+        reader = csv.reader(file)
+        result_csv_rows = [row for row in reader]
+    result_csv_name = os.path.basename(validated_result_csv_path)
+    for item in result_csv_rows[1:]:
+        if not isinstance(item, list) or len(item) < 3:
+            raise ValueError("The number of columns in %s is incorrect" % result_csv_name)
+        if item[1] not in ['True', 'False', CompareConst.NA, 'SKIP'] \
+                or item[2] not in ['True', 'False', CompareConst.NA, 'SKIP']:
+            raise ValueError("The value in the 2nd or 3rd column of %s is wrong, it must be TRUE, FALSE, SKIP or N/A"
+                             % result_csv_name)
+        if item[1] == 'SKIP':
+            continue
+        test_result_cnt["total_num"] += 1
+        if item[1] == 'True' and item[2] in ['True', 'N/A']:
+            test_result_cnt['success_num'] += 1
+        elif item[1] == 'False' and item[2] == 'False':
+            test_result_cnt['forward_and_backward_fail_num'] += 1
+        elif item[1] == 'False':
+            test_result_cnt['forward_fail_num'] += 1
+            test_result_cnt['forward_or_backward_fail_num'] += 1
+        else:
+            test_result_cnt['backward_fail_num'] += 1
+            test_result_cnt['forward_or_backward_fail_num'] += 1
+    return test_result_cnt
 
 
 def _run_ut_parser(parser):
@@ -244,18 +306,22 @@
                         help=" whether to turn on jit compile", required=False)
     parser.add_argument("-d", "--device", dest="device_id", type=int, help=" set device id to run ut",
                         default=0, required=False)
+    parser.add_argument("-csv_path", "--result_csv_path", dest="result_csv_path", default="", type=str,
+                        help=" The path of accuracy_checking_result_{timestamp}.csv; "
+                             "when run_ut is interrupted, pass this file path to resume from where it stopped.",
+                        required=False)
 
 
 def _run_ut():
     parser = argparse.ArgumentParser()
     _run_ut_parser(parser)
-    args = parser.parse_args(sys.argv[1:])
+    args = parser.parse_args(sys.argv[1:])
     if not is_gpu:
         torch.npu.set_compile_mode(jit_compile=args.jit_compile)
     used_device = current_device + ":" + str(args.device_id)
     try:
         if is_gpu:
-            torch.cuda.set_device(used_device)
+            torch.cuda.set_device(used_device)
         else:
             torch.npu.set_device(used_device)
     except Exception as error:
@@ -271,9 +337,26 @@
     out_path_checker = FileChecker(out_path, FileCheckConst.DIR, ability=FileCheckConst.WRITE_ABLE)
     out_path = out_path_checker.common_check()
     save_error_data = args.save_error_data
+    forward_content = get_json_contents(forward_file)
+    backward_content = get_json_contents(backward_file)
+    result_csv_path = os.path.join(out_path, RESULT_FILE_NAME)
+    details_csv_path = os.path.join(out_path, DETAILS_FILE_NAME)
+    test_result_cnt = None
+    if args.result_csv_path:
+        result_csv_path = get_validated_result_csv_path(args.result_csv_path)
+        details_csv_path = get_validated_details_csv_path(result_csv_path)
+        test_result_cnt = get_statistics_from_result_csv(result_csv_path)
     if save_error_data:
+        if args.result_csv_path:
+            time_info = result_csv_path.split('.')[0].split('_')[-1]
+            ut_error_data_dir_name = 'ut_error_data' + time_info
+            ut_error_data_dir_path = os.path.join(os.path.dirname(result_csv_path), ut_error_data_dir_name)
+            global UT_ERROR_DATA_DIR
+            UT_ERROR_DATA_DIR = ut_error_data_dir_path
         initialize_save_error_data()
-    run_ut(forward_file, backward_file, out_path, save_error_data)
+    run_ut_config = RunUTConfig(forward_content, backward_content, result_csv_path, details_csv_path, save_error_data,
+                                args.result_csv_path, test_result_cnt)
+    run_ut(run_ut_config)
 
 
 class UtDataInfo:
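A worked example of the rows that get_statistics_from_result_csv() consumes when resuming a run: the column names match the Comparator constants above, while the API names and the True/False/N/A/SKIP values are invented for illustration.

```python
# Illustrative rows only; in a real resume the data comes from the existing
# accuracy_checking_result_{timestamp}.csv on disk.
import csv
import io

result_csv = io.StringIO(
    "API name,Forward Test Success,Backward Test Success\n"
    "Functional*relu*0,True,True\n"    # counted in success_num
    "Torch*matmul*0,False,True\n"      # forward_fail_num and forward_or_backward_fail_num
    "Tensor*add*0,True,N/A\n"          # forward-only API, still counted as a success
    "Torch*conv2d*0,SKIP,SKIP\n"       # skipped rows are excluded from total_num
)
for api_name, fwd, bwd in list(csv.reader(result_csv))[1:]:
    print(api_name, fwd, bwd)
```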