def get_real_data_path(file_path):
    """
    Function Description:
        Cut a dumped data file path down to the part that starts at the real-data
        directory name, so the stored path no longer carries the leading directories.
    Parameter:
        file_path: path of the dumped data file. str
    Return:
        The tail of file_path beginning at the first occurrence of 'forward_real_data',
        'backward_real_data' or 'ut_error_data' followed by at least one digit.
    Exception:
        DumpException(DumpException.INVALID_PATH_ERROR) when none of these names
        occurs anywhere in file_path.
    """
    # Raw strings: '\d' must reach the regex engine as a digit class. In a normal
    # string literal it is an invalid escape sequence and triggers a warning.
    targets = (r'forward_real_data', r'backward_real_data', r'ut_error_data\d+')
    match = re.search('|'.join(targets), file_path)
    if match is None:
        raise DumpException(DumpException.INVALID_PATH_ERROR)
    return file_path[match.start():]


def get_full_data_path(data_path, real_data_path):
    """
    Function Description:
        Rebuild the usable location of a data file from the path stored at dump time
        and the root directory that holds the real data.
    Parameter:
        data_path: stored data path. Returned unchanged when it is empty or None.
        real_data_path: root directory to join in front of data_path. None is treated
            like an empty string, i.e. data_path is used as given.
    Return:
        data_path itself when it is falsy, otherwise
        os.path.realpath(os.path.join(real_data_path, data_path)).
    """
    if not data_path:
        return data_path
    # os.path.join raises TypeError on None; fall back to '' so "no root configured"
    # behaves the same whether the caller passes None or ''.
    full_data_path = os.path.join(real_data_path or '', data_path)
    return os.path.realpath(full_data_path)
f'{api_args}.pt') pt_path = write_pt(file_path, arg.contiguous().cpu().detach()) self.args_num += 1 + real_data_path = get_real_data_path(pt_path) single_arg.update({'type': 'torch.Tensor'}) - single_arg.update({'datapath': pt_path}) + single_arg.update({'datapath': real_data_path}) single_arg.update({'requires_grad': arg.requires_grad}) return single_arg diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/data_generate.py index 21bc23cfb86d79d99e9b42fb3d35e7a173e8de4a..bc8c4e1a2c2abf4dfb12aa0734ed38d02ff77769 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/data_generate.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/data_generate.py @@ -19,8 +19,8 @@ import os import torch import numpy -from api_accuracy_checker.common.utils import Const, check_file_or_directory_path, check_object_type, print_warn_log, print_error_log, \ - CompareException +from api_accuracy_checker.common.utils import Const, check_file_or_directory_path, check_object_type, print_warn_log, \ + print_error_log, get_full_data_path, CompareException TORCH_TYPE = ["torch.device", "torch.dtype"] TENSOR_DATA_LIST = ["torch.Tensor", "torch.nn.parameter.Parameter"] @@ -31,7 +31,7 @@ NUMPY_TYPE = ["numpy.int8", "numpy.int16", "numpy.int32", "numpy.int64", "numpy. 
"numpy.complex128", "numpy.complex256", "numpy.bool_", "numpy.string_", "numpy.bytes_", "numpy.unicode_"] -def gen_data(info, need_grad, convert_type): +def gen_data(info, need_grad, convert_type, real_data_path=None): """ Function Description: Based on arg basic information, generate arg data @@ -43,6 +43,7 @@ def gen_data(info, need_grad, convert_type): check_object_type(info, dict) data_type = info.get('type') data_path = info.get('datapath') + data_path = get_full_data_path(data_path, real_data_path) if data_type in TENSOR_DATA_LIST: if data_path: data = gen_real_tensor(data_path, convert_type) @@ -168,7 +169,7 @@ def gen_bool_tensor(low, high, shape): return data -def gen_args(args_info, need_grad=True, convert_type=None): +def gen_args(args_info, need_grad=True, convert_type=None, real_data_path=None): """ Function Description: Based on API basic information, generate input parameters: args, for API forward running @@ -176,14 +177,15 @@ def gen_args(args_info, need_grad=True, convert_type=None): api_info: API basic information. List need_grad: set Tensor grad for backward convert_type: convert ori_type to dist_type flag. + real_data_path: the root directory for storing real data. 
""" check_object_type(args_info, list) args_result = [] for arg in args_info: if isinstance(arg, (list, tuple)): - data = gen_args(arg, need_grad, convert_type) + data = gen_args(arg, need_grad, convert_type, real_data_path) elif isinstance(arg, dict): - data = gen_data(arg, need_grad, convert_type) + data = gen_data(arg, need_grad, convert_type, real_data_path) else: print_warn_log(f'Warning: {arg} is not supported') raise NotImplementedError() @@ -191,21 +193,22 @@ def gen_args(args_info, need_grad=True, convert_type=None): return args_result -def gen_kwargs(api_info, convert_type=None): +def gen_kwargs(api_info, convert_type=None, real_data_path=None): """ Function Description: Based on API basic information, generate input parameters: kwargs, for API forward running Parameter: api_info: API basic information. Dict convert_type: convert ori_type to dist_type flag. + real_data_path: the root directory for storing real data. """ check_object_type(api_info, dict) kwargs_params = api_info.get("kwargs") for key, value in kwargs_params.items(): if isinstance(value, (list, tuple)): - kwargs_params[key] = gen_list_kwargs(value, convert_type) + kwargs_params[key] = gen_list_kwargs(value, convert_type, real_data_path) elif value.get('type') in TENSOR_DATA_LIST or value.get('type').startswith("numpy"): - kwargs_params[key] = gen_data(value, False, convert_type) + kwargs_params[key] = gen_data(value, False, convert_type, real_data_path) elif value.get('type') in TORCH_TYPE: gen_torch_kwargs(kwargs_params, key, value) else: @@ -220,7 +223,7 @@ def gen_torch_kwargs(kwargs_params, key, value): kwargs_params[key] = eval(value.get('value')) -def gen_list_kwargs(kwargs_item_value, convert_type): +def gen_list_kwargs(kwargs_item_value, convert_type, real_data_path=None): """ Function Description: When kwargs value is list, generate the list of kwargs result @@ -231,14 +234,14 @@ def gen_list_kwargs(kwargs_item_value, convert_type): kwargs_item_result = [] for item in 
kwargs_item_value: if item.get('type') in TENSOR_DATA_LIST: - item_value = gen_data(item, False, convert_type) + item_value = gen_data(item, False, convert_type, real_data_path) else: item_value = item.get('value') kwargs_item_result.append(item_value) return kwargs_item_result -def gen_api_params(api_info, need_grad=True, convert_type=None): +def gen_api_params(api_info, need_grad=True, convert_type=None, real_data_path=None): """ Function Description: Based on API basic information, generate input parameters: args, kwargs, for API forward running @@ -251,9 +254,9 @@ def gen_api_params(api_info, need_grad=True, convert_type=None): if convert_type and convert_type not in Const.CONVERT: error_info = f"convert_type params not support {convert_type}." raise CompareException(CompareException.INVALID_PARAM_ERROR, error_info) - kwargs_params = gen_kwargs(api_info, convert_type) + kwargs_params = gen_kwargs(api_info, convert_type, real_data_path) if api_info.get("args"): - args_params = gen_args(api_info.get("args"), need_grad, convert_type) + args_params = gen_args(api_info.get("args"), need_grad, convert_type, real_data_path) else: print_warn_log(f'Warning: No args in {api_info} ') args_params = [] diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py index 7b23ea55793022183ea5aec788f7a2f71693d742..df8d8b153bd3959b4fd2d0f6ae06ed94fda93d66 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py @@ -66,9 +66,10 @@ def run_torch_api(api_full_name, api_info_dict): torch.npu.clear_npu_overflow_flag() api_type = api_full_name.split("_")[0] api_name = api_full_name.split("_", 1)[1].rsplit("_", 2)[0] - args, kwargs, need_grad = get_api_info(api_info_dict, api_name) + args, kwargs, need_grad = get_api_info(api_info_dict, api_name, real_data_path='') if not need_grad: - 
print_warn_log("%s function with out=... arguments don't support automatic differentiation, skip backward." % api_full_name) + print_warn_log("%s function with out=... arguments don't support automatic differentiation, skip backward." + % api_full_name) npu_args, npu_kwargs = generate_device_params(args, kwargs, False) if kwargs.get("device"): del kwargs["device"] diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 1e391e243355f4d207840b8fcb2e0d036162a646..5dd50566da4c188c7172eae87f7cfdc5a16c9a43 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -35,7 +35,7 @@ UT_ERROR_DATA_DIR = 'ut_error_data' + current_time RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + ".csv" DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + ".csv" RunUTConfig = namedtuple('RunUTConfig', ['forward_content', 'backward_content', 'result_csv_path', 'details_csv_path', - 'save_error_data', 'is_continue_run_ut']) + 'save_error_data', 'is_continue_run_ut', 'real_data_path']) not_backward_list = ['repeat_interleave'] tqdm_params = { @@ -140,7 +140,6 @@ def run_ut(config): if config.save_error_data: error_data_path = os.path.abspath(os.path.join(msCheckerConfig.error_data_path, UT_ERROR_DATA_DIR)) print_info_log(f"UT task error_datas will be saved in {error_data_path}") - api_setting_dict = get_json_contents("torch_ut_setting.json") compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut) with FileOpen(config.result_csv_path, 'r') as file: csv_reader = csv.reader(file) @@ -154,7 +153,7 @@ def run_ut(config): [_, api_name, _] = api_full_name.split("*") if api_name not in set(msCheckerConfig.white_list): continue - data_info = run_torch_api(api_full_name, api_setting_dict, config.backward_content, api_info_dict) + data_info = run_torch_api(api_full_name, 
config.real_data_path, config.backward_content, api_info_dict) is_fwd_success, is_bwd_success = compare.compare_output(api_full_name, data_info.bench_out, data_info.device_out, @@ -189,10 +188,10 @@ def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) UtAPIInfo(api_full_name + '.backward.output.device', data_info.device_grad_out) -def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict): +def run_torch_api(api_full_name, real_data_path, backward_content, api_info_dict): in_fwd_data_list = [] [api_type, api_name, _] = api_full_name.split("*") - args, kwargs, need_grad = get_api_info(api_info_dict, api_name) + args, kwargs, need_grad = get_api_info(api_info_dict, api_name, real_data_path) in_fwd_data_list.append(args) in_fwd_data_list.append(kwargs) need_backward = api_full_name in backward_content @@ -208,59 +207,54 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di del kwargs["device"] cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward) device_args, device_kwargs = generate_device_params(args, kwargs, need_backward) - grad_out, device_grad_out = None, None + bench_grad_out, device_grad_out = None, None out = exec_api(api_type, api_name, cpu_args, cpu_kwargs) device_out = exec_api(api_type, api_name, device_args, device_kwargs) + api_setting_dict = get_json_contents("torch_ut_setting.json") grad_input_index = api_setting_dict.get(api_name) grad_index = None - grad = None + grad, bench_grad = None, None if grad_input_index is not None: grad_index = grad_input_index.get('grad_index') if need_backward: - grad_out, device_grad_out, grad, device_grad = run_backward( - api_full_name, cpu_args, backward_content, grad_index, device_args, device_out, out) + backward_args = backward_content[api_full_name] + grad = gen_args(backward_args, real_data_path=real_data_path)[0] + bench_grad, _ = generate_cpu_params(grad, {}, False) + bench_grad_out = run_backward(cpu_args, 
def get_api_info(api_info_dict, api_name, real_data_path=None):
    """
    Function Description:
        Preprocess the API information and generate the forward inputs from it.
    Parameter:
        api_info_dict: API basic information. Dict
        api_name: name of the API, forwarded to api_info_preprocess.
        real_data_path: the root directory for storing real data, forwarded to
            gen_api_params. Defaults to None like the gen_* functions it feeds.
    Return:
        (args, kwargs, need_grad); need_grad is False only when the kwargs of the
        preprocessed API information contain an "out" entry.
    """
    convert_type, api_info_dict = api_info_preprocess(api_name, api_info_dict)
    # A missing or empty "kwargs" entry counts as "no out argument".
    need_grad = "out" not in (api_info_dict.get("kwargs") or {})
    args, kwargs = gen_api_params(api_info_dict, need_grad, convert_type, real_data_path)
    return args, kwargs, need_grad


def run_backward(args, grad, grad_index, out):
    """
    Function Description:
        Call backward on a forward output and collect the gradients of the tensor inputs.
    Parameter:
        args: forward inputs; entries that are not torch.Tensor are ignored.
        grad: gradient handed to backward().
        grad_index: index of the output to run backward on, or None to use out itself.
        out: forward output; a list/tuple is only accepted together with grad_index.
    Return:
        List with the .grad attribute of every torch.Tensor in args, in input order.
    Exception:
        NotImplementedError when out is a list/tuple and grad_index is None.
    """
    if grad_index is not None:
        out[grad_index].backward(grad)
    elif isinstance(out, (list, tuple)):
        raise NotImplementedError("Multiple backward is not supported.")
    else:
        out.backward(grad)
    return [arg.grad for arg in args if isinstance(arg, torch.Tensor)]
self.assertTrue(os.path.exists(result.get('datapath'))) + datapath = result.get('datapath') + self.assertTrue(datapath.startswith('forward_real_data') or datapath.startswith('backward_real_data')) def test_analyze_builtin(self): arg = slice(1, 10, 2) diff --git a/debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_dump_scopr.py b/debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_dump_scope.py similarity index 90% rename from debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_dump_scopr.py rename to debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_dump_scope.py index b892a6077a3c26ae27343734aca8012e21d3fc2c..7712552abe49d757a07bcbbd746038ed22d4027b 100644 --- a/debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_dump_scopr.py +++ b/debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_dump_scope.py @@ -1,10 +1,12 @@ import unittest -from api_accuracy_checker.dump.dump_scope import * +from api_accuracy_checker.dump.dump_scope import iter_tracer from api_accuracy_checker.dump.dump import DumpUtil + class TestDumpScope(unittest.TestCase): def test_iter_tracer(self): DumpUtil.call_num = 0 + def dummy_func(): return "Hello, World!" 
diff --git a/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_data_generate.py b/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_data_generate.py index fff5d6e4bd7c978448576f4b328ab57e3ebc0b81..50f9131e47514bf3bb5c2e6dc06ad52b261a51b7 100644 --- a/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_data_generate.py +++ b/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_data_generate.py @@ -19,7 +19,7 @@ min_value = -5.125 class TestDataGenerateMethods(unittest.TestCase): def test_gen_api_params(self): api_info = copy.deepcopy(api_info_dict) - args_params, kwargs_params = gen_api_params(api_info, True, None) + args_params, kwargs_params = gen_api_params(api_info, True, None, None) max_diff = abs(args_params[0].max() - max_value) min_diff = abs(args_params[0].min() - min_value) self.assertEqual(len(args_params), 1) @@ -30,7 +30,7 @@ class TestDataGenerateMethods(unittest.TestCase): self.assertEqual(kwargs_params, {'inplace': False}) def test_gen_args(self): - args_result = gen_args(api_info_dict.get('args')) + args_result = gen_args(api_info_dict.get('args'), real_data_path=None) max_diff = abs(args_result[0].max() - max_value) min_diff = abs(args_result[0].min() - min_value) self.assertEqual(len(args_result), 1) @@ -40,7 +40,7 @@ class TestDataGenerateMethods(unittest.TestCase): self.assertEqual(args_result[0].shape, torch.Size([2, 2560, 24, 24])) def test_gen_data(self): - data = gen_data(api_info_dict.get('args')[0], True, None) + data = gen_data(api_info_dict.get('args')[0], True, None, None) max_diff = abs(data.max() - max_value) min_diff = abs(data.min() - min_value) self.assertEqual(data.dtype, torch.float32) diff --git a/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_run_ut.py b/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_run_ut.py index bca4f0b30886c7c1b907118f6b1510a9d3eb4160..21ec2f0072c7b9dba5a93234ce476b48e13d5622 100644 --- 
a/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_run_ut.py @@ -18,7 +18,7 @@ class TestRunUtMethods(unittest.TestCase): def test_exec_api(self): api_info = copy.deepcopy(api_info_dict) [api_type, api_name, _] = api_full_name.split("*") - args, kwargs, need_grad = get_api_info(api_info, api_name) + args, kwargs, need_grad = get_api_info(api_info, api_name, None) cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, True) out = exec_api(api_type, api_name, cpu_args, cpu_kwargs) self.assertEqual(out.dtype, torch.float64) @@ -52,7 +52,7 @@ class TestRunUtMethods(unittest.TestCase): def test_generate_cpu_params(self): api_info = copy.deepcopy(api_info_dict) [api_type, api_name, _] = api_full_name.split("*") - args, kwargs, need_grad = get_api_info(api_info, api_name) + args, kwargs, need_grad = get_api_info(api_info, api_name, None) cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, True) self.assertEqual(len(cpu_args), 1) self.assertEqual(cpu_args[0].dtype, torch.float64)