diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 24959a62950f03974aaf2822dc5cc83747475cfd..eb84f1a772c5b511708f5a519b8df0251180b43d 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -15,6 +15,7 @@ # limitations under the License. """ import collections +import json import os import random import re @@ -456,3 +457,58 @@ def get_process_rank(model): return 0, False else: return device.index, True + + + def get_json_contents(file_path): ops = get_file_content_bytes(file_path) return json.loads(ops) + + + def get_file_content_bytes(file): check_input_file_valid(file) with open(file, 'rb') as file_handle: return file_handle.read() + + + def islink(path): path = os.path.abspath(path) return os.path.islink(path) + + + class SoftlinkCheckException(Exception): pass + + + MAX_JSON_FILE_SIZE = 10 * 1024 ** 2 LINUX_FILE_NAME_LENGTH_LIMIT = 200 + + + def check_path_length_valid(path): path = os.path.realpath(path) return len(os.path.basename(path)) <= LINUX_FILE_NAME_LENGTH_LIMIT + + + def check_path_pattern_valid(path): pattern = re.compile(r'(\.|/|:|_|-|\s|[~0-9a-zA-Z])+') if not pattern.fullmatch(path): raise ValueError('Only the following characters are allowed in the path: A-Z a-z 0-9 - _ . / :') + + + def check_input_file_valid(input_path, max_file_size=MAX_JSON_FILE_SIZE): if islink(input_path): raise SoftlinkCheckException("Input path doesn't support soft link.") + + input_path = os.path.realpath(input_path) + if not os.path.exists(input_path): + raise ValueError('Input file %s does not exist!' % input_path) + + if not os.access(input_path, os.R_OK): + raise PermissionError('Input file %s is not readable!' 
% input_path) + + check_path_pattern_valid(input_path) + + if not check_path_length_valid(input_path): + raise ValueError("The real path or file_name of input is too long.") + + if os.path.getsize(input_path) > max_file_size: + raise ValueError(f'The file is too large, exceeds {max_file_size // 1024 ** 2}MB') \ No newline at end of file diff --git a/debug/accuracy_tools/api_accuracy_checker/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/api_accuracy_checker/hook_module/support_wrap_ops.yaml index 6aefa1460b3df5d5c5cd5a69795a093053c47824..476de2b0df89d8653a1c92cce734f9b73c7be02d 100644 --- a/debug/accuracy_tools/api_accuracy_checker/hook_module/support_wrap_ops.yaml +++ b/debug/accuracy_tools/api_accuracy_checker/hook_module/support_wrap_ops.yaml @@ -146,7 +146,6 @@ tensor: - __eq__ - __ge__ - __gt__ - - __getitem__ - __iadd__ - __iand__ - __idiv__ diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 79e6ed365fca0ab0669527279251a659ee63559b..3a0c9c45155f8c4b64efc96c020ad68ec3b51901 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -1,93 +1,156 @@ -# 用户构造并运行api用例,注意前反向的区分 -import yaml +import argparse import os -import json +import sys +sys.path.append("..") +import yaml import torch - -FLOAT_TYPE = ['torch.float32', 'torch.float', 'torch.float64', 'torch.double', 'torch.float16', \ - 'torch.half', 'torch.bfloat16'] +from data_generate import gen_api_params, gen_args +from common.utils import print_info_log, print_warn_log, get_json_contents +from compare.compare import Comparator cur_path = os.path.dirname(os.path.realpath(__file__)) -yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +yaml_path = os.path.join(cur_path, "../hook_module/support_wrap_ops.yaml") with open(yaml_path, 'r') as f: WrapFunctionalOps = yaml.safe_load(f).get('functional') for f in 
dir(torch.nn.functional): - locals().update({f: getattr(torch.nn.functional, f)}) - - -def run_ut(): - print("start") - forward_pkl = open("/home/wangchao/torch_test/dump_data_new/npu/ptdbg_dump_v1.0/rank0/dump.pkl") - backward_pkl = open("/home/wangchao/torch_test/dump_data_new/npu/ptdbg_dump_v1.0/rank0/dump_backward.pkl") - forward_content = forward_pkl.readlines() - backward_content = backward_pkl.readlines() - for api_info in forward_content: - api_json = json.loads(api_info) - for key, value in api_json.items(): - [api_type, api_name, index, mode] = key.split("*") - print(api_name) - api_feature = key.rsplit("*", 1)[0] - args, kwargs = generate_input(value.get("args"), api_json.get("kwargs")) - if api_type == "Functional": - out = eval(api_name)(*args, **kwargs) - if api_type == "Tensor": - out = getattr(torch._C._TensorBase, str(api_name))(*args, **kwargs) - for line in backward_content: - if api_feature in line: - api_back_json = json.loads(line) - for params in api_back_json.values(): - grad = nested_generate_input(params.get("args"), True, False) - out.backward(grad) - input_grad = [tensor.grad for tensor in args if isinstance(tensor, torch.Tensor)] - print("forward") - print(out) - print("backward") - print(input_grad) - - -def generate_input(input_args, input_kwargs, need_backward=True, need_convert=False): # 没有考虑dict of tensor - args = [] - kwargs = {} - - for info in input_args: - args.append(nested_generate_input(info, need_backward, need_convert)) - if kwargs: - for key, info in input_kwargs.items(): - kwargs[key] = nested_generate_input(info, need_backward, need_convert) - return args, kwargs - - -def nested_generate_input(info, need_backward, need_convert): - if isinstance(info, list): - result = [] - for i in info: - result.append(nested_generate_input(i, need_backward, need_convert)) - return result - # return list(map(nested_generate_input, info)) - # elif isinstance(input_info, tuple): - # return tuple(map(generate_input, input_info)) + if f != 
"__name__": + locals().update({f: getattr(torch.nn.functional, f)}) + + +def exec_api(api_type, api_name, args, kwargs): + if api_type == "Functional": + out = eval(api_name)(*args, **kwargs) + if api_type == "Tensor": + out = getattr(torch._C._TensorBase, str(api_name))(*args, **kwargs) + if api_type == "Torch": + out = getattr(torch._C._VariableFunctionsClass, str(api_name))(*args, **kwargs) + return out + + +def generate_npu_params(cpu_args, cpu_kwargs, need_backward): + npu_args = [] + npu_kwargs = {} + if need_backward: + for arg_in in cpu_args: + arg_in = arg_to_npu(arg_in) + npu_args.append(arg_in) + for key, value in cpu_kwargs.items(): + value = arg_to_npu(value) + npu_kwargs[key] = value else: - if info['type'] == 'torch.Tensor': - low, high = info['Min'], info['Max'] - data_dtype = info['dtype'] - if data_dtype in FLOAT_TYPE: #应该搞个float类型列表 - if need_convert and data_dtype == "torch.float16": - data_dtype = "torch.float32" - scale = high - low - rand01 = torch.rand(tuple(info['shape']), dtype=eval(data_dtype)) - inpt = rand01 * scale + low - if need_backward: - inpt.requires_grad_(True) - inpt.retain_grad() - elif 'int' in data_dtype or 'long' in data_dtype: # 应该搞个int类型列表, - inpt = torch.randint(int(low), int(high)+1, tuple(info['shape']), - dtype=eval(data_dtype)) # high + 1因为右边是开区间 - else: - print(f'Warning: Dtype is not supported: ', info['dtype']) - raise NotImplementedError() + for arg_in in cpu_args: + if isinstance(arg_in, torch.Tensor): + arg_in = arg_in.clone().detach().to("npu") + npu_args.append(arg_in) + for key, value in cpu_kwargs.items(): + if isinstance(value, torch.Tensor): + value = value.clone().detach().to("npu") + npu_kwargs[key] = value + return npu_args, npu_kwargs + + +def arg_to_npu(arg_in): + if isinstance(arg_in, torch.Tensor) and arg_in.dtype in [torch.float, torch.float16, + torch.float64] and arg_in.requires_grad: + arg_in = arg_in.clone().detach().to("npu").requires_grad_() + elif isinstance(arg_in, torch.Tensor): + arg_in 
= arg_in.clone().detach().to("npu") + return arg_in + + +def run_ut(forward_file, backward_file, out_path, save_error_data): + print_info_log("start UT test") + forward_content = get_json_contents(forward_file) + backward_content = get_json_contents(backward_file) + api_setting_dict = get_json_contents("torch_ut_setting.json") + compare = Comparator(out_path) + for api_full_name, api_info_dict in forward_content.items(): + grad_out, npu_grad_out, npu_out, out = run_torch_api(api_full_name, api_setting_dict, backward_content, + api_info_dict) + compare.compare_output(api_full_name, out, npu_out, grad_out, npu_grad_out) + + compare.print_pretest_result() + compare.write_compare_csv() + + +def run_torch_api(api_full_name, api_setting_dict, backward_content, value): + [api_type, api_name, _] = api_full_name.split("*") + args, kwargs = gen_api_params(value, api_name[-1] != "_") + inplace = kwargs.get("inplace") if kwargs.get("inplace") else None + need_backward = api_full_name in backward_content and api_name[-1] != "_" and inplace is not True + if inplace or api_name[-1] == "_": + print_warn_log("%s involves in-place operations, skip backward" % api_full_name) + npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) + grad_out, npu_grad_out = None, None + out = exec_api(api_type, api_name, args, kwargs) + npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) + grad_input_index = api_setting_dict.get(api_name) + grad_index = None + if grad_input_index is not None: + grad_index = grad_input_index.get('grad_index') + + if need_backward: + backward_args = backward_content[api_full_name] + grad = gen_args(backward_args)[0] + if grad_index is not None: + out[grad_index].backward(grad) + elif isinstance(out, (list, tuple)): + raise NotImplementedError("Multiple backward is not supported.") + else: + out.backward(grad) + args_grad = [] + for arg in args: + if isinstance(arg, torch.Tensor): + args_grad.append(arg.grad) + grad_out = args_grad + + npu_grad 
= grad.clone().detach().npu() + if grad_index is not None: + npu_out[grad_index].backward(npu_grad) else: - inpt = info['value'] # 遗留问题:需要考虑是否要转换成原本类型 - return inpt + npu_out.backward(npu_grad) + npu_args_grad = [] + for arg in npu_args: + if isinstance(arg, torch.Tensor): + npu_args_grad.append(arg.grad) + npu_grad_out = npu_args_grad + return grad_out, npu_grad_out, npu_out, out + + +def _run_ut_parser(parser): + parser.add_argument("-forward", "--forward_input_file", dest="forward_input_file", default="", + help=" The api param tool forward result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-backward", "--backward_input_file", dest="backward_input_file", default="", + help=" The api param tool backward result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-o", "--out_path", dest="out_path", default="", + help=" The ut task result out path.", + required=False) + parser.add_argument('-save_error_data', dest="save_error_data", action="store_true", + help=" Save compare failed api output.", required=False) + parser.add_argument("-c", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", + default=True, required=False) + + +def _run_ut(): + parser = argparse.ArgumentParser() + _run_ut_parser(parser) + args = parser.parse_args(sys.argv[1:]) + if not args.jit_compile: + torch.npu.set_compile_mode(jit_compile=False) + forward_file = os.path.realpath(args.forward_input_file) + backward_file = os.path.realpath(args.backward_input_file) + if not forward_file.endswith(".json") or not backward_file.endswith(".json"): + raise ValueError("The forward_input_file and backward_input_file should be a json file!") + out_path = os.path.realpath(args.out_path) if args.out_path else "./" + save_error_data = args.save_error_data + run_ut(forward_file, backward_file, out_path, save_error_data) + -run_ut() \ No newline at end of file +if __name__ == '__main__': + 
_run_ut() + print_info_log("UT task completed.") \ No newline at end of file