diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py new file mode 100644 index 0000000000000000000000000000000000000000..d7cab76d063100618cf2398818906170c3992f1f --- /dev/null +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_overflow_check.py @@ -0,0 +1,155 @@ +import argparse +import os +import sys +import torch_npu +import torch +from tqdm import tqdm +from api_accuracy_checker.run_ut.run_ut import exec_api, generate_npu_params, run_backward, init_environment, \ + get_api_info +from api_accuracy_checker.common.utils import print_info_log, print_warn_log, get_json_contents, api_info_preprocess, \ + print_error_log + + +NO_GRAD_APIS = ["hardtanh"] + +init_environment() + + +def check_tensor_overflow(x): + if isinstance(x, torch.Tensor) and x.numel() != 0 and x.dtype != torch.bool: + if len(x.shape) == 0: + tensor_max = x.cpu().detach().float().numpy().tolist() + tensor_min = tensor_max + else: + tensor_max = torch._C._VariableFunctionsClass.max(x).cpu().detach().float().numpy().tolist() + tensor_min = torch._C._VariableFunctionsClass.min(x).cpu().detach().float().numpy().tolist() + # inf + if tensor_max == float('inf') or tensor_min == float('-inf'): + return True + # nan + elif tensor_max != tensor_max or tensor_min != tensor_min: + return True + else: + return False + elif isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + if x == float('inf') or x == float('-inf') or x != x: + return True + else: + return False + else: + return False + + +def check_data_overflow(x): + if isinstance(x, (tuple, list)) and x: + for i, item in enumerate(x): + if check_data_overflow(item): + return True + return False + else: + return check_tensor_overflow(x) + + +def run_overflow_check(forward_file, backward_file): + print_info_log("start UT test") + forward_content = get_json_contents(forward_file) + backward_content = {} + if backward_file: + backward_content = get_json_contents(backward_file) + api_setting_dict = get_json_contents("torch_ut_setting.json") + for api_full_name, api_info_dict in tqdm(forward_content.items()): + try: + run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict) + except Exception as err: + api_name = api_full_name.split("_", 1)[1].rsplit("_", 2)[0] + if "not implemented for 'Half'" in str(err): + print_warn_log(f"API {api_name} not support half tensor in CPU, please add {api_name} to CONVERT_API " + f"'fp16_to_fp32' list in accuracy_tools/api_accuracy_check/common/utils.py file.") + elif "expected scalar type Long" in str(err): + print_warn_log(f"API {api_name} not support int32 tensor in CPU, please add {api_name} to CONVERT_API " + f"'int32_to_int64' list in accuracy_tools/api_accuracy_check/common/utils.py file.") + else: + print_error_log(f"Run {api_full_name} UT Error: %s" % str(err)) + + +def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict): + torch.npu.clear_npu_overflow_flag() + api_type = api_full_name.split("_")[0] + api_name = api_full_name.split("_", 1)[1].rsplit("_", 2)[0] + args, inplace, kwargs, need_grad = get_api_info(api_info_dict, api_name) + need_backward = api_full_name.replace("forward", "backward") in backward_content and api_name[-1] != "_" and \ + inplace is not True + need_backward = need_backward and need_grad + if inplace or not need_grad: + print_warn_log("%s involves in-place operations, skip backward" % api_full_name) + npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) + if kwargs.get("device"): + del kwargs["device"] + out = exec_api(api_type, api_name, args, kwargs) + npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) + if not need_backward: + cpu_overflow = check_data_overflow(out) + npu_overflow = torch_npu.npu.utils.npu_check_overflow(npu_out) + if cpu_overflow == npu_overflow: + print_warn_log("The %s overflow is a normal overflow." % api_full_name) + else: + print_warn_log("The %s overflow is an abnormal overflow." % api_full_name) + return + else: + api_full_name = api_full_name.replace("forward", "backward") + grad_input_index = api_setting_dict.get(api_name) + grad_index = None + if grad_input_index is not None: + grad_index = grad_input_index.get('grad_index') + + grad_out, npu_grad_out = run_backward(api_full_name, args, backward_content, grad_index, npu_args, npu_out, out) + + cpu_overflow = check_data_overflow(grad_out) + npu_overflow = torch_npu.npu.utils.npu_check_overflow(npu_grad_out) + if cpu_overflow == npu_overflow: + print_warn_log("The %s overflow is a normal overflow." % api_full_name) + else: + print_warn_log("The %s overflow is an abnormal overflow." % api_full_name) + return + + +def _run_ut_parser(parser): + parser.add_argument("-forward", "--forward_input_file", dest="forward_input_file", default="", + help=" The api param tool forward result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-backward", "--backward_input_file", dest="backward_input_file", default="", + help=" The api param tool backward result file: generate from api param tool, " + "a json file.", + required=False) + parser.add_argument("-c", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", + default=False, required=False) + parser.add_argument("-d", "--device", dest="device_id", type=int, help=" set NPU device id to run ut", + default=0, required=False) + + +def _run_overflow_check(): + parser = argparse.ArgumentParser() + _run_ut_parser(parser) + args = parser.parse_args(sys.argv[1:]) + torch.npu.set_compile_mode(jit_compile=args.jit_compile) + npu_device = "npu:" + str(args.device_id) + forward_file = os.path.realpath(args.forward_input_file) + backward_file = "" + if args.backward_input_file: + backward_file = os.path.realpath(args.backward_input_file) + if not backward_file.endswith(".json"): + raise ValueError("The backward_input_file should be a json file!") + if not forward_file.endswith(".json"): + raise ValueError("The forward_input_file should be a json file!") + try: + torch.npu.set_device(npu_device) + except Exception: + print_error_log(f"Set NPU device id failed. device id is: {args.device_id}") + raise NotImplementedError + run_overflow_check(forward_file, backward_file) + + +if __name__ == '__main__': + _run_overflow_check() + print_info_log("UT task completed.") diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index ee0a45e6faf3a8058164219899874c7fdd751817..ea69e5c61b06c10ad1392b44e0b7cf7165320d9c 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -1,185 +1,198 @@ -import argparse -import os -import sys -import torch_npu -import yaml -import torch -from tqdm import tqdm -from api_accuracy_checker.run_ut.data_generate import gen_api_params, gen_args -from api_accuracy_checker.common.utils import print_info_log, print_warn_log, get_json_contents, api_info_preprocess, \ - print_error_log -from api_accuracy_checker.compare.compare import Comparator -from api_accuracy_checker.hook_module.wrap_tensor import TensorOPTemplate -from api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplate -from api_accuracy_checker.hook_module.wrap_torch import TorchOPTemplate - -NO_GRAD_APIS = ["hardtanh"] - -cur_path = os.path.dirname(os.path.realpath(__file__)) -yaml_path = os.path.join(cur_path, "../hook_module/support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: - WrapFunctionalOps = yaml.safe_load(f).get('functional') - -for f in dir(torch.nn.functional): - if f != "__name__": - locals().update({f: getattr(torch.nn.functional, f)}) - - -def exec_api(api_type, api_name, args, kwargs): - if api_type == "Functional": - functional_api = FunctionalOPTemplate(api_name, str, False) - out = functional_api.forward(*args, **kwargs) - if api_type == "Tensor": - tensor_api = TensorOPTemplate(api_name, str, False) - out = tensor_api.forward(*args, **kwargs) - if api_type == "Torch": - torch_api = TorchOPTemplate(api_name, str, False) - out = torch_api.forward(*args, **kwargs) - return out - - -def generate_npu_params(cpu_args, cpu_kwargs, need_backward): - def recursive_arg_to_npu(arg_in): - if isinstance(arg_in, (list, tuple)): - return type(arg_in)(recursive_arg_to_npu(arg) for arg in arg_in) - elif isinstance(arg_in, torch.Tensor): - if need_backward and arg_in.requires_grad: - arg_in = arg_in.clone().detach().to("npu").requires_grad_() - temp_arg_in = arg_in * 1 - arg_in = temp_arg_in.type_as(arg_in) - arg_in.retain_grad() - return arg_in - else: - return arg_in.clone().detach().to("npu") - else: - return arg_in - - npu_args = recursive_arg_to_npu(cpu_args) - npu_kwargs = {key: recursive_arg_to_npu(value) for key, value in cpu_kwargs.items()} - return npu_args, npu_kwargs - - -def run_ut(forward_file, backward_file, out_path, save_error_data): - print_info_log("start UT test") - forward_content = get_json_contents(forward_file) - backward_content = get_json_contents(backward_file) - api_setting_dict = get_json_contents("torch_ut_setting.json") - compare = Comparator(out_path) - for api_full_name, api_info_dict in tqdm(forward_content.items()): - try: - grad_out, npu_grad_out, npu_out, out = run_torch_api(api_full_name, api_setting_dict, backward_content, - api_info_dict) - compare.compare_output(api_full_name, out, npu_out, grad_out, npu_grad_out) - except Exception as err: - [_, api_name, _] = api_full_name.split("*") - if "not implemented for 'Half'" in str(err): - print_warn_log(f"API {api_name} not support half tensor in CPU, please add {api_name} to CONVERT_API " - f"'fp16_to_fp32' list in accuracy_tools/api_accuracy_check/common/utils.py file.") - elif "expected scalar type Long" in str(err): - print_warn_log(f"API {api_name} not support int32 tensor in CPU, please add {api_name} to CONVERT_API " - f"'int32_to_int64' list in accuracy_tools/api_accuracy_check/common/utils.py file.") - else: - print_error_log(f"Run {api_full_name} UT Error: %s" % str(err)) - - compare.print_pretest_result() - compare.write_compare_csv() - - -def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict): - [api_type, api_name, _] = api_full_name.split("*") - convert_type, api_info_dict = api_info_preprocess(api_name, api_info_dict) - need_grad = True - if api_info_dict.get("kwargs") and "out" in api_info_dict.get("kwargs"): - need_grad = False - if api_name[-1] == "_" or api_name in NO_GRAD_APIS: - need_grad = False - args, kwargs = gen_api_params(api_info_dict, need_grad, convert_type) - inplace = kwargs.get("inplace") if kwargs.get("inplace") else None - need_backward = api_full_name in backward_content and api_name[-1] != "_" and inplace is not True - need_backward = need_backward and need_grad - if inplace or not need_grad: - print_warn_log("%s involves in-place operations, skip backward" % api_full_name) - npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) - grad_out, npu_grad_out = None, None - if kwargs.get("device"): - del kwargs["device"] - out = exec_api(api_type, api_name, args, kwargs) - npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) - grad_input_index = api_setting_dict.get(api_name) - grad_index = None - if grad_input_index is not None: - grad_index = grad_input_index.get('grad_index') - - if need_backward: - backward_args = backward_content[api_full_name] - grad = gen_args(backward_args)[0] - if grad_index is not None: - out[grad_index].backward(grad) - elif isinstance(out, (list, tuple)): - raise NotImplementedError("Multiple backward is not supported.") - else: - out.backward(grad) - args_grad = [] - for arg in args: - if isinstance(arg, torch.Tensor): - args_grad.append(arg.grad) - grad_out = args_grad - - npu_grad = grad.clone().detach().npu() - if grad_index is not None: - npu_out[grad_index].backward(npu_grad) - else: - npu_out.backward(npu_grad) - npu_args_grad = [] - for arg in npu_args: - if isinstance(arg, torch.Tensor): - npu_args_grad.append(arg.grad) - npu_grad_out = npu_args_grad - if grad_index is not None: - return grad_out, npu_grad_out, npu_out[grad_index], out[grad_index] - return grad_out, npu_grad_out, npu_out, out - - -def _run_ut_parser(parser): - parser.add_argument("-forward", "--forward_input_file", dest="forward_input_file", default="", - help=" The api param tool forward result file: generate from api param tool, " - "a json file.", - required=True) - parser.add_argument("-backward", "--backward_input_file", dest="backward_input_file", default="", - help=" The api param tool backward result file: generate from api param tool, " - "a json file.", - required=True) - parser.add_argument("-o", "--out_path", dest="out_path", default="", - help=" The ut task result out path.", - required=False) - parser.add_argument('-save_error_data', dest="save_error_data", action="store_true", - help=" Save compare failed api output.", required=False) - parser.add_argument("-c", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", - default=False, required=False) - parser.add_argument("-d", "--device", dest="device_id", type=int, help=" set NPU device id to run ut", - default=0, required=False) - - -def _run_ut(): - parser = argparse.ArgumentParser() - _run_ut_parser(parser) - args = parser.parse_args(sys.argv[1:]) - torch.npu.set_compile_mode(jit_compile=args.jit_compile) - npu_device = "npu:" + str(args.device_id) - try: - torch.npu.set_device(npu_device) - except Exception: - print_error_log(f"Set NPU device id failed. device id is: {args.device_id}") - raise NotImplementedError - forward_file = os.path.realpath(args.forward_input_file) - backward_file = os.path.realpath(args.backward_input_file) - if not forward_file.endswith(".json") or not backward_file.endswith(".json"): - raise ValueError("The forward_input_file and backward_input_file should be a json file!") - out_path = os.path.realpath(args.out_path) if args.out_path else "./" - save_error_data = args.save_error_data - run_ut(forward_file, backward_file, out_path, save_error_data) - - -if __name__ == '__main__': - _run_ut() - print_info_log("UT task completed.") +import argparse +import os +import sys +import torch_npu +import yaml +import torch +from tqdm import tqdm +from api_accuracy_checker.run_ut.data_generate import gen_api_params, gen_args +from api_accuracy_checker.common.utils import print_info_log, print_warn_log, get_json_contents, api_info_preprocess, \ + print_error_log +from api_accuracy_checker.compare.compare import Comparator +from api_accuracy_checker.hook_module.wrap_tensor import TensorOPTemplate +from api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplate +from api_accuracy_checker.hook_module.wrap_torch import TorchOPTemplate + +NO_GRAD_APIS = ["hardtanh"] + + +def init_environment(): + cur_path = os.path.dirname(os.path.realpath(__file__)) + yaml_path = os.path.join(cur_path, "../hook_module/support_wrap_ops.yaml") + with open(yaml_path, 'r') as f: + WrapFunctionalOps = yaml.safe_load(f).get('functional') + for f in dir(torch.nn.functional): + if f != "__name__": + locals().update({f: getattr(torch.nn.functional, f)}) + + +init_environment() + + +def exec_api(api_type, api_name, args, kwargs): + if api_type == "Functional": + functional_api = FunctionalOPTemplate(api_name, str, False) + out = functional_api.forward(*args, **kwargs) + if api_type == "Tensor": + tensor_api = TensorOPTemplate(api_name, str, False) + out = tensor_api.forward(*args, **kwargs) + if api_type == "Torch": + torch_api = TorchOPTemplate(api_name, str, False) + out = torch_api.forward(*args, **kwargs) + return out + + +def generate_npu_params(cpu_args, cpu_kwargs, need_backward): + def recursive_arg_to_npu(arg_in): + if isinstance(arg_in, (list, tuple)): + return type(arg_in)(recursive_arg_to_npu(arg) for arg in arg_in) + elif isinstance(arg_in, torch.Tensor): + if need_backward and arg_in.requires_grad: + arg_in = arg_in.clone().detach().to("npu").requires_grad_() + temp_arg_in = arg_in * 1 + arg_in = temp_arg_in.type_as(arg_in) + arg_in.retain_grad() + return arg_in + else: + return arg_in.clone().detach().to("npu") + else: + return arg_in + + npu_args = recursive_arg_to_npu(cpu_args) + npu_kwargs = {key: recursive_arg_to_npu(value) for key, value in cpu_kwargs.items()} + return npu_args, npu_kwargs + + +def run_ut(forward_file, backward_file, out_path, save_error_data): + print_info_log("start UT test") + forward_content = get_json_contents(forward_file) + backward_content = get_json_contents(backward_file) + api_setting_dict = get_json_contents("torch_ut_setting.json") + compare = Comparator(out_path) + for api_full_name, api_info_dict in tqdm(forward_content.items()): + try: + grad_out, npu_grad_out, npu_out, out = run_torch_api(api_full_name, api_setting_dict, backward_content, + api_info_dict) + compare.compare_output(api_full_name, out, npu_out, grad_out, npu_grad_out) + except Exception as err: + [_, api_name, _] = api_full_name.split("*") + if "not implemented for 'Half'" in str(err): + print_warn_log(f"API {api_name} not support half tensor in CPU, please add {api_name} to CONVERT_API " + f"'fp16_to_fp32' list in accuracy_tools/api_accuracy_check/common/utils.py file.") + elif "expected scalar type Long" in str(err): + print_warn_log(f"API {api_name} not support int32 tensor in CPU, please add {api_name} to CONVERT_API " + f"'int32_to_int64' list in accuracy_tools/api_accuracy_check/common/utils.py file.") + else: + print_error_log(f"Run {api_full_name} UT Error: %s" % str(err)) + + compare.print_pretest_result() + compare.write_compare_csv() + + +def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict): + [api_type, api_name, _] = api_full_name.split("*") + args, inplace, kwargs, need_grad = get_api_info(api_info_dict, api_name) + need_backward = api_full_name in backward_content and api_name[-1] != "_" and inplace is not True + need_backward = need_backward and need_grad + if inplace or not need_grad: + print_warn_log("%s involves in-place operations, skip backward" % api_full_name) + npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) + grad_out, npu_grad_out = None, None + if kwargs.get("device"): + del kwargs["device"] + out = exec_api(api_type, api_name, args, kwargs) + npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) + grad_input_index = api_setting_dict.get(api_name) + grad_index = None + if grad_input_index is not None: + grad_index = grad_input_index.get('grad_index') + + if need_backward: + grad_out, npu_grad_out = run_backward(api_full_name, args, backward_content, grad_index, npu_args, npu_out, out) + if grad_index is not None: + return grad_out, npu_grad_out, npu_out[grad_index], out[grad_index] + return grad_out, npu_grad_out, npu_out, out + + +def get_api_info(api_info_dict, api_name): + convert_type, api_info_dict = api_info_preprocess(api_name, api_info_dict) + need_grad = True + if api_info_dict.get("kwargs") and "out" in api_info_dict.get("kwargs"): + need_grad = False + if api_name[-1] == "_" or api_name in NO_GRAD_APIS: + need_grad = False + args, kwargs = gen_api_params(api_info_dict, need_grad, convert_type) + inplace = kwargs.get("inplace") if kwargs.get("inplace") else None + return args, inplace, kwargs, need_grad + + +def run_backward(api_full_name, args, backward_content, grad_index, npu_args, npu_out, out): + backward_args = backward_content[api_full_name] + grad = gen_args(backward_args)[0] + if grad_index is not None: + out[grad_index].backward(grad) + elif isinstance(out, (list, tuple)): + raise NotImplementedError("Multiple backward is not supported.") + else: + out.backward(grad) + args_grad = [] + for arg in args: + if isinstance(arg, torch.Tensor): + args_grad.append(arg.grad) + grad_out = args_grad + npu_grad = grad.clone().detach().npu() + if grad_index is not None: + npu_out[grad_index].backward(npu_grad) + else: + npu_out.backward(npu_grad) + npu_args_grad = [] + for arg in npu_args: + if isinstance(arg, torch.Tensor): + npu_args_grad.append(arg.grad) + npu_grad_out = npu_args_grad + return grad_out, npu_grad_out + + +def _run_ut_parser(parser): + parser.add_argument("-forward", "--forward_input_file", dest="forward_input_file", default="", + help=" The api param tool forward result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-backward", "--backward_input_file", dest="backward_input_file", default="", + help=" The api param tool backward result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-o", "--out_path", dest="out_path", default="", + help=" The ut task result out path.", + required=False) + parser.add_argument('-save_error_data', dest="save_error_data", action="store_true", + help=" Save compare failed api output.", required=False) + parser.add_argument("-c", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", + default=False, required=False) + parser.add_argument("-d", "--device", dest="device_id", type=int, help=" set NPU device id to run ut", + default=0, required=False) + + +def _run_ut(): + parser = argparse.ArgumentParser() + _run_ut_parser(parser) + args = parser.parse_args(sys.argv[1:]) + torch.npu.set_compile_mode(jit_compile=args.jit_compile) + npu_device = "npu:" + str(args.device_id) + try: + torch.npu.set_device(npu_device) + except Exception: + print_error_log(f"Set NPU device id failed. device id is: {args.device_id}") + raise NotImplementedError + forward_file = os.path.realpath(args.forward_input_file) + backward_file = os.path.realpath(args.backward_input_file) + if not forward_file.endswith(".json") or not backward_file.endswith(".json"): + raise ValueError("The forward_input_file and backward_input_file should be a json file!") + out_path = os.path.realpath(args.out_path) if args.out_path else "./" + save_error_data = args.save_error_data + run_ut(forward_file, backward_file, out_path, save_error_data) + + +if __name__ == '__main__': + _run_ut() + print_info_log("UT task completed.")