def _is_inf_or_nan(value):
    """Return True when the Python scalar ``value`` is ``inf``, ``-inf`` or ``nan``."""
    # ``nan`` is the only value that compares unequal to itself.
    return value != value or value in (float('inf'), float('-inf'))


def check_tensor_overflow(x):
    """Tell whether ``x`` contains an overflowed (``inf``) or invalid (``nan``) value.

    Args:
        x: a ``torch.Tensor``, a Python ``bool``/``int``/``float``, or any other
            object.

    Returns:
        bool: True when a tensor's maximum or minimum element (or the scalar
        itself) is ``inf``, ``-inf`` or ``nan``. Empty tensors, bool tensors and
        objects of any other type always give False.
    """
    if isinstance(x, torch.Tensor):
        # Empty tensors have no extrema; bool tensors cannot encode inf/nan.
        if x.numel() == 0 or x.dtype == torch.bool:
            return False
        if len(x.shape) == 0:
            extrema = (x,)
        else:
            # NOTE(review): the private ``_VariableFunctionsClass`` entry points are
            # kept from the original code — presumably to bypass wrapped torch
            # functions; confirm before switching to ``torch.max``/``torch.min``.
            extrema = (torch._C._VariableFunctionsClass.max(x),
                       torch._C._VariableFunctionsClass.min(x))
        # ``item()`` yields a Python scalar without narrowing the dtype first, so a
        # finite float64 value above the float32 range is not turned into ``inf``
        # (the former ``.float()`` cast reported such values as overflow).
        return any(_is_inf_or_nan(extremum.detach().cpu().item()) for extremum in extrema)
    if isinstance(x, (bool, int, float)):
        return _is_inf_or_nan(x)
    return False


def check_data_overflow(x):
    """Recursively apply :func:`check_tensor_overflow` to ``x``.

    Non-empty lists and tuples are searched element by element (nested
    containers included) and the search stops at the first overflowing element.
    Anything else, including an empty list or tuple, is passed to
    :func:`check_tensor_overflow` directly.

    Returns:
        bool: True when any inspected element overflows.
    """
    if isinstance(x, (tuple, list)) and x:
        return any(check_data_overflow(item) for item in x)
    return check_tensor_overflow(x)
def _report_overflow(api_full_name, cpu_result, npu_result):
    """Log whether the CPU and NPU overflow checks agree for ``api_full_name``."""
    cpu_overflow = check_data_overflow(cpu_result)
    npu_overflow = torch_npu.npu.utils.npu_check_overflow(npu_result)
    if cpu_overflow == npu_overflow:
        print_warn_log("The %s overflow is a normal overflow." % api_full_name)
    else:
        print_warn_log("The %s overflow is an abnormal overflow." % api_full_name)


def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict):
    """Execute one dumped API twice and compare the two overflow verdicts.

    The NPU overflow flag is cleared, then the API is run once with the
    arguments returned by ``get_api_info`` and once with the arguments produced
    by ``generate_npu_params``. When a backward pass is required the gradients
    from ``run_backward`` are compared; otherwise the forward outputs are.

    Args:
        api_full_name: key of the forward dump entry; the API type is the text
            before the first ``_`` and the API name is what remains after
            dropping the last two ``_``-separated fields.
        api_setting_dict: per-API settings; an entry's ``grad_index`` selects
            the output used for backward.
        backward_content: backward dump, keyed by the forward name with
            ``forward`` replaced by ``backward``.
        api_info_dict: dumped argument description for this API.

    Returns:
        None. The verdict is only logged.
    """
    torch.npu.clear_npu_overflow_flag()
    api_type = api_full_name.split("_")[0]
    api_name = api_full_name.split("_", 1)[1].rsplit("_", 2)[0]
    args, inplace, kwargs, need_grad = get_api_info(api_info_dict, api_name)
    backward_name = api_full_name.replace("forward", "backward")
    need_backward = (backward_name in backward_content and api_name[-1] != "_"
                     and inplace is not True and need_grad)
    if inplace or not need_grad:
        print_warn_log("%s involves in-place operations, skip backward" % api_full_name)
    npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward)
    # The "device" kwarg is dropped only after the NPU parameters were derived.
    if kwargs.get("device"):
        del kwargs["device"]
    out = exec_api(api_type, api_name, args, kwargs)
    npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs)
    if not need_backward:
        _report_overflow(api_full_name, out, npu_out)
        return

    setting = api_setting_dict.get(api_name)
    grad_index = setting.get('grad_index') if setting is not None else None
    grad_out, npu_grad_out = run_backward(backward_name, args, backward_content, grad_index, npu_args, npu_out,
                                          out)
    _report_overflow(backward_name, grad_out, npu_grad_out)


def _parse_bool_flag(value):
    """Convert a command-line token into a bool for argparse.

    Raises:
        argparse.ArgumentTypeError: when the token is not a recognised
            true/false spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1", "on"):
        return True
    if token in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)


def _run_ut_parser(parser):
    """Register the overflow-check command-line options on ``parser``.

    Options: ``-forward`` (required json path), ``-backward`` (optional json
    path), ``-c/--jit_compile`` (boolean, default False) and ``-d/--device``
    (integer NPU id, default 0).
    """
    parser.add_argument("-forward", "--forward_input_file", dest="forward_input_file", default="",
                        help="<Required> The api param tool forward result file: generate from api param tool, "
                             "a json file.",
                        required=True)
    parser.add_argument("-backward", "--backward_input_file", dest="backward_input_file", default="",
                        help="<optional> The api param tool backward result file: generate from api param tool, "
                             "a json file.",
                        required=False)
    # Without a converter argparse stores the raw string, so "-c False" was truthy.
    parser.add_argument("-c", "--jit_compile", dest="jit_compile", type=_parse_bool_flag,
                        help="<optional> whether to turn on jit compile", default=False, required=False)
    parser.add_argument("-d", "--device", dest="device_id", type=int, help="<optional> set NPU device id to run ut",
                        default=0, required=False)
def init_environment():
    """Load the wrapped-op list and expose ``torch.nn.functional`` names in this module.

    Reads ``../hook_module/support_wrap_ops.yaml`` (relative to this file),
    stores its ``functional`` entry in the module-level ``WrapFunctionalOps``,
    and copies the non-dunder attributes of ``torch.nn.functional`` into this
    module's global namespace.

    Inside a function ``locals().update(...)`` does not create bindings and a
    plain assignment to ``WrapFunctionalOps`` stays local, so both are written
    through the module globals explicitly.
    """
    global WrapFunctionalOps
    cur_path = os.path.dirname(os.path.realpath(__file__))
    yaml_path = os.path.join(cur_path, "../hook_module/support_wrap_ops.yaml")
    with open(yaml_path, 'r') as yaml_file:
        WrapFunctionalOps = yaml.safe_load(yaml_file).get('functional')
    module_namespace = globals()
    for attr_name in dir(torch.nn.functional):
        # Dunder attributes (__name__, __file__, __spec__, ...) describe
        # torch.nn.functional itself and must not overwrite this module's own.
        if attr_name.startswith("__"):
            continue
        module_namespace[attr_name] = getattr(torch.nn.functional, attr_name)
def get_api_info(api_info_dict, api_name):
    """Build the call arguments for ``api_name`` and decide whether it needs gradients.

    Gradients are disabled when the dumped kwargs contain ``out``, when the API
    name ends with ``_``, or when the name is listed in ``NO_GRAD_APIS``.

    Returns:
        tuple: ``(args, inplace, kwargs, need_grad)`` where ``inplace`` is the
        truthy ``inplace`` kwarg or ``None``.
    """
    convert_type, api_info_dict = api_info_preprocess(api_name, api_info_dict)
    dumped_kwargs = api_info_dict.get("kwargs")
    writes_to_out = bool(dumped_kwargs) and "out" in dumped_kwargs
    name_disables_grad = api_name[-1] == "_" or api_name in NO_GRAD_APIS
    need_grad = not (writes_to_out or name_disables_grad)
    args, kwargs = gen_api_params(api_info_dict, need_grad, convert_type)
    inplace = kwargs.get("inplace") or None
    return args, inplace, kwargs, need_grad


def run_backward(api_full_name, args, backward_content, grad_index, npu_args, npu_out, out):
    """Back-propagate the dumped gradient through both outputs and collect input grads.

    The gradient is generated from ``backward_content[api_full_name]``. It is
    applied to ``out`` and, after ``clone().detach().npu()``, to ``npu_out``;
    when ``grad_index`` is given the indexed element of each output is used.

    Returns:
        tuple: ``(grad_out, npu_grad_out)``, the ``.grad`` of every tensor in
        ``args`` and in ``npu_args`` respectively.

    Raises:
        NotImplementedError: ``out`` is a list/tuple and no ``grad_index`` is set.
    """
    grad = gen_args(backward_content[api_full_name])[0]
    indexed = grad_index is not None
    if not indexed and isinstance(out, (list, tuple)):
        raise NotImplementedError("Multiple backward is not supported.")

    cpu_target = out[grad_index] if indexed else out
    cpu_target.backward(grad)
    grad_out = [arg.grad for arg in args if isinstance(arg, torch.Tensor)]

    npu_grad = grad.clone().detach().npu()
    npu_target = npu_out[grad_index] if indexed else npu_out
    npu_target.backward(npu_grad)
    npu_grad_out = [arg.grad for arg in npu_args if isinstance(arg, torch.Tensor)]
    return grad_out, npu_grad_out