diff --git a/debug/accuracy_tools/api_accuracy_checker/common/config.py b/debug/accuracy_tools/api_accuracy_checker/common/config.py
index 07dd4e6bfca90336b4d9c3c4f1deaed9aded17b6..e9882fa1f726fcc731c15cde04a49cc9022761fd 100644
--- a/debug/accuracy_tools/api_accuracy_checker/common/config.py
+++ b/debug/accuracy_tools/api_accuracy_checker/common/config.py
@@ -1,5 +1,6 @@
 import yaml
 import os
+import re
 from api_accuracy_checker.common.utils import check_file_or_directory_path
 
 class Config:
@@ -19,12 +20,17 @@ class Config:
             'dump_step': int,
             'error_data_path': str,
             'enable_dataloader': bool,
-            'target_iter': int
+            'target_iter': int,
+            'url': str,
+            'gpu_id': int,
+            'npu_id': int
         }
         if not isinstance(value, validators[key]):
             raise ValueError(f"{key} must be {validators[key].__name__} type")
         if key == 'target_iter' and value < 0:
             raise ValueError("target_iter must be greater than 0")
+        if key == 'url' and re.match(r'^http:\/\/(?:[0-9]{1,3}\.){3}[0-9]{1,3}(:[0-9]{1,5})$', value) is None:
+            raise ValueError("The format of the url is wrong")
         return value
 
     def __getattr__(self, item):
diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py
index 9304a9ca4fe0bbeabcbb6bea6fb97fad61c42163..59ce6b6f83917fa5776c52dab9792a9df447ef98 100644
--- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py
+++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py
@@ -17,6 +17,7 @@
 import collections
 import json
 import os
+import pickle
 import random
 import re
 import stat
@@ -100,6 +101,7 @@ class Const:
         "int32_to_int64": ["cross_entropy"]
     }
 
+
 class CompareConst:
     """
     Class for compare module const
@@ -601,6 +603,7 @@ def cross_entropy_process(api_info_dict):
         api_info_dict['args'][1]['Min'] = 0 #The second argument in cross_entropy should be -100 or not less than 0.
return api_info_dict + def initialize_save_path(save_path, dir_name): data_path = os.path.join(save_path, dir_name) if os.path.exists(data_path): @@ -609,9 +612,22 @@ def initialize_save_path(save_path, dir_name): os.mkdir(data_path, mode = 0o750) check_file_or_directory_path(data_path, True) + def write_pt(file_path, tensor): if os.path.exists(file_path): raise ValueError(f"File {file_path} already exists") torch.save(tensor, file_path) full_path = os.path.abspath(file_path) - return full_path \ No newline at end of file + return full_path + + +def load_file(file): + if not file.exists(): + raise FileNotFoundError + data = None + if file.suffix == ".pt": + data = torch.load(file) + elif file.suffix == ".p" or file.suffix == ".pickle": + with open(file, 'rb') as f: + data = pickle.load(f) + return data diff --git a/debug/accuracy_tools/api_accuracy_checker/config.yaml b/debug/accuracy_tools/api_accuracy_checker/config.yaml index 46f0ed8d41af82c43154263d940678321f57b814..beb31b28379ac676656301f6c2f58bd8aff78da2 100644 --- a/debug/accuracy_tools/api_accuracy_checker/config.yaml +++ b/debug/accuracy_tools/api_accuracy_checker/config.yaml @@ -6,4 +6,7 @@ real_data: False dump_step: 1000 error_data_path: './' enable_dataloader: True -target_iter: 1 \ No newline at end of file +target_iter: 1 +url: 'http://xx.xx.xx.xx:xxxx' +gpu_id: 0 +npu_id: 0 \ No newline at end of file diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py new file mode 100644 index 0000000000000000000000000000000000000000..564a9d7776bcd4ae176821269d5851df8b042648 --- /dev/null +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -0,0 +1,36 @@ +# !/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+
+import tempfile
+
+import uvicorn
+from fastapi import Form, FastAPI, UploadFile, File
+from fastapi.responses import PlainTextResponse
+
+from api_accuracy_checker.gpu.service import load_and_run_api, accept_store_temp
+from api_accuracy_checker.common.utils import print_info_log, print_error_log
+
+app = FastAPI()
+
+
+@app.post("/run-api")
+def run_api_by_file(api_name: str = Form(description="api name"), up_input_file: UploadFile = File(),
+                    gpu_id: int = Form(default=0, description="gpu id")):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        try:
+            _input_file = accept_store_temp(temp_dir, [up_input_file])[0]
+            print_info_log(f"Test {api_name}: input: {_input_file}")
+            return PlainTextResponse(content=load_and_run_api(_input_file, f"cuda:{gpu_id}"))
+        except Exception:
+            print_error_log("Http 500 Internal Server Error.")
+            raise
+
+
+@app.get("/hello")
+def hello_world():
+    return {"result": "hello"}
+
+
+if __name__ == '__main__':
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py
new file mode 100644
index 0000000000000000000000000000000000000000..35f0e17790b68433c53e9fa993600e4103cdb384
--- /dev/null
+++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py
@@ -0,0 +1,53 @@
+# !/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved.
+
+import pickle
+
+import torch
+from pathlib import Path
+
+import api_accuracy_checker.common.utils as utils
+from api_accuracy_checker.run_ut.run_ut import exec_api, generate_device_params
+
+
+def accept_store_temp(temp_dir, file_list):
+    file_path_list = []
+    for index, file in enumerate(file_list):
+        _file = Path(temp_dir).joinpath(f"{index}_{file.filename}").resolve()
+        with open(_file, 'wb') as fw:
+            fw.write(file.file.read())
+        file_path_list.append(_file)
+    return file_path_list
+
+
+def load_and_run_api(input_file, device):
+    data = utils.load_file(input_file)
+    gpu_args, gpu_kwargs = generate_device_params(data['args'], data['kwargs'], True, device)
+    gpu_grad, _ = generate_device_params(data['grad'], {}, False, device)
+    out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index'])
+    res = {'out': out, 'grad_out': grad_out}
+    return pickle.dumps(res, protocol=4)
+
+
+def run_gpu(api_type, api_name, gpu_args, gpu_kwargs, grad, grad_index):
+    out = exec_api(api_type, api_name, gpu_args, gpu_kwargs)
+    grad_out = None
+    if grad is not None:
+        grad_out = run_backward_gpu(gpu_args, grad, grad_index, out)
+    return out, grad_out
+
+
+def run_backward_gpu(args, grad, grad_index, out):
+    if grad_index is not None:
+        out[grad_index].backward(grad)
+    elif isinstance(out, (list, tuple)):
+        raise NotImplementedError("Multiple backward is not supported.")
+    else:
+        out.backward(grad)
+    args_grad = []
+    for arg in args:
+        if isinstance(arg, torch.Tensor):
+            args_grad.append(arg.grad)
+    grad_out = args_grad
+    return grad_out
diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
index 26c2f1c46018ce1f4c3069910e46af22138d3c06..ad0131ff9f1fc60a43755bd7611e7f79c66b84fe 100644
--- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
+++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
@@ -1,8 +1,21 @@
 import argparse
+import hashlib
+import json
 import os
 import copy
+import pickle
 import sys
-import torch_npu
+from urllib.parse import urljoin
+
+import requests
+from requests import RequestException
+
+try:
+    import torch_npu
+except ImportError:
+    IS_GPU = True
+else:
+    IS_GPU = False
 import yaml
 import torch
 from tqdm import tqdm
@@ -13,7 +26,7 @@
 from api_accuracy_checker.hook_module.wrap_tensor import TensorOPTemplate
 from api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplate
 from api_accuracy_checker.hook_module.wrap_torch import TorchOPTemplate
-from ut_api_info import UtAPIInfo
+from api_accuracy_checker.run_ut.ut_api_info import UtAPIInfo
 from api_accuracy_checker.common.config import msCheckerConfig
 
 NO_GRAD_APIS = ["hardtanh"]
@@ -45,25 +58,26 @@
     return out
 
-def generate_npu_params(input_args, input_kwargs, need_backward):
-    def recursive_arg_to_npu(arg_in):
+def generate_device_params(input_args, input_kwargs, need_backward, device_id):
+    def recursive_arg_to_device(arg_in):
         if isinstance(arg_in, (list, tuple)):
-            return type(arg_in)(recursive_arg_to_npu(arg) for arg in arg_in)
+            return type(arg_in)(recursive_arg_to_device(arg) for arg in arg_in)
         elif isinstance(arg_in, torch.Tensor):
             if need_backward and arg_in.requires_grad:
-                arg_in = arg_in.clone().detach().to("npu").requires_grad_()
+                arg_in = arg_in.clone().detach().to(device_id).requires_grad_()
                 temp_arg_in = arg_in * 1
                 arg_in = temp_arg_in.type_as(arg_in)
                 arg_in.retain_grad()
                 return arg_in
             else:
-                return arg_in.clone().detach().to("npu")
+                return arg_in.clone().detach().to(device_id)
         else:
             return arg_in
-    npu_args = recursive_arg_to_npu(input_args)
-    npu_kwargs = {key: recursive_arg_to_npu(value) for key, value in input_kwargs.items()}
-    return npu_args, npu_kwargs
+    device_args = recursive_arg_to_device(input_args)
+    device_kwargs = {key: recursive_arg_to_device(value) for key, value in input_kwargs.items()}
+    return device_args, device_kwargs
+
 
 def generate_cpu_params(input_args, input_kwargs, need_backward):
     def recursive_arg_to_cpu(arg_in):
@@ -72,7 +86,8 @@ def generate_cpu_params(input_args, input_kwargs, need_backward):
         elif isinstance(arg_in, torch.Tensor):
             if need_backward and arg_in.requires_grad:
                 if str(arg_in.dtype) in Const.RAISE_PRECISION.keys():
-                    arg_in = arg_in.clone().type(eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach().requires_grad_()
+                    arg_in = arg_in.clone().type(
+                        eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach().requires_grad_()
                 else:
                     arg_in = arg_in.clone().detach().requires_grad_()
                 temp_arg_in = arg_in * 1
@@ -90,6 +105,7 @@
     cpu_kwargs = {key: recursive_arg_to_cpu(value) for key, value in input_kwargs.items()}
     return cpu_args, cpu_kwargs
 
+
 def run_ut(forward_file, backward_file, out_path, save_error_data):
     print_info_log("start UT test")
     forward_content = get_json_contents(forward_file)
@@ -129,7 +145,6 @@
                   UtAPIInfo(api_full_name + '.backward.output.npu', data_info.npu_grad_out)
 
 
-
 def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict):
     in_fwd_data_list = []
     [api_type, api_name, _] = api_full_name.split("*")
@@ -141,24 +156,56 @@
     if inplace or not need_grad:
         print_warn_log("%s involves in-place operations, skip backward" % api_full_name)
     cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward)
-    npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward)
-    grad_out, npu_grad_out = None, None
-    if kwargs.get("device"):
-        del kwargs["device"]
-    out = exec_api(api_type, api_name, cpu_args, cpu_kwargs)
-    npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs)
+    npu_args, npu_kwargs = generate_device_params(args, kwargs, need_backward, f"npu:{msCheckerConfig.npu_id}")
     grad_input_index = api_setting_dict.get(api_name)
     grad_index = None
-    grad = None
+    grad, cpu_grad = None, None
     if grad_input_index is not None:
-        grad_index = grad_input_index.get('grad_index')
+        grad_index = grad_input_index.get("grad_index")
+    if need_backward:
+        backward_args = backward_content[api_full_name]
+        grad = gen_args(backward_args)[0]
+        cpu_grad, _ = generate_cpu_params(grad, {}, False)
+
+    transfer_input_data = {"args": cpu_args, "kwargs": cpu_kwargs, "grad": cpu_grad, "grad_index": grad_index,
+                           "api_type": api_type, "api_name": api_name}
+    store_file = store_testcase(transfer_input_data, "inputs")
+    grad_out, npu_grad_out = None, None
+    if kwargs.get("device"):
+        del kwargs["device"]
+    # run torch api in npu
+    npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs)
     if need_backward:
-        grad_out, npu_grad_out, grad, npu_grad = run_backward(api_full_name, cpu_args, backward_content, grad_index, npu_args,
-                                                              npu_out, out)
-    if grad_index is not None:
-        return UtDataInfo(grad_out, npu_grad_out, npu_out[grad_index], out[grad_index], grad, in_fwd_data_list)
-    return UtDataInfo(grad_out, npu_grad_out, npu_out, out, grad, in_fwd_data_list)
+        npu_grad_out = run_backward_npu(grad, grad_index, npu_args, npu_out)
+
+    # run torch api in remote gpu
+    gpu_out, gpu_grad_out = run_gpu_remote(api_full_name, store_file)
+    os.remove(store_file)
+
+    return UtDataInfo(gpu_grad_out, npu_grad_out, npu_out, gpu_out, grad, in_fwd_data_list)
+
+
+def store_testcase(input_data, store_path):
+    fname = hashlib.sha1(str(input_data).encode('utf-8')).hexdigest() + '.p'
+    store_file = os.path.join(store_path, fname)
+    with open(store_file, 'wb') as f:
+        pickle.dump(input_data, f)
+    return store_file
+
+
+def run_gpu_remote(api_full_name, input_file):
+    with open(input_file, 'rb') as file1:
+        files = {"up_input_file": file1}
+        data = {"api_name": api_full_name, "gpu_id": msCheckerConfig.gpu_id}
+        print_info_log(f"start upload {input_file}")
+        try:
+            result = requests.post(url=urljoin(msCheckerConfig.url, "run-api"), files=files, data=data).content
+            res = pickle.loads(result)
+            gpu_out, gpu_grad_out = res['out'], res['grad_out']
+            return gpu_out, gpu_grad_out
+        except RequestException as err:
+            raise RuntimeError("Can't get remote result.") from err
 
 
 def get_api_info(api_info_dict, api_name):
@@ -173,21 +220,7 @@
     return args, inplace, kwargs, need_grad
 
 
-def run_backward(api_full_name, args, backward_content, grad_index, npu_args, npu_out, out):
-    backward_args = backward_content[api_full_name]
-    grad = gen_args(backward_args)[0]
-    cpu_grad, _ = generate_cpu_params(grad, {}, False)
-    if grad_index is not None:
-        out[grad_index].backward(cpu_grad)
-    elif isinstance(out, (list, tuple)):
-        raise NotImplementedError("Multiple backward is not supported.")
-    else:
-        out.backward(cpu_grad)
-    args_grad = []
-    for arg in args:
-        if isinstance(arg, torch.Tensor):
-            args_grad.append(arg.grad)
-    grad_out = args_grad
+def run_backward_npu(grad, grad_index, npu_args, npu_out):
     npu_grad = grad.clone().detach().npu()
     if grad_index is not None:
         npu_out[grad_index].backward(npu_grad)
@@ -198,7 +231,7 @@
         if isinstance(arg, torch.Tensor):
             npu_args_grad.append(arg.grad)
     npu_grad_out = npu_args_grad
-    return grad_out, npu_grad_out, grad, npu_grad
+    return npu_grad_out
 
 
 def initialize_save_error_data():
@@ -258,6 +291,7 @@
         self.grad_in = grad_in
         self.in_fwd_data_list = in_fwd_data_list
 
+
 if __name__ == '__main__':
     _run_ut()
     print_info_log("UT task completed.")