From 6f87cb59b3403201e03d4d0395f7e37ffdd5ff36 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Mon, 11 Sep 2023 21:55:50 +0800 Subject: [PATCH 01/15] =?UTF-8?q?=E9=A2=84=E6=A3=80=E5=B7=A5=E5=85=B7?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=92=8Cgpu=E8=BF=9B=E8=A1=8C=E6=AF=94?= =?UTF-8?q?=E5=AF=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/common/utils.py | 21 +++- .../api_accuracy_checker/gpu/server.py | 32 +++++ .../api_accuracy_checker/gpu/service.py | 50 ++++++++ .../api_accuracy_checker/run_ut/run_ut.py | 110 +++++++++++------- 4 files changed, 172 insertions(+), 41 deletions(-) create mode 100644 debug/accuracy_tools/api_accuracy_checker/gpu/server.py create mode 100644 debug/accuracy_tools/api_accuracy_checker/gpu/service.py diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 94907f01493..0ee04cd58e5 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -17,6 +17,7 @@ import collections import json import os +import pickle import random import re import stat @@ -93,6 +94,10 @@ class Const: "int32_to_int64": ["cross_entropy"] } + # gpu remote + URL: str = "http://10.175.118.91:8081" + GPU_ID: int = 1 + class CompareConst: """ Class for compare module const @@ -594,6 +599,7 @@ def cross_entropy_process(api_info_dict): api_info_dict['args'][1]['Min'] = 0 #The second argument in cross_entropy should be -100 or not less than 0. return api_info_dict + def initialize_save_path(save_path, dir_name): data_path = os.path.join(save_path, dir_name) if os.path.exists(data_path): @@ -602,9 +608,22 @@ def initialize_save_path(save_path, dir_name): os.mkdir(data_path, mode = 0o750) check_file_or_directory_path(data_path, True) + def write_pt(file_path, tensor): if os.path.exists(file_path): raise ValueError(f"File {file_path} already exists") torch.save(tensor, file_path) full_path = os.path.abspath(file_path) - return full_path \ No newline at end of file + return full_path + + +def load_file(file): + if not file.exists(): + raise FileNotFoundError + data = None + if file.suffix == ".pt": + data = torch.load(file) + elif file.suffix == ".p" or file.suffix == ".pickle": + with open(file, 'rb') as f: + data = pickle.load(f) + return data diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py new file mode 100644 index 00000000000..5f20ecb1561 --- /dev/null +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -0,0 +1,32 @@ +import tempfile + +import uvicorn +from fastapi import Form, FastAPI, UploadFile, File, status +from fastapi.responses import JSONResponse, PlainTextResponse + +from api_accuracy_checker.gpu.service import load_and_run_api, accept_store_temp +from api_accuracy_checker.common.utils import print_info_log, print_error_log + +app = FastAPI() + + +@app.post("/run-api") +def run_api_by_file(api_name, up_input_file, gpu_id): + with tempfile.TemporaryDirectory() as temp_dir: + try: + _input_file = accept_store_temp(temp_dir, [up_input_file]) + print_info_log.info(f"Test {api_name}: input: {_input_file}") + res = load_and_run_api(_input_file, f"cuda:{gpu_id}") + except Exception as e: + print_error_log(e) + return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={"error": str(e)}) + return PlainTextResponse(content=res) + + +@app.get("/hello") +def 
hello_world(): + return {"result": "hello"} + + +if __name__ == '__main__': + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py new file mode 100644 index 00000000000..849a0c7f668 --- /dev/null +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -0,0 +1,50 @@ +import pickle + +import torch +from pathlib import Path + +import api_accuracy_checker.common.utils as utils +from api_accuracy_checker.run_ut.run_ut import exec_api, generate_device_params + + +def accept_store_temp(temp_dir, file_list): + file_path_list = [] + for index, file in enumerate(file_list): + _file = Path(temp_dir).joinpath(f"{index}_{file.filename}").resolve() + with open(_file, 'wb') as fw: + fw.write(file.file.read()) + file_path_list.append(_file) + return file_path_list + + +def load_and_run_api(input_file, device): + data = utils.load_file(input_file) + gpu_args, gpu_kwargs = generate_device_params(data['args'], data['kwargs'], True, device) + gpu_grad, _ = generate_device_params(data['grad'], {}, False, device) + out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index']) + out = out.cpu() + res = {'out': out, 'grad_out': grad_out} + return pickle.dumps(res) + + +def run_gpu(api_type, api_name, gpu_args, gpu_kwargs, grad, grad_index): + out = exec_api(api_type, api_name, gpu_args, gpu_kwargs) + grad_out = None + if grad is not None: + grad_out = run_backward_gpu(gpu_args, grad, grad_index, out) + return out, grad_out + + +def run_backward_gpu(args, grad, grad_index, out): + if grad_index is not None: + out[grad_index].backward(grad) + elif isinstance(out, (list, tuple)): + raise NotImplementedError("Multiple backward is not supported.") + else: + out.backward(grad) + args_grad = [] + for arg in args: + if isinstance(arg, torch.Tensor): + args_grad.append(arg.grad) + grad_out = args_grad + return grad_out diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 707d6cbed9d..312b9da2daf 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -1,8 +1,19 @@ import argparse +import hashlib +import json import os import copy +import pickle import sys -import torch_npu +from urllib.parse import urljoin + +import requests +try: + import torch_npu +except ImportError: + IS_GPU = True +else: + IS_GPU = False import yaml import torch from tqdm import tqdm @@ -13,7 +24,7 @@ from api_accuracy_checker.compare.compare import Comparator from api_accuracy_checker.hook_module.wrap_tensor import TensorOPTemplate from api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplate from api_accuracy_checker.hook_module.wrap_torch import TorchOPTemplate -from ut_api_info import UtAPIInfo +from api_accuracy_checker.run_ut.ut_api_info import UtAPIInfo from api_accuracy_checker.common.config import msCheckerConfig NO_GRAD_APIS = ["hardtanh"] @@ -45,26 +56,27 @@ def exec_api(api_type, api_name, args, kwargs): return out -def generate_npu_params(input_args, input_kwargs, need_backward): - def recursive_arg_to_npu(arg_in): +def generate_device_params(input_args, input_kwargs, need_backward, device_id): + def recursive_arg_to_device(arg_in): if isinstance(arg_in, (list, tuple)): - return type(arg_in)(recursive_arg_to_npu(arg) for arg in arg_in) + return 
type(arg_in)(recursive_arg_to_device(arg) for arg in arg_in) elif isinstance(arg_in, torch.Tensor): if need_backward and arg_in.requires_grad: - arg_in = arg_in.clone().detach().to("npu").requires_grad_() + arg_in = arg_in.clone().detach().to(device_id).requires_grad_() temp_arg_in = arg_in * 1 arg_in = temp_arg_in.type_as(arg_in) arg_in.retain_grad() return arg_in else: - return arg_in.clone().detach().to("npu") + return arg_in.clone().detach().to(device_id) else: return arg_in - npu_args = recursive_arg_to_npu(input_args) - npu_kwargs = {key: recursive_arg_to_npu(value) for key, value in input_kwargs.items()} + npu_args = recursive_arg_to_device(input_args) + npu_kwargs = {key: recursive_arg_to_device(value) for key, value in input_kwargs.items()} return npu_args, npu_kwargs + def generate_cpu_params(input_args, input_kwargs, need_backward): def recursive_arg_to_cpu(arg_in): if isinstance(arg_in, (list, tuple)): @@ -72,7 +84,8 @@ def generate_cpu_params(input_args, input_kwargs, need_backward): elif isinstance(arg_in, torch.Tensor): if need_backward and arg_in.requires_grad: if str(arg_in.dtype) in Const.RAISE_PRECISION.keys(): - arg_in = arg_in.clone().type(eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach().requires_grad_() + arg_in = arg_in.clone().type( + eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach().requires_grad_() else: arg_in = arg_in.clone().detach().requires_grad_() temp_arg_in = arg_in * 1 @@ -90,6 +103,7 @@ def generate_cpu_params(input_args, input_kwargs, need_backward): cpu_kwargs = {key: recursive_arg_to_cpu(value) for key, value in input_kwargs.items()} return cpu_args, cpu_kwargs + def run_ut(forward_file, backward_file, out_path, save_error_data): print_info_log("start UT test") forward_content = get_json_contents(forward_file) @@ -133,7 +147,6 @@ def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) UtAPIInfo(api_full_name + '.backward.output.npu', data_info.npu_grad_out) - def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict): in_fwd_data_list = [] [api_type, api_name, _] = api_full_name.split("*") @@ -145,24 +158,54 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di if inplace or not need_grad: print_warn_log("%s involves in-place operations, skip backward" % api_full_name) cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward) - npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) - grad_out, npu_grad_out = None, None - if kwargs.get("device"): - del kwargs["device"] - out = exec_api(api_type, api_name, cpu_args, cpu_kwargs) - npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) + npu_args, npu_kwargs = generate_device_params(args, kwargs, need_backward, "npu:0") grad_input_index = api_setting_dict.get(api_name) grad_index = None - grad = None + grad, cpu_grad = None, None if grad_input_index is not None: - grad_index = grad_input_index.get('grad_index') + grad_index = grad_input_index.get("grad_index") + if need_backward: + backward_args = backward_content[api_full_name] + grad = gen_args(backward_args)[0] + cpu_grad, _ = generate_cpu_params(grad, {}, False) + + transfer_input_data = {"args": cpu_args, "kwargs": cpu_kwargs, "grad": cpu_grad, "grad_index": grad_index, + "api_type": api_type, "api_name": api_name} + store_file = store_testcase(transfer_input_data, "inputs") + grad_out, npu_grad_out = None, None + if kwargs.get("device"): + del kwargs["device"] + # run torch api in npu + npu_out = exec_api(api_type, 
api_name, npu_args, npu_kwargs) if need_backward: - grad_out, npu_grad_out, grad, npu_grad = run_backward(api_full_name, cpu_args, backward_content, grad_index, npu_args, - npu_out, out) - if grad_index is not None: - return UtDataInfo(grad_out, npu_grad_out, npu_out[grad_index], out[grad_index], grad, in_fwd_data_list) - return UtDataInfo(grad_out, npu_grad_out, npu_out, out, grad, in_fwd_data_list) + npu_grad_out = run_backward_npu(grad, grad_index, npu_args, npu_out) + + gpu_out, gpu_grad_out = run_gpu_remote(api_full_name, store_file) + + return UtDataInfo(gpu_grad_out, npu_grad_out, npu_out, gpu_out, grad, in_fwd_data_list) + + +def store_testcase(input_data, store_path): + fname = hashlib.sha1(str(input_data).encode('utf-8')).hexdigest() + '.p' + store_file = os.path.join(store_path, fname) + with open(store_file, 'wb') as f: + pickle.dump(input_data, f) + return store_file + + +def run_gpu_remote(api_full_name, input_file): + with open(input_file, 'rb') as file1: + files = {"up_input_file": file1} + data = {"api_name": api_full_name, "gpu_id": Const.GPU_ID} + print_info_log(f"start upload {input_file}") + try: + result = requests.post(url=urljoin(Const.URL, "run-api"), files=files, data=data).content + res = pickle.loads(result) + gpu_out, gpu_grad_out = res['out'], res['grad_out'] + return gpu_out, gpu_grad_out + except Exception as e: + print_error_log("Can't get remote result") def get_api_info(api_info_dict, api_name): @@ -177,21 +220,7 @@ def get_api_info(api_info_dict, api_name): return args, inplace, kwargs, need_grad -def run_backward(api_full_name, args, backward_content, grad_index, npu_args, npu_out, out): - backward_args = backward_content[api_full_name] - grad = gen_args(backward_args)[0] - cpu_grad, _ = generate_cpu_params(grad, {}, False) - if grad_index is not None: - out[grad_index].backward(cpu_grad) - elif isinstance(out, (list, tuple)): - raise NotImplementedError("Multiple backward is not supported.") - else: - out.backward(cpu_grad) - args_grad = [] - for arg in args: - if isinstance(arg, torch.Tensor): - args_grad.append(arg.grad) - grad_out = args_grad +def run_backward_npu(grad, grad_index, npu_args, npu_out): npu_grad = grad.clone().detach().npu() if grad_index is not None: npu_out[grad_index].backward(npu_grad) @@ -202,7 +231,7 @@ def run_backward(api_full_name, args, backward_content, grad_index, npu_args, np if isinstance(arg, torch.Tensor): npu_args_grad.append(arg.grad) npu_grad_out = npu_args_grad - return grad_out, npu_grad_out, grad, npu_grad + return npu_grad_out def initialize_save_error_data(): @@ -262,6 +291,7 @@ class UtDataInfo: self.grad_in = grad_in self.in_fwd_data_list = in_fwd_data_list + if __name__ == '__main__': _run_ut() print_info_log("UT task completed.") -- Gitee From e0e505f0ffc7306dcad8356e4ed8b49e6ab83574 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 11:28:18 +0800 Subject: [PATCH 02/15] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dgpu=E6=AF=94=E5=AF=B9?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/gpu/server.py | 3 +-- debug/accuracy_tools/api_accuracy_checker/gpu/service.py | 1 - debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py index 5f20ecb1561..c9022e379f3 100644 --- 
a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -18,8 +18,7 @@ def run_api_by_file(api_name, up_input_file, gpu_id): print_info_log.info(f"Test {api_name}: input: {_input_file}") res = load_and_run_api(_input_file, f"cuda:{gpu_id}") except Exception as e: - print_error_log(e) - return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={"error": str(e)}) + print_error_log("Http 500 Internal Server Error.") return PlainTextResponse(content=res) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py index 849a0c7f668..10832577af6 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -22,7 +22,6 @@ def load_and_run_api(input_file, device): gpu_args, gpu_kwargs = generate_device_params(data['args'], data['kwargs'], True, device) gpu_grad, _ = generate_device_params(data['grad'], {}, False, device) out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index']) - out = out.cpu() res = {'out': out, 'grad_out': grad_out} return pickle.dumps(res) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 312b9da2daf..591135d8531 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -205,7 +205,7 @@ def run_gpu_remote(api_full_name, input_file): gpu_out, gpu_grad_out = res['out'], res['grad_out'] return gpu_out, gpu_grad_out except Exception as e: - print_error_log("Can't get remote result") + print_error_log("Can't get remote result.") def get_api_info(api_info_dict, api_name): -- Gitee From 420de20a840246afc34497e110c59d7f4e3d784f Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 11:59:45 +0800 Subject: [PATCH 03/15] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E8=A7=A3=E9=87=8A=E5=99=A8=EF=BC=8C=E7=BC=96=E7=A0=81=E5=92=8C?= =?UTF-8?q?=E7=89=88=E6=9D=83=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/gpu/server.py | 8 ++++++-- debug/accuracy_tools/api_accuracy_checker/gpu/service.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py index c9022e379f3..5d21ab1c888 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -1,3 +1,7 @@ +# !/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+ import tempfile import uvicorn @@ -14,8 +18,8 @@ app = FastAPI() def run_api_by_file(api_name, up_input_file, gpu_id): with tempfile.TemporaryDirectory() as temp_dir: try: - _input_file = accept_store_temp(temp_dir, [up_input_file]) - print_info_log.info(f"Test {api_name}: input: {_input_file}") + _input_file = accept_store_temp(temp_dir, [up_input_file])[0] + print_info_log(f"Test {api_name}: input: {_input_file}") res = load_and_run_api(_input_file, f"cuda:{gpu_id}") except Exception as e: print_error_log("Http 500 Internal Server Error.") diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py index 10832577af6..e23e27791ff 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -1,3 +1,7 @@ +# !/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. + import pickle import torch -- Gitee From e59dbbd58456645ba8d3cb67f76319a8c039e758 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 14:32:17 +0800 Subject: [PATCH 04/15] =?UTF-8?q?=E4=BF=AE=E6=94=B9dumps=E5=8D=8F=E8=AE=AE?= =?UTF-8?q?=EF=BC=8C=E8=83=BD=E5=A4=9Fdump=E5=A4=A7=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/gpu/service.py | 2 +- debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py index e23e27791ff..35f0e17790b 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -27,7 +27,7 @@ def load_and_run_api(input_file, device): gpu_grad, _ = generate_device_params(data['grad'], {}, False, device) out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index']) res = {'out': out, 'grad_out': grad_out} - return pickle.dumps(res) + return pickle.dumps(res, protocol=4) def run_gpu(api_type, api_name, gpu_args, gpu_kwargs, grad, grad_index): diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 591135d8531..a6134b2ff85 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -181,7 +181,9 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di if need_backward: npu_grad_out = run_backward_npu(grad, grad_index, npu_args, npu_out) + # run torch api in remote gpu gpu_out, gpu_grad_out = run_gpu_remote(api_full_name, store_file) + os.remove(store_file) return UtDataInfo(gpu_grad_out, npu_grad_out, npu_out, gpu_out, grad, in_fwd_data_list) -- Gitee From b31fc0b4f6c37c5f2fd40bc2c9f44a41b1d76d42 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Mon, 11 Sep 2023 21:55:50 +0800 Subject: [PATCH 05/15] =?UTF-8?q?=E9=A2=84=E6=A3=80=E5=B7=A5=E5=85=B7?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=92=8Cgpu=E8=BF=9B=E8=A1=8C=E6=AF=94?= =?UTF-8?q?=E5=AF=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/common/utils.py | 21 +++- .../api_accuracy_checker/gpu/server.py | 32 +++++ .../api_accuracy_checker/gpu/service.py | 50 ++++++++ 
.../api_accuracy_checker/run_ut/run_ut.py | 110 +++++++++++------- 4 files changed, 172 insertions(+), 41 deletions(-) create mode 100644 debug/accuracy_tools/api_accuracy_checker/gpu/server.py create mode 100644 debug/accuracy_tools/api_accuracy_checker/gpu/service.py diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 9304a9ca4fe..04bd565ca8e 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -17,6 +17,7 @@ import collections import json import os +import pickle import random import re import stat @@ -100,6 +101,10 @@ class Const: "int32_to_int64": ["cross_entropy"] } + # gpu remote + URL: str = "http://10.175.118.91:8081" + GPU_ID: int = 1 + class CompareConst: """ Class for compare module const @@ -601,6 +606,7 @@ def cross_entropy_process(api_info_dict): api_info_dict['args'][1]['Min'] = 0 #The second argument in cross_entropy should be -100 or not less than 0. return api_info_dict + def initialize_save_path(save_path, dir_name): data_path = os.path.join(save_path, dir_name) if os.path.exists(data_path): @@ -609,9 +615,22 @@ def initialize_save_path(save_path, dir_name): os.mkdir(data_path, mode = 0o750) check_file_or_directory_path(data_path, True) + def write_pt(file_path, tensor): if os.path.exists(file_path): raise ValueError(f"File {file_path} already exists") torch.save(tensor, file_path) full_path = os.path.abspath(file_path) - return full_path \ No newline at end of file + return full_path + + +def load_file(file): + if not file.exists(): + raise FileNotFoundError + data = None + if file.suffix == ".pt": + data = torch.load(file) + elif file.suffix == ".p" or file.suffix == ".pickle": + with open(file, 'rb') as f: + data = pickle.load(f) + return data diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py new file mode 100644 index 00000000000..5f20ecb1561 --- /dev/null +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -0,0 +1,32 @@ +import tempfile + +import uvicorn +from fastapi import Form, FastAPI, UploadFile, File, status +from fastapi.responses import JSONResponse, PlainTextResponse + +from api_accuracy_checker.gpu.service import load_and_run_api, accept_store_temp +from api_accuracy_checker.common.utils import print_info_log, print_error_log + +app = FastAPI() + + +@app.post("/run-api") +def run_api_by_file(api_name, up_input_file, gpu_id): + with tempfile.TemporaryDirectory() as temp_dir: + try: + _input_file = accept_store_temp(temp_dir, [up_input_file]) + print_info_log.info(f"Test {api_name}: input: {_input_file}") + res = load_and_run_api(_input_file, f"cuda:{gpu_id}") + except Exception as e: + print_error_log(e) + return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={"error": str(e)}) + return PlainTextResponse(content=res) + + +@app.get("/hello") +def hello_world(): + return {"result": "hello"} + + +if __name__ == '__main__': + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py new file mode 100644 index 00000000000..849a0c7f668 --- /dev/null +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -0,0 +1,50 @@ +import pickle + +import torch +from pathlib import Path + +import api_accuracy_checker.common.utils as utils +from 
api_accuracy_checker.run_ut.run_ut import exec_api, generate_device_params + + +def accept_store_temp(temp_dir, file_list): + file_path_list = [] + for index, file in enumerate(file_list): + _file = Path(temp_dir).joinpath(f"{index}_{file.filename}").resolve() + with open(_file, 'wb') as fw: + fw.write(file.file.read()) + file_path_list.append(_file) + return file_path_list + + +def load_and_run_api(input_file, device): + data = utils.load_file(input_file) + gpu_args, gpu_kwargs = generate_device_params(data['args'], data['kwargs'], True, device) + gpu_grad, _ = generate_device_params(data['grad'], {}, False, device) + out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index']) + out = out.cpu() + res = {'out': out, 'grad_out': grad_out} + return pickle.dumps(res) + + +def run_gpu(api_type, api_name, gpu_args, gpu_kwargs, grad, grad_index): + out = exec_api(api_type, api_name, gpu_args, gpu_kwargs) + grad_out = None + if grad is not None: + grad_out = run_backward_gpu(gpu_args, grad, grad_index, out) + return out, grad_out + + +def run_backward_gpu(args, grad, grad_index, out): + if grad_index is not None: + out[grad_index].backward(grad) + elif isinstance(out, (list, tuple)): + raise NotImplementedError("Multiple backward is not supported.") + else: + out.backward(grad) + args_grad = [] + for arg in args: + if isinstance(arg, torch.Tensor): + args_grad.append(arg.grad) + grad_out = args_grad + return grad_out diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 26c2f1c4601..5003e123e53 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -1,8 +1,19 @@ import argparse +import hashlib +import json import os import copy +import pickle import sys -import torch_npu +from urllib.parse import urljoin + +import requests +try: + import torch_npu +except ImportError: + IS_GPU = True +else: + IS_GPU = False import yaml import torch from tqdm import tqdm @@ -13,7 +24,7 @@ from api_accuracy_checker.compare.compare import Comparator from api_accuracy_checker.hook_module.wrap_tensor import TensorOPTemplate from api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplate from api_accuracy_checker.hook_module.wrap_torch import TorchOPTemplate -from ut_api_info import UtAPIInfo +from api_accuracy_checker.run_ut.ut_api_info import UtAPIInfo from api_accuracy_checker.common.config import msCheckerConfig NO_GRAD_APIS = ["hardtanh"] @@ -45,26 +56,27 @@ def exec_api(api_type, api_name, args, kwargs): return out -def generate_npu_params(input_args, input_kwargs, need_backward): - def recursive_arg_to_npu(arg_in): +def generate_device_params(input_args, input_kwargs, need_backward, device_id): + def recursive_arg_to_device(arg_in): if isinstance(arg_in, (list, tuple)): - return type(arg_in)(recursive_arg_to_npu(arg) for arg in arg_in) + return type(arg_in)(recursive_arg_to_device(arg) for arg in arg_in) elif isinstance(arg_in, torch.Tensor): if need_backward and arg_in.requires_grad: - arg_in = arg_in.clone().detach().to("npu").requires_grad_() + arg_in = arg_in.clone().detach().to(device_id).requires_grad_() temp_arg_in = arg_in * 1 arg_in = temp_arg_in.type_as(arg_in) arg_in.retain_grad() return arg_in else: - return arg_in.clone().detach().to("npu") + return arg_in.clone().detach().to(device_id) else: return arg_in - npu_args = recursive_arg_to_npu(input_args) - npu_kwargs 
= {key: recursive_arg_to_npu(value) for key, value in input_kwargs.items()} + npu_args = recursive_arg_to_device(input_args) + npu_kwargs = {key: recursive_arg_to_device(value) for key, value in input_kwargs.items()} return npu_args, npu_kwargs + def generate_cpu_params(input_args, input_kwargs, need_backward): def recursive_arg_to_cpu(arg_in): if isinstance(arg_in, (list, tuple)): @@ -72,7 +84,8 @@ def generate_cpu_params(input_args, input_kwargs, need_backward): elif isinstance(arg_in, torch.Tensor): if need_backward and arg_in.requires_grad: if str(arg_in.dtype) in Const.RAISE_PRECISION.keys(): - arg_in = arg_in.clone().type(eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach().requires_grad_() + arg_in = arg_in.clone().type( + eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach().requires_grad_() else: arg_in = arg_in.clone().detach().requires_grad_() temp_arg_in = arg_in * 1 @@ -90,6 +103,7 @@ def generate_cpu_params(input_args, input_kwargs, need_backward): cpu_kwargs = {key: recursive_arg_to_cpu(value) for key, value in input_kwargs.items()} return cpu_args, cpu_kwargs + def run_ut(forward_file, backward_file, out_path, save_error_data): print_info_log("start UT test") forward_content = get_json_contents(forward_file) @@ -129,7 +143,6 @@ def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) UtAPIInfo(api_full_name + '.backward.output.npu', data_info.npu_grad_out) - def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict): in_fwd_data_list = [] [api_type, api_name, _] = api_full_name.split("*") @@ -141,24 +154,54 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di if inplace or not need_grad: print_warn_log("%s involves in-place operations, skip backward" % api_full_name) cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward) - npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) - grad_out, npu_grad_out = None, None - if kwargs.get("device"): - del kwargs["device"] - out = exec_api(api_type, api_name, cpu_args, cpu_kwargs) - npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) + npu_args, npu_kwargs = generate_device_params(args, kwargs, need_backward, "npu:0") grad_input_index = api_setting_dict.get(api_name) grad_index = None - grad = None + grad, cpu_grad = None, None if grad_input_index is not None: - grad_index = grad_input_index.get('grad_index') + grad_index = grad_input_index.get("grad_index") + if need_backward: + backward_args = backward_content[api_full_name] + grad = gen_args(backward_args)[0] + cpu_grad, _ = generate_cpu_params(grad, {}, False) + + transfer_input_data = {"args": cpu_args, "kwargs": cpu_kwargs, "grad": cpu_grad, "grad_index": grad_index, + "api_type": api_type, "api_name": api_name} + store_file = store_testcase(transfer_input_data, "inputs") + grad_out, npu_grad_out = None, None + if kwargs.get("device"): + del kwargs["device"] + # run torch api in npu + npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) if need_backward: - grad_out, npu_grad_out, grad, npu_grad = run_backward(api_full_name, cpu_args, backward_content, grad_index, npu_args, - npu_out, out) - if grad_index is not None: - return UtDataInfo(grad_out, npu_grad_out, npu_out[grad_index], out[grad_index], grad, in_fwd_data_list) - return UtDataInfo(grad_out, npu_grad_out, npu_out, out, grad, in_fwd_data_list) + npu_grad_out = run_backward_npu(grad, grad_index, npu_args, npu_out) + + gpu_out, gpu_grad_out = run_gpu_remote(api_full_name, store_file) + 
+ return UtDataInfo(gpu_grad_out, npu_grad_out, npu_out, gpu_out, grad, in_fwd_data_list) + + +def store_testcase(input_data, store_path): + fname = hashlib.sha1(str(input_data).encode('utf-8')).hexdigest() + '.p' + store_file = os.path.join(store_path, fname) + with open(store_file, 'wb') as f: + pickle.dump(input_data, f) + return store_file + + +def run_gpu_remote(api_full_name, input_file): + with open(input_file, 'rb') as file1: + files = {"up_input_file": file1} + data = {"api_name": api_full_name, "gpu_id": Const.GPU_ID} + print_info_log(f"start upload {input_file}") + try: + result = requests.post(url=urljoin(Const.URL, "run-api"), files=files, data=data).content + res = pickle.loads(result) + gpu_out, gpu_grad_out = res['out'], res['grad_out'] + return gpu_out, gpu_grad_out + except Exception as e: + print_error_log("Can't get remote result") def get_api_info(api_info_dict, api_name): @@ -173,21 +216,7 @@ def get_api_info(api_info_dict, api_name): return args, inplace, kwargs, need_grad -def run_backward(api_full_name, args, backward_content, grad_index, npu_args, npu_out, out): - backward_args = backward_content[api_full_name] - grad = gen_args(backward_args)[0] - cpu_grad, _ = generate_cpu_params(grad, {}, False) - if grad_index is not None: - out[grad_index].backward(cpu_grad) - elif isinstance(out, (list, tuple)): - raise NotImplementedError("Multiple backward is not supported.") - else: - out.backward(cpu_grad) - args_grad = [] - for arg in args: - if isinstance(arg, torch.Tensor): - args_grad.append(arg.grad) - grad_out = args_grad +def run_backward_npu(grad, grad_index, npu_args, npu_out): npu_grad = grad.clone().detach().npu() if grad_index is not None: npu_out[grad_index].backward(npu_grad) @@ -198,7 +227,7 @@ def run_backward(api_full_name, args, backward_content, grad_index, npu_args, np if isinstance(arg, torch.Tensor): npu_args_grad.append(arg.grad) npu_grad_out = npu_args_grad - return grad_out, npu_grad_out, grad, npu_grad + return npu_grad_out def initialize_save_error_data(): @@ -258,6 +287,7 @@ class UtDataInfo: self.grad_in = grad_in self.in_fwd_data_list = in_fwd_data_list + if __name__ == '__main__': _run_ut() print_info_log("UT task completed.") -- Gitee From c52ad360bd083bc41a53f1992dbc38f44aca545a Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 11:28:18 +0800 Subject: [PATCH 06/15] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dgpu=E6=AF=94=E5=AF=B9?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/gpu/server.py | 3 +-- debug/accuracy_tools/api_accuracy_checker/gpu/service.py | 1 - debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py index 5f20ecb1561..c9022e379f3 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -18,8 +18,7 @@ def run_api_by_file(api_name, up_input_file, gpu_id): print_info_log.info(f"Test {api_name}: input: {_input_file}") res = load_and_run_api(_input_file, f"cuda:{gpu_id}") except Exception as e: - print_error_log(e) - return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={"error": str(e)}) + print_error_log("Http 500 Internal Server Error.") return PlainTextResponse(content=res) diff --git 
a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py index 849a0c7f668..10832577af6 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -22,7 +22,6 @@ def load_and_run_api(input_file, device): gpu_args, gpu_kwargs = generate_device_params(data['args'], data['kwargs'], True, device) gpu_grad, _ = generate_device_params(data['grad'], {}, False, device) out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index']) - out = out.cpu() res = {'out': out, 'grad_out': grad_out} return pickle.dumps(res) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 5003e123e53..a8a74d79c49 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -201,7 +201,7 @@ def run_gpu_remote(api_full_name, input_file): gpu_out, gpu_grad_out = res['out'], res['grad_out'] return gpu_out, gpu_grad_out except Exception as e: - print_error_log("Can't get remote result") + print_error_log("Can't get remote result.") def get_api_info(api_info_dict, api_name): -- Gitee From 218183019cef3086fa9e137d1512ccabff12a0a5 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 11:59:45 +0800 Subject: [PATCH 07/15] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E8=A7=A3=E9=87=8A=E5=99=A8=EF=BC=8C=E7=BC=96=E7=A0=81=E5=92=8C?= =?UTF-8?q?=E7=89=88=E6=9D=83=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/gpu/server.py | 8 ++++++-- debug/accuracy_tools/api_accuracy_checker/gpu/service.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py index c9022e379f3..5d21ab1c888 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -1,3 +1,7 @@ +# !/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. + import tempfile import uvicorn @@ -14,8 +18,8 @@ app = FastAPI() def run_api_by_file(api_name, up_input_file, gpu_id): with tempfile.TemporaryDirectory() as temp_dir: try: - _input_file = accept_store_temp(temp_dir, [up_input_file]) - print_info_log.info(f"Test {api_name}: input: {_input_file}") + _input_file = accept_store_temp(temp_dir, [up_input_file])[0] + print_info_log(f"Test {api_name}: input: {_input_file}") res = load_and_run_api(_input_file, f"cuda:{gpu_id}") except Exception as e: print_error_log("Http 500 Internal Server Error.") diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py index 10832577af6..e23e27791ff 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -1,3 +1,7 @@ +# !/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+ import pickle import torch -- Gitee From 9f0886ca9aed57c6947a19237f24d5be2bf3b202 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 14:32:17 +0800 Subject: [PATCH 08/15] =?UTF-8?q?=E4=BF=AE=E6=94=B9dumps=E5=8D=8F=E8=AE=AE?= =?UTF-8?q?=EF=BC=8C=E8=83=BD=E5=A4=9Fdump=E5=A4=A7=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/gpu/service.py | 2 +- debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py index e23e27791ff..35f0e17790b 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -27,7 +27,7 @@ def load_and_run_api(input_file, device): gpu_grad, _ = generate_device_params(data['grad'], {}, False, device) out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index']) res = {'out': out, 'grad_out': grad_out} - return pickle.dumps(res) + return pickle.dumps(res, protocol=4) def run_gpu(api_type, api_name, gpu_args, gpu_kwargs, grad, grad_index): diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index a8a74d79c49..7ab5eb8296c 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -177,7 +177,9 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di if need_backward: npu_grad_out = run_backward_npu(grad, grad_index, npu_args, npu_out) + # run torch api in remote gpu gpu_out, gpu_grad_out = run_gpu_remote(api_full_name, store_file) + os.remove(store_file) return UtDataInfo(gpu_grad_out, npu_grad_out, npu_out, gpu_out, grad, in_fwd_data_list) -- Gitee From 6f497c7f456215efaa5012ff5289c6f36a06be46 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 17:31:02 +0800 Subject: [PATCH 09/15] =?UTF-8?q?gpu=E9=85=8D=E7=BD=AE=E7=A7=BB=E8=87=B3co?= =?UTF-8?q?nfig.yaml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/common/config.py | 5 ++++- .../api_accuracy_checker/common/utils.py | 3 --- .../accuracy_tools/api_accuracy_checker/config.yaml | 5 ++++- .../api_accuracy_checker/run_ut/run_ut.py | 12 ++++++------ 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/common/config.py b/debug/accuracy_tools/api_accuracy_checker/common/config.py index 07dd4e6bfca..066abcbfa7d 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/config.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/config.py @@ -19,7 +19,10 @@ class Config: 'dump_step': int, 'error_data_path': str, 'enable_dataloader': bool, - 'target_iter': int + 'target_iter': int, + 'url': str, + 'gpu_id': int, + 'npu_id': int } if not isinstance(value, validators[key]): raise ValueError(f"{key} must be {validators[key].__name__} type") diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 04bd565ca8e..59ce6b6f839 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -101,9 +101,6 @@ class Const: "int32_to_int64": 
["cross_entropy"] } - # gpu remote - URL: str = "http://10.175.118.91:8081" - GPU_ID: int = 1 class CompareConst: """ diff --git a/debug/accuracy_tools/api_accuracy_checker/config.yaml b/debug/accuracy_tools/api_accuracy_checker/config.yaml index 46f0ed8d41a..fc6a15f5551 100644 --- a/debug/accuracy_tools/api_accuracy_checker/config.yaml +++ b/debug/accuracy_tools/api_accuracy_checker/config.yaml @@ -6,4 +6,7 @@ real_data: False dump_step: 1000 error_data_path: './' enable_dataloader: True -target_iter: 1 \ No newline at end of file +target_iter: 1 +url: 'http://10.175.118.91:8081' +gpu_id: 0 +npu_id: 0 \ No newline at end of file diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index e77881bf544..c908df5eae8 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -72,9 +72,9 @@ def generate_device_params(input_args, input_kwargs, need_backward, device_id): else: return arg_in - npu_args = recursive_arg_to_device(input_args) - npu_kwargs = {key: recursive_arg_to_device(value) for key, value in input_kwargs.items()} - return npu_args, npu_kwargs + device_args = recursive_arg_to_device(input_args) + device_kwargs = {key: recursive_arg_to_device(value) for key, value in input_kwargs.items()} + return device_args, device_kwargs def generate_cpu_params(input_args, input_kwargs, need_backward): @@ -153,7 +153,7 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di if inplace or not need_grad: print_warn_log("%s involves in-place operations, skip backward" % api_full_name) cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward) - npu_args, npu_kwargs = generate_device_params(args, kwargs, need_backward, "npu:0") + npu_args, npu_kwargs = generate_device_params(args, kwargs, need_backward, f"npu:{msCheckerConfig.npu_id}") grad_input_index = api_setting_dict.get(api_name) grad_index = None grad, cpu_grad = None, None @@ -194,10 +194,10 @@ def store_testcase(input_data, store_path): def run_gpu_remote(api_full_name, input_file): with open(input_file, 'rb') as file1: files = {"up_input_file": file1} - data = {"api_name": api_full_name, "gpu_id": Const.GPU_ID} + data = {"api_name": api_full_name, "gpu_id": msCheckerConfig.gpu_id} print_info_log(f"start upload {input_file}") try: - result = requests.post(url=urljoin(Const.URL, "run-api"), files=files, data=data).content + result = requests.post(url=urljoin(msCheckerConfig.url, "run-api"), files=files, data=data).content res = pickle.loads(result) gpu_out, gpu_grad_out = res['out'], res['grad_out'] return gpu_out, gpu_grad_out -- Gitee From 7b99d0ffa04ea57b3a086dffbc56ab16c0e64469 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Mon, 11 Sep 2023 21:55:50 +0800 Subject: [PATCH 10/15] =?UTF-8?q?=E9=A2=84=E6=A3=80=E5=B7=A5=E5=85=B7?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=92=8Cgpu=E8=BF=9B=E8=A1=8C=E6=AF=94?= =?UTF-8?q?=E5=AF=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/common/utils.py | 21 +++- .../api_accuracy_checker/gpu/server.py | 32 +++++ .../api_accuracy_checker/gpu/service.py | 50 ++++++++ .../api_accuracy_checker/run_ut/run_ut.py | 110 +++++++++++------- 4 files changed, 172 insertions(+), 41 deletions(-) create mode 100644 debug/accuracy_tools/api_accuracy_checker/gpu/server.py create mode 100644 debug/accuracy_tools/api_accuracy_checker/gpu/service.py diff --git 
a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 9304a9ca4fe..04bd565ca8e 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -17,6 +17,7 @@ import collections import json import os +import pickle import random import re import stat @@ -100,6 +101,10 @@ class Const: "int32_to_int64": ["cross_entropy"] } + # gpu remote + URL: str = "http://10.175.118.91:8081" + GPU_ID: int = 1 + class CompareConst: """ Class for compare module const @@ -601,6 +606,7 @@ def cross_entropy_process(api_info_dict): api_info_dict['args'][1]['Min'] = 0 #The second argument in cross_entropy should be -100 or not less than 0. return api_info_dict + def initialize_save_path(save_path, dir_name): data_path = os.path.join(save_path, dir_name) if os.path.exists(data_path): @@ -609,9 +615,22 @@ def initialize_save_path(save_path, dir_name): os.mkdir(data_path, mode = 0o750) check_file_or_directory_path(data_path, True) + def write_pt(file_path, tensor): if os.path.exists(file_path): raise ValueError(f"File {file_path} already exists") torch.save(tensor, file_path) full_path = os.path.abspath(file_path) - return full_path \ No newline at end of file + return full_path + + +def load_file(file): + if not file.exists(): + raise FileNotFoundError + data = None + if file.suffix == ".pt": + data = torch.load(file) + elif file.suffix == ".p" or file.suffix == ".pickle": + with open(file, 'rb') as f: + data = pickle.load(f) + return data diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py new file mode 100644 index 00000000000..5f20ecb1561 --- /dev/null +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -0,0 +1,32 @@ +import tempfile + +import uvicorn +from fastapi import Form, FastAPI, UploadFile, File, status +from fastapi.responses import JSONResponse, PlainTextResponse + +from api_accuracy_checker.gpu.service import load_and_run_api, accept_store_temp +from api_accuracy_checker.common.utils import print_info_log, print_error_log + +app = FastAPI() + + +@app.post("/run-api") +def run_api_by_file(api_name, up_input_file, gpu_id): + with tempfile.TemporaryDirectory() as temp_dir: + try: + _input_file = accept_store_temp(temp_dir, [up_input_file]) + print_info_log.info(f"Test {api_name}: input: {_input_file}") + res = load_and_run_api(_input_file, f"cuda:{gpu_id}") + except Exception as e: + print_error_log(e) + return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={"error": str(e)}) + return PlainTextResponse(content=res) + + +@app.get("/hello") +def hello_world(): + return {"result": "hello"} + + +if __name__ == '__main__': + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py new file mode 100644 index 00000000000..849a0c7f668 --- /dev/null +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -0,0 +1,50 @@ +import pickle + +import torch +from pathlib import Path + +import api_accuracy_checker.common.utils as utils +from api_accuracy_checker.run_ut.run_ut import exec_api, generate_device_params + + +def accept_store_temp(temp_dir, file_list): + file_path_list = [] + for index, file in enumerate(file_list): + _file = Path(temp_dir).joinpath(f"{index}_{file.filename}").resolve() + with open(_file, 'wb') as fw: + 
fw.write(file.file.read()) + file_path_list.append(_file) + return file_path_list + + +def load_and_run_api(input_file, device): + data = utils.load_file(input_file) + gpu_args, gpu_kwargs = generate_device_params(data['args'], data['kwargs'], True, device) + gpu_grad, _ = generate_device_params(data['grad'], {}, False, device) + out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index']) + out = out.cpu() + res = {'out': out, 'grad_out': grad_out} + return pickle.dumps(res) + + +def run_gpu(api_type, api_name, gpu_args, gpu_kwargs, grad, grad_index): + out = exec_api(api_type, api_name, gpu_args, gpu_kwargs) + grad_out = None + if grad is not None: + grad_out = run_backward_gpu(gpu_args, grad, grad_index, out) + return out, grad_out + + +def run_backward_gpu(args, grad, grad_index, out): + if grad_index is not None: + out[grad_index].backward(grad) + elif isinstance(out, (list, tuple)): + raise NotImplementedError("Multiple backward is not supported.") + else: + out.backward(grad) + args_grad = [] + for arg in args: + if isinstance(arg, torch.Tensor): + args_grad.append(arg.grad) + grad_out = args_grad + return grad_out diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 26c2f1c4601..5003e123e53 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -1,8 +1,19 @@ import argparse +import hashlib +import json import os import copy +import pickle import sys -import torch_npu +from urllib.parse import urljoin + +import requests +try: + import torch_npu +except ImportError: + IS_GPU = True +else: + IS_GPU = False import yaml import torch from tqdm import tqdm @@ -13,7 +24,7 @@ from api_accuracy_checker.compare.compare import Comparator from api_accuracy_checker.hook_module.wrap_tensor import TensorOPTemplate from api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplate from api_accuracy_checker.hook_module.wrap_torch import TorchOPTemplate -from ut_api_info import UtAPIInfo +from api_accuracy_checker.run_ut.ut_api_info import UtAPIInfo from api_accuracy_checker.common.config import msCheckerConfig NO_GRAD_APIS = ["hardtanh"] @@ -45,26 +56,27 @@ def exec_api(api_type, api_name, args, kwargs): return out -def generate_npu_params(input_args, input_kwargs, need_backward): - def recursive_arg_to_npu(arg_in): +def generate_device_params(input_args, input_kwargs, need_backward, device_id): + def recursive_arg_to_device(arg_in): if isinstance(arg_in, (list, tuple)): - return type(arg_in)(recursive_arg_to_npu(arg) for arg in arg_in) + return type(arg_in)(recursive_arg_to_device(arg) for arg in arg_in) elif isinstance(arg_in, torch.Tensor): if need_backward and arg_in.requires_grad: - arg_in = arg_in.clone().detach().to("npu").requires_grad_() + arg_in = arg_in.clone().detach().to(device_id).requires_grad_() temp_arg_in = arg_in * 1 arg_in = temp_arg_in.type_as(arg_in) arg_in.retain_grad() return arg_in else: - return arg_in.clone().detach().to("npu") + return arg_in.clone().detach().to(device_id) else: return arg_in - npu_args = recursive_arg_to_npu(input_args) - npu_kwargs = {key: recursive_arg_to_npu(value) for key, value in input_kwargs.items()} + npu_args = recursive_arg_to_device(input_args) + npu_kwargs = {key: recursive_arg_to_device(value) for key, value in input_kwargs.items()} return npu_args, npu_kwargs + def generate_cpu_params(input_args, 
input_kwargs, need_backward): def recursive_arg_to_cpu(arg_in): if isinstance(arg_in, (list, tuple)): @@ -72,7 +84,8 @@ def generate_cpu_params(input_args, input_kwargs, need_backward): elif isinstance(arg_in, torch.Tensor): if need_backward and arg_in.requires_grad: if str(arg_in.dtype) in Const.RAISE_PRECISION.keys(): - arg_in = arg_in.clone().type(eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach().requires_grad_() + arg_in = arg_in.clone().type( + eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach().requires_grad_() else: arg_in = arg_in.clone().detach().requires_grad_() temp_arg_in = arg_in * 1 @@ -90,6 +103,7 @@ def generate_cpu_params(input_args, input_kwargs, need_backward): cpu_kwargs = {key: recursive_arg_to_cpu(value) for key, value in input_kwargs.items()} return cpu_args, cpu_kwargs + def run_ut(forward_file, backward_file, out_path, save_error_data): print_info_log("start UT test") forward_content = get_json_contents(forward_file) @@ -129,7 +143,6 @@ def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) UtAPIInfo(api_full_name + '.backward.output.npu', data_info.npu_grad_out) - def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict): in_fwd_data_list = [] [api_type, api_name, _] = api_full_name.split("*") @@ -141,24 +154,54 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di if inplace or not need_grad: print_warn_log("%s involves in-place operations, skip backward" % api_full_name) cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward) - npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) - grad_out, npu_grad_out = None, None - if kwargs.get("device"): - del kwargs["device"] - out = exec_api(api_type, api_name, cpu_args, cpu_kwargs) - npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) + npu_args, npu_kwargs = generate_device_params(args, kwargs, need_backward, "npu:0") grad_input_index = api_setting_dict.get(api_name) grad_index = None - grad = None + grad, cpu_grad = None, None if grad_input_index is not None: - grad_index = grad_input_index.get('grad_index') + grad_index = grad_input_index.get("grad_index") + if need_backward: + backward_args = backward_content[api_full_name] + grad = gen_args(backward_args)[0] + cpu_grad, _ = generate_cpu_params(grad, {}, False) + + transfer_input_data = {"args": cpu_args, "kwargs": cpu_kwargs, "grad": cpu_grad, "grad_index": grad_index, + "api_type": api_type, "api_name": api_name} + store_file = store_testcase(transfer_input_data, "inputs") + grad_out, npu_grad_out = None, None + if kwargs.get("device"): + del kwargs["device"] + # run torch api in npu + npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) if need_backward: - grad_out, npu_grad_out, grad, npu_grad = run_backward(api_full_name, cpu_args, backward_content, grad_index, npu_args, - npu_out, out) - if grad_index is not None: - return UtDataInfo(grad_out, npu_grad_out, npu_out[grad_index], out[grad_index], grad, in_fwd_data_list) - return UtDataInfo(grad_out, npu_grad_out, npu_out, out, grad, in_fwd_data_list) + npu_grad_out = run_backward_npu(grad, grad_index, npu_args, npu_out) + + gpu_out, gpu_grad_out = run_gpu_remote(api_full_name, store_file) + + return UtDataInfo(gpu_grad_out, npu_grad_out, npu_out, gpu_out, grad, in_fwd_data_list) + + +def store_testcase(input_data, store_path): + fname = hashlib.sha1(str(input_data).encode('utf-8')).hexdigest() + '.p' + store_file = os.path.join(store_path, fname) + with 
open(store_file, 'wb') as f: + pickle.dump(input_data, f) + return store_file + + +def run_gpu_remote(api_full_name, input_file): + with open(input_file, 'rb') as file1: + files = {"up_input_file": file1} + data = {"api_name": api_full_name, "gpu_id": Const.GPU_ID} + print_info_log(f"start upload {input_file}") + try: + result = requests.post(url=urljoin(Const.URL, "run-api"), files=files, data=data).content + res = pickle.loads(result) + gpu_out, gpu_grad_out = res['out'], res['grad_out'] + return gpu_out, gpu_grad_out + except Exception as e: + print_error_log("Can't get remote result") def get_api_info(api_info_dict, api_name): @@ -173,21 +216,7 @@ def get_api_info(api_info_dict, api_name): return args, inplace, kwargs, need_grad -def run_backward(api_full_name, args, backward_content, grad_index, npu_args, npu_out, out): - backward_args = backward_content[api_full_name] - grad = gen_args(backward_args)[0] - cpu_grad, _ = generate_cpu_params(grad, {}, False) - if grad_index is not None: - out[grad_index].backward(cpu_grad) - elif isinstance(out, (list, tuple)): - raise NotImplementedError("Multiple backward is not supported.") - else: - out.backward(cpu_grad) - args_grad = [] - for arg in args: - if isinstance(arg, torch.Tensor): - args_grad.append(arg.grad) - grad_out = args_grad +def run_backward_npu(grad, grad_index, npu_args, npu_out): npu_grad = grad.clone().detach().npu() if grad_index is not None: npu_out[grad_index].backward(npu_grad) @@ -198,7 +227,7 @@ def run_backward(api_full_name, args, backward_content, grad_index, npu_args, np if isinstance(arg, torch.Tensor): npu_args_grad.append(arg.grad) npu_grad_out = npu_args_grad - return grad_out, npu_grad_out, grad, npu_grad + return npu_grad_out def initialize_save_error_data(): @@ -258,6 +287,7 @@ class UtDataInfo: self.grad_in = grad_in self.in_fwd_data_list = in_fwd_data_list + if __name__ == '__main__': _run_ut() print_info_log("UT task completed.") -- Gitee From 15b5cf44933613096f08f91ce1899e374f59be04 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 11:28:18 +0800 Subject: [PATCH 11/15] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dgpu=E6=AF=94=E5=AF=B9?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/gpu/server.py | 3 +-- debug/accuracy_tools/api_accuracy_checker/gpu/service.py | 1 - debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py index 5f20ecb1561..c9022e379f3 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -18,8 +18,7 @@ def run_api_by_file(api_name, up_input_file, gpu_id): print_info_log.info(f"Test {api_name}: input: {_input_file}") res = load_and_run_api(_input_file, f"cuda:{gpu_id}") except Exception as e: - print_error_log(e) - return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={"error": str(e)}) + print_error_log("Http 500 Internal Server Error.") return PlainTextResponse(content=res) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py index 849a0c7f668..10832577af6 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -22,7 
+22,6 @@ def load_and_run_api(input_file, device): gpu_args, gpu_kwargs = generate_device_params(data['args'], data['kwargs'], True, device) gpu_grad, _ = generate_device_params(data['grad'], {}, False, device) out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index']) - out = out.cpu() res = {'out': out, 'grad_out': grad_out} return pickle.dumps(res) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 5003e123e53..a8a74d79c49 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -201,7 +201,7 @@ def run_gpu_remote(api_full_name, input_file): gpu_out, gpu_grad_out = res['out'], res['grad_out'] return gpu_out, gpu_grad_out except Exception as e: - print_error_log("Can't get remote result") + print_error_log("Can't get remote result.") def get_api_info(api_info_dict, api_name): -- Gitee From 1fa5084302a4808386a4cd9858b46f939c58ad42 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 11:59:45 +0800 Subject: [PATCH 12/15] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E8=A7=A3=E9=87=8A=E5=99=A8=EF=BC=8C=E7=BC=96=E7=A0=81=E5=92=8C?= =?UTF-8?q?=E7=89=88=E6=9D=83=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/gpu/server.py | 8 ++++++-- debug/accuracy_tools/api_accuracy_checker/gpu/service.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py index c9022e379f3..5d21ab1c888 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -1,3 +1,7 @@ +# !/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. + import tempfile import uvicorn @@ -14,8 +18,8 @@ app = FastAPI() def run_api_by_file(api_name, up_input_file, gpu_id): with tempfile.TemporaryDirectory() as temp_dir: try: - _input_file = accept_store_temp(temp_dir, [up_input_file]) - print_info_log.info(f"Test {api_name}: input: {_input_file}") + _input_file = accept_store_temp(temp_dir, [up_input_file])[0] + print_info_log(f"Test {api_name}: input: {_input_file}") res = load_and_run_api(_input_file, f"cuda:{gpu_id}") except Exception as e: print_error_log("Http 500 Internal Server Error.") diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py index 10832577af6..e23e27791ff 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -1,3 +1,7 @@ +# !/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+ import pickle import torch -- Gitee From fc975bdcbe5a22a865d076da93c876a7c5098de4 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 14:32:17 +0800 Subject: [PATCH 13/15] =?UTF-8?q?=E4=BF=AE=E6=94=B9dumps=E5=8D=8F=E8=AE=AE?= =?UTF-8?q?=EF=BC=8C=E8=83=BD=E5=A4=9Fdump=E5=A4=A7=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/gpu/service.py | 2 +- debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py index e23e27791ff..35f0e17790b 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/service.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/service.py @@ -27,7 +27,7 @@ def load_and_run_api(input_file, device): gpu_grad, _ = generate_device_params(data['grad'], {}, False, device) out, grad_out = run_gpu(data['api_type'], data['api_name'], gpu_args, gpu_kwargs, gpu_grad, data['grad_index']) res = {'out': out, 'grad_out': grad_out} - return pickle.dumps(res) + return pickle.dumps(res, protocol=4) def run_gpu(api_type, api_name, gpu_args, gpu_kwargs, grad, grad_index): diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index a8a74d79c49..7ab5eb8296c 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -177,7 +177,9 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di if need_backward: npu_grad_out = run_backward_npu(grad, grad_index, npu_args, npu_out) + # run torch api in remote gpu gpu_out, gpu_grad_out = run_gpu_remote(api_full_name, store_file) + os.remove(store_file) return UtDataInfo(gpu_grad_out, npu_grad_out, npu_out, gpu_out, grad, in_fwd_data_list) -- Gitee From 1e3809c1732ca002568821a8b0f372225d68a35e Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 12 Sep 2023 17:31:02 +0800 Subject: [PATCH 14/15] =?UTF-8?q?gpu=E9=85=8D=E7=BD=AE=E7=A7=BB=E8=87=B3co?= =?UTF-8?q?nfig.yaml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/common/config.py | 5 ++++- .../api_accuracy_checker/common/utils.py | 3 --- .../accuracy_tools/api_accuracy_checker/config.yaml | 5 ++++- .../api_accuracy_checker/run_ut/run_ut.py | 12 ++++++------ 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/common/config.py b/debug/accuracy_tools/api_accuracy_checker/common/config.py index 07dd4e6bfca..066abcbfa7d 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/config.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/config.py @@ -19,7 +19,10 @@ class Config: 'dump_step': int, 'error_data_path': str, 'enable_dataloader': bool, - 'target_iter': int + 'target_iter': int, + 'url': str, + 'gpu_id': int, + 'npu_id': int } if not isinstance(value, validators[key]): raise ValueError(f"{key} must be {validators[key].__name__} type") diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 04bd565ca8e..59ce6b6f839 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -101,9 +101,6 @@ class Const: "int32_to_int64": 
["cross_entropy"] } - # gpu remote - URL: str = "http://10.175.118.91:8081" - GPU_ID: int = 1 class CompareConst: """ diff --git a/debug/accuracy_tools/api_accuracy_checker/config.yaml b/debug/accuracy_tools/api_accuracy_checker/config.yaml index 46f0ed8d41a..fc6a15f5551 100644 --- a/debug/accuracy_tools/api_accuracy_checker/config.yaml +++ b/debug/accuracy_tools/api_accuracy_checker/config.yaml @@ -6,4 +6,7 @@ real_data: False dump_step: 1000 error_data_path: './' enable_dataloader: True -target_iter: 1 \ No newline at end of file +target_iter: 1 +url: 'http://10.175.118.91:8081' +gpu_id: 0 +npu_id: 0 \ No newline at end of file diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 7ab5eb8296c..78fe8c1481e 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -72,9 +72,9 @@ def generate_device_params(input_args, input_kwargs, need_backward, device_id): else: return arg_in - npu_args = recursive_arg_to_device(input_args) - npu_kwargs = {key: recursive_arg_to_device(value) for key, value in input_kwargs.items()} - return npu_args, npu_kwargs + device_args = recursive_arg_to_device(input_args) + device_kwargs = {key: recursive_arg_to_device(value) for key, value in input_kwargs.items()} + return device_args, device_kwargs def generate_cpu_params(input_args, input_kwargs, need_backward): @@ -154,7 +154,7 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di if inplace or not need_grad: print_warn_log("%s involves in-place operations, skip backward" % api_full_name) cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward) - npu_args, npu_kwargs = generate_device_params(args, kwargs, need_backward, "npu:0") + npu_args, npu_kwargs = generate_device_params(args, kwargs, need_backward, f"npu:{msCheckerConfig.npu_id}") grad_input_index = api_setting_dict.get(api_name) grad_index = None grad, cpu_grad = None, None @@ -195,10 +195,10 @@ def store_testcase(input_data, store_path): def run_gpu_remote(api_full_name, input_file): with open(input_file, 'rb') as file1: files = {"up_input_file": file1} - data = {"api_name": api_full_name, "gpu_id": Const.GPU_ID} + data = {"api_name": api_full_name, "gpu_id": msCheckerConfig.gpu_id} print_info_log(f"start upload {input_file}") try: - result = requests.post(url=urljoin(Const.URL, "run-api"), files=files, data=data).content + result = requests.post(url=urljoin(msCheckerConfig.url, "run-api"), files=files, data=data).content res = pickle.loads(result) gpu_out, gpu_grad_out = res['out'], res['grad_out'] return gpu_out, gpu_grad_out -- Gitee From 62aeb2608a4b4a67382185386301ef274ded4fc4 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Sat, 16 Sep 2023 16:00:11 +0800 Subject: [PATCH 15/15] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9url=E7=9A=84?= =?UTF-8?q?=E6=A0=A1=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/api_accuracy_checker/common/config.py | 3 +++ debug/accuracy_tools/api_accuracy_checker/config.yaml | 2 +- debug/accuracy_tools/api_accuracy_checker/gpu/server.py | 7 ++++--- debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py | 5 ++++- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/common/config.py b/debug/accuracy_tools/api_accuracy_checker/common/config.py index 066abcbfa7d..e9882fa1f72 100644 --- 
a/debug/accuracy_tools/api_accuracy_checker/common/config.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/config.py @@ -1,5 +1,6 @@ import yaml import os +import re from api_accuracy_checker.common.utils import check_file_or_directory_path class Config: @@ -28,6 +29,8 @@ class Config: raise ValueError(f"{key} must be {validators[key].__name__} type") if key == 'target_iter' and value < 0: raise ValueError("target_iter must be greater than 0") + if key == 'url' and re.match(r'^http:\/\/(?:[0-9]{1,3}\.){3}[0-9]{1,3}(:[0-9]{1,5})$', value) is None: + raise ValueError("The format of the url is wrong") return value def __getattr__(self, item): diff --git a/debug/accuracy_tools/api_accuracy_checker/config.yaml b/debug/accuracy_tools/api_accuracy_checker/config.yaml index fc6a15f5551..beb31b28379 100644 --- a/debug/accuracy_tools/api_accuracy_checker/config.yaml +++ b/debug/accuracy_tools/api_accuracy_checker/config.yaml @@ -7,6 +7,6 @@ dump_step: 1000 error_data_path: './' enable_dataloader: True target_iter: 1 -url: 'http://10.175.118.91:8081' +url: 'http://xx.xx.xx.xx:xxxx' gpu_id: 0 npu_id: 0 \ No newline at end of file diff --git a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py index 5d21ab1c888..564a9d7776b 100644 --- a/debug/accuracy_tools/api_accuracy_checker/gpu/server.py +++ b/debug/accuracy_tools/api_accuracy_checker/gpu/server.py @@ -5,8 +5,8 @@ import tempfile import uvicorn -from fastapi import Form, FastAPI, UploadFile, File, status -from fastapi.responses import JSONResponse, PlainTextResponse +from fastapi import Form, FastAPI, UploadFile, File +from fastapi.responses import PlainTextResponse from api_accuracy_checker.gpu.service import load_and_run_api, accept_store_temp from api_accuracy_checker.common.utils import print_info_log, print_error_log @@ -15,7 +15,8 @@ app = FastAPI() @app.post("/run-api") -def run_api_by_file(api_name, up_input_file, gpu_id): +def run_api_by_file(api_name: str = Form(description="api name"), up_input_file: UploadFile = File(), + gpu_id: int = Form(default=0, description="gpu id")): with tempfile.TemporaryDirectory() as temp_dir: try: _input_file = accept_store_temp(temp_dir, [up_input_file])[0] diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index c908df5eae8..ad0131ff9f1 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -8,6 +8,8 @@ import sys from urllib.parse import urljoin import requests +from requests import RequestException + try: import torch_npu except ImportError: @@ -142,6 +144,7 @@ def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) UtAPIInfo(api_full_name + '.backward.output.bench', data_info.bench_grad_out) UtAPIInfo(api_full_name + '.backward.output.npu', data_info.npu_grad_out) + def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_dict): in_fwd_data_list = [] [api_type, api_name, _] = api_full_name.split("*") @@ -201,7 +204,7 @@ def run_gpu_remote(api_full_name, input_file): res = pickle.loads(result) gpu_out, gpu_grad_out = res['out'], res['grad_out'] return gpu_out, gpu_grad_out - except Exception as e: + except RequestException: print_error_log("Can't get remote result.") -- Gitee
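
The series above wires the NPU-side run_ut client to a GPU-side FastAPI service: the client pickles a test case to a file, uploads it to the "run-api" endpoint configured through the 'url', 'gpu_id' and 'npu_id' keys in config.yaml, and unpickles the returned forward/backward outputs. The sketch below shows that request/response round trip in isolation. It is an illustrative client only, not code from the patches: the server is assumed to already be running (gpu/server.py starts uvicorn on port 8000 by default), and SERVER_URL, the api name and the input file name are placeholder values.

# Minimal, hypothetical client sketch for the /run-api endpoint added in
# gpu/server.py. Assumes `input_file` was written the same way run_ut's
# store_testcase() writes it: a pickled dict with 'api_type', 'api_name',
# 'args', 'kwargs', 'grad' and 'grad_index'. SERVER_URL and the example
# arguments below are placeholders, not values taken from the patches.
import pickle
from urllib.parse import urljoin

import requests

SERVER_URL = "http://127.0.0.1:8000"  # placeholder; the real value comes from the 'url' key in config.yaml


def run_remote(api_full_name, input_file, gpu_id=0):
    # Upload the pickled test case as multipart/form-data, matching the
    # Form/File parameters of run_api_by_file() on the server side.
    with open(input_file, "rb") as f:
        response = requests.post(
            url=urljoin(SERVER_URL, "run-api"),
            files={"up_input_file": f},
            data={"api_name": api_full_name, "gpu_id": gpu_id},
            timeout=300,
        )
    response.raise_for_status()
    # The server replies with pickle.dumps({'out': ..., 'grad_out': ...}, protocol=4);
    # torch must be installed locally so the pickled tensors can be rebuilt.
    result = pickle.loads(response.content)
    return result["out"], result["grad_out"]


if __name__ == "__main__":
    # Hypothetical example call; replace both arguments with a real test case.
    out, grad_out = run_remote("Functional*relu*0", "relu_testcase.pt")
    print(type(out), type(grad_out))

Two design points carried by the patches are visible here: the payload is serialized with pickle protocol 4, which uses 64-bit length framing so large tensor dumps survive the round trip, and the reply is unpickled directly from the HTTP body, so client and server must be mutually trusted hosts on a private network.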