From 6afdc0ba63dbe48d4a804b7520e734b82f1bfed9 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 1 Aug 2023 12:52:40 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E9=A2=84=E6=A3=80=E5=B7=A5=E5=85=B7run=5Fu?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/run_ut/run_ut.py | 240 +++++++++++------- 1 file changed, 147 insertions(+), 93 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 79e6ed365..b502ee21e 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -1,93 +1,147 @@ -# 用户构造并运行api用例,注意前反向的区分 -import yaml -import os -import json -import torch - -FLOAT_TYPE = ['torch.float32', 'torch.float', 'torch.float64', 'torch.double', 'torch.float16', \ - 'torch.half', 'torch.bfloat16'] - -cur_path = os.path.dirname(os.path.realpath(__file__)) -yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") -with open(yaml_path, 'r') as f: - WrapFunctionalOps = yaml.safe_load(f).get('functional') - -for f in dir(torch.nn.functional): - locals().update({f: getattr(torch.nn.functional, f)}) - - -def run_ut(): - print("start") - forward_pkl = open("/home/wangchao/torch_test/dump_data_new/npu/ptdbg_dump_v1.0/rank0/dump.pkl") - backward_pkl = open("/home/wangchao/torch_test/dump_data_new/npu/ptdbg_dump_v1.0/rank0/dump_backward.pkl") - forward_content = forward_pkl.readlines() - backward_content = backward_pkl.readlines() - for api_info in forward_content: - api_json = json.loads(api_info) - for key, value in api_json.items(): - [api_type, api_name, index, mode] = key.split("*") - print(api_name) - api_feature = key.rsplit("*", 1)[0] - args, kwargs = generate_input(value.get("args"), api_json.get("kwargs")) - if api_type == "Functional": - out = eval(api_name)(*args, **kwargs) - if api_type == "Tensor": - out = getattr(torch._C._TensorBase, str(api_name))(*args, **kwargs) - for line in backward_content: - if api_feature in line: - api_back_json = json.loads(line) - for params in api_back_json.values(): - grad = nested_generate_input(params.get("args"), True, False) - out.backward(grad) - input_grad = [tensor.grad for tensor in args if isinstance(tensor, torch.Tensor)] - print("forward") - print(out) - print("backward") - print(input_grad) - - -def generate_input(input_args, input_kwargs, need_backward=True, need_convert=False): # 没有考虑dict of tensor - args = [] - kwargs = {} - - for info in input_args: - args.append(nested_generate_input(info, need_backward, need_convert)) - if kwargs: - for key, info in input_kwargs.items(): - kwargs[key] = nested_generate_input(info, need_backward, need_convert) - return args, kwargs - - -def nested_generate_input(info, need_backward, need_convert): - if isinstance(info, list): - result = [] - for i in info: - result.append(nested_generate_input(i, need_backward, need_convert)) - return result - # return list(map(nested_generate_input, info)) - # elif isinstance(input_info, tuple): - # return tuple(map(generate_input, input_info)) - else: - if info['type'] == 'torch.Tensor': - low, high = info['Min'], info['Max'] - data_dtype = info['dtype'] - if data_dtype in FLOAT_TYPE: #应该搞个float类型列表 - if need_convert and data_dtype == "torch.float16": - data_dtype = "torch.float32" - scale = high - low - rand01 = torch.rand(tuple(info['shape']), dtype=eval(data_dtype)) - inpt = rand01 * scale + low - if need_backward: - 
inpt.requires_grad_(True) - inpt.retain_grad() - elif 'int' in data_dtype or 'long' in data_dtype: # 应该搞个int类型列表, - inpt = torch.randint(int(low), int(high)+1, tuple(info['shape']), - dtype=eval(data_dtype)) # high + 1因为右边是开区间 - else: - print(f'Warning: Dtype is not supported: ', info['dtype']) - raise NotImplementedError() - else: - inpt = info['value'] # 遗留问题:需要考虑是否要转换成原本类型 - return inpt - -run_ut() \ No newline at end of file +import argparse +import os +import sys +sys.path.append("..") +import yaml +import json +import torch +from data_generate import gen_api_params, gen_args +from common.utils import print_info_log +from compare.compare import Comparator + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "../hook_module/support_wrap_ops.yaml") +with open(yaml_path, 'r') as f: + WrapFunctionalOps = yaml.safe_load(f).get('functional') + +for f in dir(torch.nn.functional): + if f != "__name__": + locals().update({f: getattr(torch.nn.functional, f)}) + + +def exec_api(api_type, api_name, args, kwargs): + if api_type == "Functional": + out = eval(api_name)(*args, **kwargs) + if api_type == "Tensor": + out = getattr(torch._C._TensorBase, str(api_name))(*args, **kwargs) + if api_type == "Torch": + out = getattr(torch._C._VariableFunctionsClass, str(api_name))(*args, **kwargs) + return out + + +def generate_npu_params(cpu_args, cpu_kwargs, need_backward): + npu_args = [] + if need_backward: + for arg_in in cpu_args: + if isinstance(arg_in, torch.Tensor) and arg_in.dtype in [torch.float, torch.float16, + torch.float64] and arg_in.requires_grad: + arg_in = arg_in.clone().detach().to("npu").requires_grad_() + elif isinstance(arg_in, torch.Tensor): + arg_in = arg_in.clone().detach().to("npu") + npu_args.append(arg_in) + else: + for arg_in in cpu_args: + if isinstance(arg_in, torch.Tensor): + arg_in = arg_in.clone().detach().to("npu") + npu_args.append(arg_in) + return npu_args, cpu_kwargs + + +def run_ut(forward_file, backward_file, out_path, save_error_data, compare_alg): + print_info_log("start UT test") + with open(forward_file, "r") as file_handler: + forward_content = json.load(file_handler) + with open(backward_file, "r") as file_handler: + backward_content = json.load(file_handler) + with open("torch_ut_setting.json", 'r') as file_handler: + api_setting_dict = json.load(file_handler) + compare = Comparator(out_path) + for key, value in forward_content.items(): + [api_type, api_name, index] = key.split("*") + api_feature = key + args, kwargs = gen_api_params(value, api_name[-1] != "_") + inplace = kwargs.get("inplace") if kwargs.get("inplace") else None + need_backward = api_feature in backward_content and api_name[-1] != "_" and inplace is not True + npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) + grad_out, npu_grad_out = None, None + + # run cpu + out = exec_api(api_type, api_name, args, kwargs) + + # run npu + npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) + + grad_input_index = api_setting_dict.get(api_name) + grad_index = None + if grad_input_index is not None: + grad_index = grad_input_index.get('grad_index') + + # backward + if need_backward: + backward_args = backward_content[api_feature] + grad = gen_args(backward_args)[0] + if grad_index is not None: + out[grad_index].backward(grad) + else: + out.backward(grad) + args_grad = [] + for arg in args: + if isinstance(arg, torch.Tensor): + args_grad.append(arg.grad) + grad_out = args_grad + + npu_grad = grad.clone().detach().npu() + if grad_index is not 
None: + npu_out[grad_index].backward(npu_grad) + else: + npu_out.backward(npu_grad) + npu_args_grad = [] + for arg in npu_args: + if isinstance(arg, torch.Tensor): + npu_args_grad.append(arg.grad) + npu_grad_out = npu_args_grad + compare.compare_output(api_feature, out, npu_out, grad_out, npu_grad_out) + + compare.print_pretest_result() + compare.write_compare_csv() + + +def _run_ut_parser(parser): + parser.add_argument("-forward", "--forward_input_file", dest="forward_input_file", default="", + help=" The api param tool forward result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-backward", "--backward_input_file", dest="backward_input_file", default="", + help=" The api param tool backward result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-o", "--out_path", dest="out_path", default="", + help=" The ut task result out path.", + required=False) + parser.add_argument('-save_error_data', dest="save_error_data", action="store_true", + help=" Save compare failed api output.", required=False) + parser.add_argument("-alg", "--compare_alg", dest="compare_alg", choices=['cosine', 'relative_error'], + default='cosine', + help=" The tensor compare algorithm, default is cosine.", + required=False) + parser.add_argument("-c", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", + default=True, required=False) + + +def _run_ut(): + parser = argparse.ArgumentParser() + _run_ut_parser(parser) + args = parser.parse_args(sys.argv[1:]) + if not args.jit_compile: + torch.npu.set_compile_mode(jit_compile=False) + forward_file = os.path.realpath(args.forward_input_file) + backward_file = os.path.realpath(args.backward_input_file) + out_path = os.path.realpath(args.out_path) if args.out_path else "./" + save_error_data = args.save_error_data + compare_alg = args.compare_alg + run_ut(forward_file, backward_file, out_path, save_error_data, compare_alg) + + +if __name__ == '__main__': + _run_ut() + print_info_log("UT task completed.") -- Gitee From 91f57b798db9482cb004dc5b6b53871294e6c7e6 Mon Sep 17 00:00:00 2001 From: h00613304 Date: Tue, 1 Aug 2023 16:14:06 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E9=A2=84=E6=A3=80=E5=B7=A5=E5=85=B7run=5Fu?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api_accuracy_checker/common/utils.py | 955 +++++++++--------- .../api_accuracy_checker/run_ut/run_ut.py | 109 +- 2 files changed, 546 insertions(+), 518 deletions(-) diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 24959a629..76cedca53 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -1,458 +1,497 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" -import collections -import os -import random -import re -import stat -import subprocess -import sys -import time -from datetime import datetime, timezone - -import numpy as np -import torch - -try: - import torch_npu -except ImportError: - IS_GPU = True -else: - IS_GPU = False - -if not IS_GPU: - from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard - -device = collections.namedtuple('device', ['type', 'index']) - - -class Const: - """ - Class for const - """ - MODEL_TYPE = ['.onnx', '.pb', '.om'] - DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*" - SEMICOLON = ";" - COLON = ":" - EQUAL = "=" - COMMA = "," - DOT = "." - DUMP_RATIO_MAX = 100 - SUMMERY_DATA_NUMS = 256 - ONE_HUNDRED_MB = 100*1024*1024 - FLOAT_EPSILON = np.finfo(float).eps - SUPPORT_DUMP_MODE = ['api', 'acl'] - ON = 'ON' - OFF = 'OFF' - BACKWARD = 'backward' - FORWARD = 'forward' - - # dump mode - ALL = "all" - LIST = "list" - RANGE = "range" - STACK = "stack" - ACL = "acl" - API_LIST = "api_list" - API_STACK = "api_stack" - DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] - - API_PATTERN = r"^[A-Za-z0-9]+[_]+([A-Za-z0-9]+[_]*[A-Za-z0-9]+)[_]+[0-9]+[_]+[A-Za-z0-9]+" - WRITE_FLAGS = os.O_WRONLY | os.O_CREAT - WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR - - -class CompareConst: - """ - Class for compare module const - """ - # compare result column name - NPU_NAME = "NPU Name" - BENCH_NAME = "Bench Name" - NPU_DTYPE = "NPU Tensor Dtype" - BENCH_DTYPE = "Bench Tensor Dtype" - NPU_SHAPE = "NPU Tensor Shape" - BENCH_SHAPE = "Bench Tensor Shape" - NPU_MAX = "NPU max" - NPU_MIN = "NPU min" - NPU_MEAN = "NPU mean" - BENCH_MAX = "Bench max" - BENCH_MIN = "Bench min" - BENCH_MEAN = "Bench mean" - COSINE = "Cosine" - MAX_ABS_ERR = "MaxAbsErr" - ACCURACY = "Accuracy Reached or Not" - STACK = "NPU_Stack_Info" - ERROR_MESSAGE = "Err_message" - - # compare result data - NAN = 'Nan' - SHAPE_UNMATCH = 'shape unmatched' - DTYPE_UNMATCH = 'dtype unmatched' - - # accuracy standards - COS_THRESHOLD = 0.99 - MAX_ABS_ERR_THRESHOLD = 0.001 - COS_MAX_THRESHOLD = 0.9 - MAX_ABS_ERR_MAX_THRESHOLD = 1 - ACCURACY_CHECK_YES = "Yes" - ACCURACY_CHECK_NO = "No" - ACCURACY_CHECK_UNMATCH = "Unmatched" - - # error message - NO_BENCH = "No bench data matched." 
- - -class VersionCheck: - """ - Class for TorchVersion - """ - V1_8 = "1.8" - V1_11 = "1.11" - - @staticmethod - def check_torch_version(version): - torch_version = torch.__version__ - if torch_version.startswith(version): - return True - else: - return False - - -class CompareException(Exception): - """ - Class for Accuracy Compare Exception - """ - NONE_ERROR = 0 - INVALID_PATH_ERROR = 1 - OPEN_FILE_ERROR = 2 - CLOSE_FILE_ERROR = 3 - READ_FILE_ERROR = 4 - WRITE_FILE_ERROR = 5 - INVALID_FILE_ERROR = 6 - PERMISSION_ERROR = 7 - INDEX_OUT_OF_BOUNDS_ERROR = 8 - NO_DUMP_FILE_ERROR = 9 - INVALID_DATA_ERROR = 10 - INVALID_PARAM_ERROR = 11 - INVALID_DUMP_RATIO = 12 - INVALID_DUMP_FILE = 13 - UNKNOWN_ERROR = 14 - INVALID_DUMP_MODE = 15 - PARSE_FILE_ERROR = 16 - INVALID_COMPARE_MODE = 17 - - def __init__(self, code, error_info: str = ""): - super(CompareException, self).__init__() - self.code = code - self.error_info = error_info - - def __str__(self): - return self.error_info - -class DumpException(CompareException): - pass - -def _print_log(level, msg): - current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) - pid = os.getgid() - print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg) - sys.stdout.flush() - - -def print_info_log(info_msg): - """ - Function Description: - print info log. - Parameter: - info_msg: the info message. - """ - _print_log("INFO", info_msg) - - -def print_error_log(error_msg): - """ - Function Description: - print error log. - Parameter: - error_msg: the error message. - """ - _print_log("ERROR", error_msg) - - -def print_warn_log(warn_msg): - """ - Function Description: - print warn log. - Parameter: - warn_msg: the warning message. - """ - _print_log("WARNING", warn_msg) - - -def check_mode_valid(mode): - if mode not in Const.DUMP_MODE: - msg = "Current mode '%s' is not supported. Please use the field in %s" % \ - (mode, Const.DUMP_MODE) - raise CompareException(CompareException.INVALID_DUMP_MODE, msg) - - -def check_file_or_directory_path(path, isdir=False): - """ - Function Description: - check whether the path is valid - Parameter: - path: the path to check - isdir: the path is dir or file - Exception Description: - when invalid data throw exception - """ - if isdir: - if not os.path.exists(path): - print_error_log('The path {} is not exist.'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - - if not os.path.isdir(path): - print_error_log('The path {} is not a directory.'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - - if not os.access(path, os.W_OK): - print_error_log( - 'The path {} does not have permission to write. Please check the path permission'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - else: - if not os.path.isfile(path): - print_error_log('{} is an invalid file or non-exist.'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - - if not os.access(path, os.R_OK): - print_error_log( - 'The path {} does not have permission to read. 
Please check the path permission'.format(path)) - raise CompareException(CompareException.INVALID_PATH_ERROR) - -def _check_pkl(pkl_file_handle, file_name): - tensor_line = pkl_file_handle.readline() - if len(tensor_line) == 0: - print_error_log("dump file {} have empty line!".format(file_name)) - raise CompareException(CompareException.INVALID_DUMP_FILE) - pkl_file_handle.seek(0, 0) - - -def check_file_mode(npu_pkl, bench_pkl, stack_mode): - npu_pkl_name = os.path.split(npu_pkl)[-1] - bench_pkl_name = os.path.split(bench_pkl)[-1] - - if not npu_pkl_name.startswith("api_stack") and not bench_pkl_name.startswith("api_stack"): - if stack_mode: - print_error_log("The current file does not contain stack information, please turn off the stack_mode") - raise CompareException(CompareException.INVALID_COMPARE_MODE) - elif npu_pkl_name.startswith("api_stack") and bench_pkl_name.startswith("api_stack"): - if not stack_mode: - print_error_log("The current file contains stack information, please turn on the stack_mode") - raise CompareException(CompareException.INVALID_COMPARE_MODE) - else: - print_error_log("The dump mode of the two files is not same, please check the dump files") - raise CompareException(CompareException.INVALID_COMPARE_MODE) - - -def check_file_size(input_file, max_size): - try: - file_size = os.path.getsize(input_file) - except OSError as os_error: - print_error_log('Failed to open "%s". %s' % (input_file, str(os_error))) - raise CompareException(CompareException.INVALID_FILE_ERROR) - if file_size > max_size: - print_error_log('The size (%d) of %s exceeds (%d) bytes, tools not support.' - % (file_size, input_file, max_size)) - raise CompareException(CompareException.INVALID_FILE_ERROR) - - -def get_dump_data_path(dump_dir): - """ - Function Description: - traverse directories and obtain the absolute path of dump data - Parameter: - dump_dir: dump data directory - Return Value: - dump data path,file is exist or file is not exist - """ - dump_data_path = None - file_is_exist = False - - check_file_or_directory_path(dump_dir, True) - for dir_path, sub_paths, files in os.walk(dump_dir): - if len(files) != 0: - dump_data_path = dir_path - file_is_exist = True - break - dump_data_path = dir_path - return dump_data_path, file_is_exist - - -def get_api_name_from_matcher(name): - api_matcher = re.compile(Const.API_PATTERN) - match = api_matcher.match(name) - return match.group(1) if match else "" - - -def modify_dump_path(dump_path, mode): - if mode == Const.ALL: - return dump_path - file_name = os.path.split(dump_path) - mode_file_name = mode + "_" + file_name[-1] - return os.path.join(file_name[0], mode_file_name) - - -def create_directory(dir_path): - """ - Function Description: - creating a directory with specified permissions - Parameter: - dir_path: directory path - Exception Description: - when invalid data throw exception - """ - if not os.path.exists(dir_path): - try: - os.makedirs(dir_path, mode=0o700) - except OSError as ex: - print_error_log( - 'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex))) - raise CompareException(CompareException.INVALID_PATH_ERROR) - - -def execute_command(cmd): - """ - Function Description: - run the following command - Parameter: - cmd: command - Exception Description: - when invalid command throw exception - """ - print_info_log('Execute command:%s' % cmd) - process = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - while process.poll() is None: - line = 
process.stdout.readline() - line = line.strip() - if line: - print(line) - if process.returncode != 0: - print_error_log('Failed to execute command:%s' % " ".join(cmd)) - raise CompareException(CompareException.INVALID_DATA_ERROR) - - -def save_numpy_data(file_path, data): - """ - save_numpy_data - """ - if not os.path.exists(os.path.dirname(file_path)): - os.makedirs(os.path.dirname(file_path)) - np.save(file_path, data) - - -def parse_arg_value(values): - """ - parse dynamic arg value of atc cmdline - """ - value_list = [] - for item in values.split(Const.SEMICOLON): - value_list.append(parse_value_by_comma(item)) - return value_list - - -def parse_value_by_comma(value): - """ - parse value by comma, like '1,2,4,8' - """ - value_list = [] - value_str_list = value.split(Const.COMMA) - for value_str in value_str_list: - value_str = value_str.strip() - if value_str.isdigit() or value_str == '-1': - value_list.append(int(value_str)) - else: - print_error_log("please check your input shape.") - raise CompareException(CompareException.INVALID_PARAM_ERROR) - return value_list - - -def get_data_len_by_shape(shape): - data_len = 1 - for item in shape: - if item == -1: - print_error_log("please check your input shape, one dim in shape is -1.") - return -1 - data_len = data_len * item - return data_len - - -def add_time_as_suffix(name): - return '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) - - -def get_time(): - return datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") - - -def format_value(value): - return '{:.6f}'.format(value) - - -def torch_device_guard(func): - if IS_GPU: - return func - # Parse args/kwargs matched torch.device objects - - @torch_npu_device_guard - def wrapper(*args, **kwargs): - return func(*args, **kwargs) - return wrapper - - -def seed_all(seed=1234, mode=False): - random.seed(seed) - os.environ['PYTHONHASHSEED'] = str(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.use_deterministic_algorithms(mode) - if IS_GPU: - torch.cuda.manual_seed_all(seed) - torch.cuda.manual_seed(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.enable = False - torch.backends.cudnn.benchmark = False - else: - torch_npu.npu.manual_seed_all(seed) - torch_npu.npu.manual_seed(seed) - - -def get_process_rank(model): - print_info_log("Rank id is not provided. Trying to get the rank id of the model.") - try: - device = next(model.parameters()).device - except StopIteration: - print_warn_log('There is no parameter in the model. Fail to get rank id.') - return 0, False - if device.type == 'cpu': - print_warn_log("Warning: the debugger is unable to get the rank id. " - "This may cause the dumpped data to be corrupted in the " - "case of distributed training. (You may ignore this if you are using only one card.) " - "Transfer the model to npu or gpu before register_hook() to avoid this warning.") - return 0, False - else: - return device.index, True +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +import collections +import json +import os +import random +import re +import stat +import subprocess +import sys +import time +from datetime import datetime, timezone + +import numpy as np +import torch + +try: + import torch_npu +except ImportError: + IS_GPU = True +else: + IS_GPU = False + +if not IS_GPU: + from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard + +device = collections.namedtuple('device', ['type', 'index']) + + +class Const: + """ + Class for const + """ + MODEL_TYPE = ['.onnx', '.pb', '.om'] + DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*" + SEMICOLON = ";" + COLON = ":" + EQUAL = "=" + COMMA = "," + DOT = "." + DUMP_RATIO_MAX = 100 + SUMMERY_DATA_NUMS = 256 + ONE_HUNDRED_MB = 100*1024*1024 + FLOAT_EPSILON = np.finfo(float).eps + SUPPORT_DUMP_MODE = ['api', 'acl'] + ON = 'ON' + OFF = 'OFF' + BACKWARD = 'backward' + FORWARD = 'forward' + + # dump mode + ALL = "all" + LIST = "list" + RANGE = "range" + STACK = "stack" + ACL = "acl" + API_LIST = "api_list" + API_STACK = "api_stack" + DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] + + API_PATTERN = r"^[A-Za-z0-9]+[_]+([A-Za-z0-9]+[_]*[A-Za-z0-9]+)[_]+[0-9]+[_]+[A-Za-z0-9]+" + WRITE_FLAGS = os.O_WRONLY | os.O_CREAT + WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR + + +class CompareConst: + """ + Class for compare module const + """ + # compare result column name + NPU_NAME = "NPU Name" + BENCH_NAME = "Bench Name" + NPU_DTYPE = "NPU Tensor Dtype" + BENCH_DTYPE = "Bench Tensor Dtype" + NPU_SHAPE = "NPU Tensor Shape" + BENCH_SHAPE = "Bench Tensor Shape" + NPU_MAX = "NPU max" + NPU_MIN = "NPU min" + NPU_MEAN = "NPU mean" + BENCH_MAX = "Bench max" + BENCH_MIN = "Bench min" + BENCH_MEAN = "Bench mean" + COSINE = "Cosine" + MAX_ABS_ERR = "MaxAbsErr" + ACCURACY = "Accuracy Reached or Not" + STACK = "NPU_Stack_Info" + ERROR_MESSAGE = "Err_message" + + # compare result data + NAN = 'Nan' + SHAPE_UNMATCH = 'shape unmatched' + DTYPE_UNMATCH = 'dtype unmatched' + + # accuracy standards + COS_THRESHOLD = 0.99 + MAX_ABS_ERR_THRESHOLD = 0.001 + COS_MAX_THRESHOLD = 0.9 + MAX_ABS_ERR_MAX_THRESHOLD = 1 + ACCURACY_CHECK_YES = "Yes" + ACCURACY_CHECK_NO = "No" + ACCURACY_CHECK_UNMATCH = "Unmatched" + + # error message + NO_BENCH = "No bench data matched." 
+ + +class VersionCheck: + """ + Class for TorchVersion + """ + V1_8 = "1.8" + V1_11 = "1.11" + + @staticmethod + def check_torch_version(version): + torch_version = torch.__version__ + if torch_version.startswith(version): + return True + else: + return False + + +class CompareException(Exception): + """ + Class for Accuracy Compare Exception + """ + NONE_ERROR = 0 + INVALID_PATH_ERROR = 1 + OPEN_FILE_ERROR = 2 + CLOSE_FILE_ERROR = 3 + READ_FILE_ERROR = 4 + WRITE_FILE_ERROR = 5 + INVALID_FILE_ERROR = 6 + PERMISSION_ERROR = 7 + INDEX_OUT_OF_BOUNDS_ERROR = 8 + NO_DUMP_FILE_ERROR = 9 + INVALID_DATA_ERROR = 10 + INVALID_PARAM_ERROR = 11 + INVALID_DUMP_RATIO = 12 + INVALID_DUMP_FILE = 13 + UNKNOWN_ERROR = 14 + INVALID_DUMP_MODE = 15 + PARSE_FILE_ERROR = 16 + INVALID_COMPARE_MODE = 17 + + def __init__(self, code, error_info: str = ""): + super(CompareException, self).__init__() + self.code = code + self.error_info = error_info + + def __str__(self): + return self.error_info + +class DumpException(CompareException): + pass + +def _print_log(level, msg): + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) + pid = os.getgid() + print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg) + sys.stdout.flush() + + +def print_info_log(info_msg): + """ + Function Description: + print info log. + Parameter: + info_msg: the info message. + """ + _print_log("INFO", info_msg) + + +def print_error_log(error_msg): + """ + Function Description: + print error log. + Parameter: + error_msg: the error message. + """ + _print_log("ERROR", error_msg) + + +def print_warn_log(warn_msg): + """ + Function Description: + print warn log. + Parameter: + warn_msg: the warning message. + """ + _print_log("WARNING", warn_msg) + + +def check_mode_valid(mode): + if mode not in Const.DUMP_MODE: + msg = "Current mode '%s' is not supported. Please use the field in %s" % \ + (mode, Const.DUMP_MODE) + raise CompareException(CompareException.INVALID_DUMP_MODE, msg) + + +def check_file_or_directory_path(path, isdir=False): + """ + Function Description: + check whether the path is valid + Parameter: + path: the path to check + isdir: the path is dir or file + Exception Description: + when invalid data throw exception + """ + if isdir: + if not os.path.exists(path): + print_error_log('The path {} is not exist.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not os.path.isdir(path): + print_error_log('The path {} is not a directory.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not os.access(path, os.W_OK): + print_error_log( + 'The path {} does not have permission to write. Please check the path permission'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + else: + if not os.path.isfile(path): + print_error_log('{} is an invalid file or non-exist.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not os.access(path, os.R_OK): + print_error_log( + 'The path {} does not have permission to read. 
Please check the path permission'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + +def _check_pkl(pkl_file_handle, file_name): + tensor_line = pkl_file_handle.readline() + if len(tensor_line) == 0: + print_error_log("dump file {} have empty line!".format(file_name)) + raise CompareException(CompareException.INVALID_DUMP_FILE) + pkl_file_handle.seek(0, 0) + + +def check_file_mode(npu_pkl, bench_pkl, stack_mode): + npu_pkl_name = os.path.split(npu_pkl)[-1] + bench_pkl_name = os.path.split(bench_pkl)[-1] + + if not npu_pkl_name.startswith("api_stack") and not bench_pkl_name.startswith("api_stack"): + if stack_mode: + print_error_log("The current file does not contain stack information, please turn off the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + elif npu_pkl_name.startswith("api_stack") and bench_pkl_name.startswith("api_stack"): + if not stack_mode: + print_error_log("The current file contains stack information, please turn on the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + else: + print_error_log("The dump mode of the two files is not same, please check the dump files") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + + +def check_file_size(input_file, max_size): + try: + file_size = os.path.getsize(input_file) + except OSError as os_error: + print_error_log('Failed to open "%s". %s' % (input_file, str(os_error))) + raise CompareException(CompareException.INVALID_FILE_ERROR) + if file_size > max_size: + print_error_log('The size (%d) of %s exceeds (%d) bytes, tools not support.' + % (file_size, input_file, max_size)) + raise CompareException(CompareException.INVALID_FILE_ERROR) + + +def get_dump_data_path(dump_dir): + """ + Function Description: + traverse directories and obtain the absolute path of dump data + Parameter: + dump_dir: dump data directory + Return Value: + dump data path,file is exist or file is not exist + """ + dump_data_path = None + file_is_exist = False + + check_file_or_directory_path(dump_dir, True) + for dir_path, sub_paths, files in os.walk(dump_dir): + if len(files) != 0: + dump_data_path = dir_path + file_is_exist = True + break + dump_data_path = dir_path + return dump_data_path, file_is_exist + + +def get_api_name_from_matcher(name): + api_matcher = re.compile(Const.API_PATTERN) + match = api_matcher.match(name) + return match.group(1) if match else "" + + +def modify_dump_path(dump_path, mode): + if mode == Const.ALL: + return dump_path + file_name = os.path.split(dump_path) + mode_file_name = mode + "_" + file_name[-1] + return os.path.join(file_name[0], mode_file_name) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + if not os.path.exists(dir_path): + try: + os.makedirs(dir_path, mode=0o700) + except OSError as ex: + print_error_log( + 'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex))) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def execute_command(cmd): + """ + Function Description: + run the following command + Parameter: + cmd: command + Exception Description: + when invalid command throw exception + """ + print_info_log('Execute command:%s' % cmd) + process = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + while process.poll() is None: + line = 
process.stdout.readline() + line = line.strip() + if line: + print(line) + if process.returncode != 0: + print_error_log('Failed to execute command:%s' % " ".join(cmd)) + raise CompareException(CompareException.INVALID_DATA_ERROR) + + +def save_numpy_data(file_path, data): + """ + save_numpy_data + """ + if not os.path.exists(os.path.dirname(file_path)): + os.makedirs(os.path.dirname(file_path)) + np.save(file_path, data) + + +def parse_arg_value(values): + """ + parse dynamic arg value of atc cmdline + """ + value_list = [] + for item in values.split(Const.SEMICOLON): + value_list.append(parse_value_by_comma(item)) + return value_list + + +def parse_value_by_comma(value): + """ + parse value by comma, like '1,2,4,8' + """ + value_list = [] + value_str_list = value.split(Const.COMMA) + for value_str in value_str_list: + value_str = value_str.strip() + if value_str.isdigit() or value_str == '-1': + value_list.append(int(value_str)) + else: + print_error_log("please check your input shape.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return value_list + + +def get_data_len_by_shape(shape): + data_len = 1 + for item in shape: + if item == -1: + print_error_log("please check your input shape, one dim in shape is -1.") + return -1 + data_len = data_len * item + return data_len + + +def add_time_as_suffix(name): + return '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + + +def get_time(): + return datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + + +def format_value(value): + return '{:.6f}'.format(value) + + +def torch_device_guard(func): + if IS_GPU: + return func + # Parse args/kwargs matched torch.device objects + + @torch_npu_device_guard + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + return wrapper + + +def seed_all(seed=1234, mode=False): + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(mode) + if IS_GPU: + torch.cuda.manual_seed_all(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.enable = False + torch.backends.cudnn.benchmark = False + else: + torch_npu.npu.manual_seed_all(seed) + torch_npu.npu.manual_seed(seed) + + +def get_process_rank(model): + print_info_log("Rank id is not provided. Trying to get the rank id of the model.") + try: + device = next(model.parameters()).device + except StopIteration: + print_warn_log('There is no parameter in the model. Fail to get rank id.') + return 0, False + if device.type == 'cpu': + print_warn_log("Warning: the debugger is unable to get the rank id. " + "This may cause the dumpped data to be corrupted in the " + "case of distributed training. (You may ignore this if you are using only one card.) 
" + "Transfer the model to npu or gpu before register_hook() to avoid this warning.") + return 0, False + else: + return device.index, True + + +def get_json_contents(file_path): + ops = get_file_content_bytes(file_path) + return json.loads(ops) + + +def get_file_content_bytes(file): + check_input_file_valid(file) + with open(file, 'rb') as file_handle: + return file_handle.read() + + +def islink(path): + path = os.path.abspath(path) + return os.path.islink(path) + + +class SoftlinkCheckException(Exception): + pass + + +MAX_JSON_FILE_SIZE = 10 * 1024 ** 2 + + +def check_input_file_valid(input_path, max_file_size=MAX_JSON_FILE_SIZE): + if islink(input_path): + raise SoftlinkCheckException("Input path doesn't support soft link.") + + input_path = os.path.realpath(input_path) + if not os.path.exists(input_path): + raise ValueError('Input file %s does not exist!' % input_path) + + if not os.access(input_path, os.R_OK): + raise PermissionError('Input file %s is not readable!' % input_path) + + if os.path.getsize(input_path) > max_file_size: + raise ValueError(f'The file is too large, exceeds {max_file_size // 1024 ** 2}MB') diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index b502ee21e..f1d63d207 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -3,10 +3,9 @@ import os import sys sys.path.append("..") import yaml -import json import torch from data_generate import gen_api_params, gen_args -from common.utils import print_info_log +from common.utils import print_info_log, get_json_contents from compare.compare import Comparator cur_path = os.path.dirname(os.path.realpath(__file__)) @@ -47,65 +46,60 @@ def generate_npu_params(cpu_args, cpu_kwargs, need_backward): return npu_args, cpu_kwargs -def run_ut(forward_file, backward_file, out_path, save_error_data, compare_alg): +def run_ut(forward_file, backward_file, out_path, save_error_data): print_info_log("start UT test") - with open(forward_file, "r") as file_handler: - forward_content = json.load(file_handler) - with open(backward_file, "r") as file_handler: - backward_content = json.load(file_handler) - with open("torch_ut_setting.json", 'r') as file_handler: - api_setting_dict = json.load(file_handler) + forward_content = get_json_contents(forward_file) + backward_content = get_json_contents(backward_file) + api_setting_dict = get_json_contents("torch_ut_setting.json") compare = Comparator(out_path) - for key, value in forward_content.items(): - [api_type, api_name, index] = key.split("*") - api_feature = key - args, kwargs = gen_api_params(value, api_name[-1] != "_") - inplace = kwargs.get("inplace") if kwargs.get("inplace") else None - need_backward = api_feature in backward_content and api_name[-1] != "_" and inplace is not True - npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) - grad_out, npu_grad_out = None, None - - # run cpu - out = exec_api(api_type, api_name, args, kwargs) - - # run npu - npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) - - grad_input_index = api_setting_dict.get(api_name) - grad_index = None - if grad_input_index is not None: - grad_index = grad_input_index.get('grad_index') - - # backward - if need_backward: - backward_args = backward_content[api_feature] - grad = gen_args(backward_args)[0] - if grad_index is not None: - out[grad_index].backward(grad) - else: - out.backward(grad) - args_grad = [] - for arg in 
args: - if isinstance(arg, torch.Tensor): - args_grad.append(arg.grad) - grad_out = args_grad - - npu_grad = grad.clone().detach().npu() - if grad_index is not None: - npu_out[grad_index].backward(npu_grad) - else: - npu_out.backward(npu_grad) - npu_args_grad = [] - for arg in npu_args: - if isinstance(arg, torch.Tensor): - npu_args_grad.append(arg.grad) - npu_grad_out = npu_args_grad - compare.compare_output(api_feature, out, npu_out, grad_out, npu_grad_out) + for api_full_name, value in forward_content.items(): + grad_out, npu_grad_out, npu_out, out = run_torch_api(api_full_name, api_setting_dict, backward_content, value) + compare.compare_output(api_full_name, out, npu_out, grad_out, npu_grad_out) compare.print_pretest_result() compare.write_compare_csv() +def run_torch_api(api_full_name, api_setting_dict, backward_content, value): + [api_type, api_name, _] = api_full_name.split("*") + args, kwargs = gen_api_params(value, api_name[-1] != "_") + inplace = kwargs.get("inplace") if kwargs.get("inplace") else None + need_backward = api_full_name in backward_content and api_name[-1] != "_" and inplace is not True + npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward) + grad_out, npu_grad_out = None, None + out = exec_api(api_type, api_name, args, kwargs) + npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) + grad_input_index = api_setting_dict.get(api_name) + grad_index = None + if grad_input_index is not None: + grad_index = grad_input_index.get('grad_index') + + if need_backward: + backward_args = backward_content[api_full_name] + grad = gen_args(backward_args)[0] + if grad_index is not None: + out[grad_index].backward(grad) + else: + out.backward(grad) + args_grad = [] + for arg in args: + if isinstance(arg, torch.Tensor): + args_grad.append(arg.grad) + grad_out = args_grad + + npu_grad = grad.clone().detach().npu() + if grad_index is not None: + npu_out[grad_index].backward(npu_grad) + else: + npu_out.backward(npu_grad) + npu_args_grad = [] + for arg in npu_args: + if isinstance(arg, torch.Tensor): + npu_args_grad.append(arg.grad) + npu_grad_out = npu_args_grad + return grad_out, npu_grad_out, npu_out, out + + def _run_ut_parser(parser): parser.add_argument("-forward", "--forward_input_file", dest="forward_input_file", default="", help=" The api param tool forward result file: generate from api param tool, " @@ -120,10 +114,6 @@ def _run_ut_parser(parser): required=False) parser.add_argument('-save_error_data', dest="save_error_data", action="store_true", help=" Save compare failed api output.", required=False) - parser.add_argument("-alg", "--compare_alg", dest="compare_alg", choices=['cosine', 'relative_error'], - default='cosine', - help=" The tensor compare algorithm, default is cosine.", - required=False) parser.add_argument("-c", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", default=True, required=False) @@ -138,8 +128,7 @@ def _run_ut(): backward_file = os.path.realpath(args.backward_input_file) out_path = os.path.realpath(args.out_path) if args.out_path else "./" save_error_data = args.save_error_data - compare_alg = args.compare_alg - run_ut(forward_file, backward_file, out_path, save_error_data, compare_alg) + run_ut(forward_file, backward_file, out_path, save_error_data) if __name__ == '__main__': -- Gitee
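
A note on the forward/backward flow that the refactored run_torch_api() above implements: each dumped API is executed twice, once on CPU as the reference and once on NPU, and when a matching entry exists in the backward dump the dumped gradient is back-propagated through the output selected by grad_index (read from torch_ut_setting.json for APIs whose output is a tuple) before the .grad of every tensor argument is collected and handed to Comparator. The sketch below illustrates that flow in isolation. It is a simplified stand-in, not the patch's code: it dispatches through getattr(torch, ...) instead of exec_api(), falls back to CPU when torch_npu is unavailable so it stays runnable anywhere, and the example API, gradient, and setting dict are made up for illustration.

    import torch

    try:
        import torch_npu  # noqa: F401 -- a real run needs an NPU; the sketch falls back to CPU
        DEVICE = "npu"
    except ImportError:
        DEVICE = "cpu"

    # Hypothetical per-API setting, mirroring the shape of torch_ut_setting.json:
    # for APIs that return a tuple, grad_index selects the element to backward through.
    API_SETTING = {"max": {"grad_index": 0}}  # torch.max(x, dim) -> (values, indices)


    def run_one_api(api_name, args, grad, device):
        # Independent leaf copies on the target device, as generate_npu_params() does.
        dev_args = [a.clone().detach().to(device).requires_grad_()
                    if isinstance(a, torch.Tensor) and a.is_floating_point() else a
                    for a in args]
        out = getattr(torch, api_name)(*dev_args)

        grad_index = (API_SETTING.get(api_name) or {}).get("grad_index")
        target = out[grad_index] if grad_index is not None else out
        target.backward(grad.to(device))

        grads = [a.grad for a in dev_args if isinstance(a, torch.Tensor)]
        return target.detach(), grads


    x = torch.randn(4, 3)
    g = torch.ones(4)  # stand-in for the dumped backward gradient
    cpu_out, cpu_grads = run_one_api("max", [x, 1], g, "cpu")
    npu_out, npu_grads = run_one_api("max", [x, 1], g, DEVICE)
    print(torch.allclose(cpu_out, npu_out.cpu()),
          torch.allclose(cpu_grads[0], npu_grads[0].cpu()))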
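
The second patch also has run_ut.py load its inputs through the new get_json_contents() helper instead of bare json.load(): check_input_file_valid() rejects the file if it is a symbolic link, does not exist, is not readable, or exceeds MAX_JSON_FILE_SIZE (10 MB) before the bytes are parsed. A usage sketch under those semantics -- the file path and dump keys are illustrative, not taken from the patch:

    from common.utils import get_json_contents, SoftlinkCheckException

    try:
        forward_content = get_json_contents("dump/forward_api_info.json")  # hypothetical path
    except (SoftlinkCheckException, ValueError, PermissionError) as err:
        raise SystemExit(f"forward dump rejected: {err}")

    # Keys follow the "<api_type>*<api_name>*<index>" pattern that run_torch_api() splits on,
    # e.g. "Functional*conv2d*0".
    for api_full_name in list(forward_content)[:3]:
        print(api_full_name)

With valid dumps in place, the tool is driven from the command line roughly as "python run_ut.py -forward forward_api_info.json -backward backward_api_info.json -o ./ut_results", matching the -forward/-backward/-o options defined in _run_ut_parser().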